In [None]:
# Importing libraries:

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
%matplotlib inline
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# Pre-processing:

In [None]:
# Load the labelled training set and the unlabelled test set, then preview the first rows.
train = pd.read_csv(filepath_or_buffer='../input/nlp-getting-started/train.csv')
test = pd.read_csv(filepath_or_buffer='../input/nlp-getting-started/test.csv')
train.head(n=5)

#### Removing Keyword and Location variables from both train and test:

In [None]:
# Drop the two columns from both frames; only 'text' (and 'target'/'id') are used below.
unused_columns = ['keyword', 'location']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)
train.head(n=5)

In [None]:
# Report (rows, columns) for the Train and Test sets.
for caption, frame in (("Shape of Train set:", train), ("Shape of Test set:", test)):
    print(caption, frame.shape)

In [None]:
# Drop duplicated tweets from the Train set only, keeping the last occurrence of each text.
# The Test set also contains a few duplicates, but they can't be removed because every
# test row has to be present in the submission file.

train = train.drop_duplicates(subset='text', keep='last')
print("Shape of Train set after removing duplicates:", train.shape)

In [None]:
# NOTE(review): these two expressions select the ASCII-only rows of train/test, but the
# result is neither assigned nor displayed (they are not the last expression of the cell),
# so they have no effect on the data. Non-ASCII characters are stripped further down by
# clean_emoji instead. Confirm whether row filtering was actually intended.
train[train['text'].map(lambda x: x.isascii())]
test[test['text'].map(lambda x: x.isascii())]

# Cleaning Tweets
def clean_tweets(text):
    """Strip Twitter markup and selected punctuation from a single tweet.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The tweet with @mentions, URLs, the ``RT`` marker and the symbols listed
        below removed (or replaced by a space), and runs of spaces collapsed to
        one. Case is not changed and a leading/trailing space may remain; the
        caller lower-cases and strips afterwards.
    """
    text = re.sub(r'@[A-Za-z0-9_]+','',text)    # Removing @mentions
    text = re.sub(r'#','',text)                 # Removing #tag symbol
    text = re.sub(r'RT[\s]+',' ',text)          # Removing RT
    text = re.sub(r'\n','',text)                # Removing newlines
    text = re.sub(r',','',text)                 # Removing commas
    # Removing full stops / ellipses. The previous pattern '.[.]+' also consumed
    # the character in front of the dots ("wait..." -> "wai").
    text = re.sub(r'\.+','',text)
    text = re.sub(r'\w+:\/\/\S+','',text)       # Removing anything of the form scheme://...
    text = re.sub(r'https?:\/\/\S+','',text)    # Removing hyperlinks
    text = re.sub(r'/',' ',text)
    text = re.sub(r'-',' ',text)
    text = re.sub(r'_',' ',text)
    text = re.sub(r'!','',text)
    text = re.sub(r':',' ',text)
    # '$' and '^' are regex anchors; unescaped they match an empty position and
    # remove nothing, so they must be escaped to delete the literal characters.
    text = re.sub(r'\$','',text)
    text = re.sub(r'%','',text)
    text = re.sub(r'\^','',text)
    text = re.sub(r'&','',text)
    text = re.sub(r'=',' ',text)
    text = re.sub(r' +',' ',text)               # Removing extra whitespaces

    return text

# Removing Emojis
def clean_emoji(inputString):
    """Return ``inputString`` with every non-ASCII character (emojis included) dropped."""
    ascii_only_bytes = inputString.encode('ascii', errors='ignore')
    return ascii_only_bytes.decode('ascii')

# The same cleaning pipeline is applied to Train and Test so both share one text format.
train['text'] = train['text'].apply(clean_tweets)    # Applying function to clean tweets
train['text'] = train['text'].apply(clean_emoji)     # Applying function to remove emojis
train['text'] = train.text.str.lower()               # Making all texts to lower case
train['text'] = train['text'].str.strip()            # Removing leading and trailing whitespaces

test['text'] = test['text'].apply(clean_tweets)      # Applying function to clean tweets
test['text'] = test['text'].apply(clean_emoji)       # Applying function to remove emojis
test['text'] = test.text.str.lower()                 # Making all texts to lower case
test['text'] = test['text'].str.strip()              # Removing leading and trailing whitespaces
#pd.set_option('display.max_rows', None)
# Show full tweet text when frames are displayed. None is the supported "no limit"
# value; -1 was deprecated in pandas 1.0 and is rejected by later releases.
pd.set_option('display.max_colwidth', None)

## Labels are as follows:
'target' -> This denotes whether a tweet is about a real disaster (1) or not (0)

In [None]:
# Class balance: number of tweets per target value.
train.target.value_counts()

# Setups:

Each of our classification models (SVM, Naive Bayes, Logistic Regression, and Random Forest) was
tested on the following setups:

1. **Setup 1: Removing Punctuation:** All the models are trained and tested after removing punctuations from the corpus.
2. **Setup 2: Removing Stop-words:** All the models are trained and tested after removing stop-words from the corpus.
3. **Setup 3: Removing Numbers:** All the models are trained and tested after removing numbers from the corpus.
4. **Setup 4: Removing Repeating Characters:** All the models are trained and tested after removing repeating characters.
5. **Setup 5: Stemming and Lemmatization:** All the models are trained and tested after applying stemming and lemmatization.
6. **Setup 6: Setup 1–5:** All the models are trained and tested after removing punctuation, stop-words, numbers and repeating characters, and applying stemming and lemmatization.
7. **Setup 7: Keeping all above features:** All the models are trained and tested without eliminating any of the above special features.

# Models:
### These models with hyperparameters will be used by all setups, to find the best setup and best model:

In [None]:
# Dictionary of the four candidate models, each with the hyperparameter grid
# that GridSearchCV explores in every setup below.
# Structure: {display name: {'model': estimator instance, 'params': param_grid}}

model_params = {

    'SVC' :{
        'model' : SVC(),
        'params' : {
            'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01], 'kernel': ['rbf','linear','poly','sigmoid']
        }
    },

    'MultinomialNB' :{
        'model' : MultinomialNB(),
        'params' : {
            # six evenly spaced smoothing values: 0.5, 0.7, ..., 1.5
            'alpha' : np.linspace(0.5, 1.5, 6), 'fit_prior' : [True, False]
        }
    },

    'logistics_regression' :{
        # `multi_class='auto'` is the default and the keyword is deprecated in
        # recent scikit-learn releases, so it is no longer passed explicitly.
        'model' : LogisticRegression(solver = 'lbfgs'),
        'params' : {
            'C' : [0.1, 1, 20, 40, 60, 80, 100], 'solver' : ['lbfgs', 'liblinear']
        }
    },

    'random_forest' :{
        'model' : RandomForestClassifier(),
        'params' : {
            'n_estimators' : [80,85,90,95,100],
            'max_depth':[20,30,None], 'criterion':['gini','entropy']
        }
    }
}

# Setup 1: Models after removing Punctuations:

In [None]:
# Work on an independent copy so the cleaned `train` frame stays untouched for the other setups.
df = train.copy(deep=True)

### Removing Punctuations:

In [None]:
import string
# Display the set of characters contained in string.punctuation.
string.punctuation

In [None]:
punctuations_list = string.punctuation
# Deletion table built once; previously it was rebuilt for every tweet.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuations_list)

def cleaning_punctuations(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        The input without punctuation characters; all other characters are kept.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every tweet.
df['text'] = df['text'].apply(cleaning_punctuations)

### Splitting data into Train and Test :

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X, y = df['text'], df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

### Transforming dataset using TF-IDF Vectorizer:

In [None]:
# Extracting features using TF-IDF (1,2) - unigrams and bigrams
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)

# Learn the vocabulary/IDF weights from the training split only and transform it in one pass.
X_train = vectoriser.fit_transform(X_train)
# `get_feature_names()` was removed in scikit-learn 1.2; the size of `vocabulary_`
# is the same feature count and is available in every version.
print('No. of feature_words: ', len(vectoriser.vocabulary_))

# Transforming the held-out data with the vocabulary learned above
X_test  = vectoriser.transform(X_test)

### Results:

In [None]:
%%time

# implemented GridSearchCV for four models using a loop and a previously created dictionary
# in the created variable 'scores', results are stored for each model such as: model, best_score and best_params.


scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, n_jobs=-1, verbose=1) # Using Cross Validation of 5 and n_jobs=-1 for fast training by using all the processors
    print(mp['model'])
    print('\nTraining the model...')
    best_model = clf.fit(X_train, y_train)                      # Training the model
    clf_pred = best_model.predict(X_test)                       # Predicting the results
    print(confusion_matrix(y_test,clf_pred))                    # Printing Confusion Matrix
    print(metrics.classification_report(y_test, clf_pred))      # Printing Classification Report
    scores.append({                                             # Appending results to 'scores' list
        'model' : model_name,
        'best_score' : best_model.score(X_test, y_test),
        'best_params' : clf.best_params_
    })
    print('\nScore is appended.\n')
    
# Creating data frame with model, best scores and best params:
res = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
res

# Setup 2: Models after removing Stop-words:

In [None]:
# Work on an independent copy so the cleaned `train` frame stays untouched for the other setups.
df = train.copy(deep=True)

### Removing Stop-words:

In [None]:
sw = stopwords.words('english')
# Membership tests against a set are O(1); the plain list would be scanned
# for every word of every tweet.
sw_lookup = set(sw)
# Keep only the whitespace-separated words that are not English stop-words.
df['text'] = df['text'].apply(lambda x: ' '.join(word for word in x.split() if word not in sw_lookup))

### Splitting data into Train and Test :

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X, y = df['text'], df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

### Transforming dataset using TF-IDF Vectorizer:

In [None]:
# Extracting features using TF-IDF (1,2) - unigrams and bigrams
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)

# Learn the vocabulary/IDF weights from the training split only and transform it in one pass.
X_train = vectoriser.fit_transform(X_train)
# `get_feature_names()` was removed in scikit-learn 1.2; the size of `vocabulary_`
# is the same feature count and is available in every version.
print('No. of feature_words: ', len(vectoriser.vocabulary_))

# Transforming the held-out data with the vocabulary learned above
X_test  = vectoriser.transform(X_test)

### Results:

In [None]:
%%time

# implemented GridSearchCV for four models using a loop and a previously created dictionary
# in the created variable 'scores', results are stored for each model such as: model, best_score and best_params.


scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, n_jobs=-1, verbose=1) # Using Cross Validation of 5 and n_jobs=-1 for fast training by using all the processors
    print(mp['model'])
    print('\nTraining the model...')
    best_model = clf.fit(X_train, y_train)                      # Training the model
    clf_pred = best_model.predict(X_test)                       # Predicting the results
    print(confusion_matrix(y_test,clf_pred))                    # Printing Confusion Matrix
    print(metrics.classification_report(y_test, clf_pred))      # Printing Classification Report
    scores.append({                                             # Appending results to 'scores' list
        'model' : model_name,
        'best_score' : best_model.score(X_test, y_test),
        'best_params' : clf.best_params_
    })
    print('\nScore is appended.\n')
    
# Creating data frame with model, best scores and best params:
res = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
res

# Setup 3: Models after removing numbers:

In [None]:
# Work on an independent copy so the cleaned `train` frame stays untouched for the other setups.
df = train.copy(deep=True)

### Removing numbers:

In [None]:
def cleaning_numbers(text):
    """Return ``text`` with every run of the digits 0-9 deleted."""
    digit_runs = re.compile('[0-9]+')
    return digit_runs.sub('', text)

# Strip digits from every tweet.
df['text'] = df['text'].apply(cleaning_numbers)

### Splitting data into Train and Test :

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X, y = df['text'], df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

### Transforming dataset using TF-IDF Vectorizer:

In [None]:
# Extracting features using TF-IDF (1,2) - unigrams and bigrams
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)

# Learn the vocabulary/IDF weights from the training split only and transform it in one pass.
X_train = vectoriser.fit_transform(X_train)
# `get_feature_names()` was removed in scikit-learn 1.2; the size of `vocabulary_`
# is the same feature count and is available in every version.
print('No. of feature_words: ', len(vectoriser.vocabulary_))

# Transforming the held-out data with the vocabulary learned above
X_test  = vectoriser.transform(X_test)

### Results:

In [None]:
%%time

# implemented GridSearchCV for four models using a loop and a previously created dictionary
# in the created variable 'scores', results are stored for each model such as: model, best_score and best_params.


scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, n_jobs=-1, verbose=1) # Using Cross Validation of 5 and n_jobs=-1 for fast training by using all the processors
    print(mp['model'])
    print('\nTraining the model...')
    best_model = clf.fit(X_train, y_train)                      # Training the model
    clf_pred = best_model.predict(X_test)                       # Predicting the results
    print(confusion_matrix(y_test,clf_pred))                    # Printing Confusion Matrix
    print(metrics.classification_report(y_test, clf_pred))      # Printing Classification Report
    scores.append({                                             # Appending results to 'scores' list
        'model' : model_name,
        'best_score' : best_model.score(X_test, y_test),
        'best_params' : clf.best_params_
    })
    print('\nScore is appended.\n')
    
# Creating data frame with model, best scores and best params:
res = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
res

# Setup 4: Models after removing repeating characters:

In [None]:
# Work on an independent copy so the cleaned `train` frame stays untouched for the other setups.
df = train.copy(deep=True)

### Removing repeating characters:

In [None]:
# Tokenise each tweet: the 'text' column now holds a list of tokens per row.
# (No separate token generator is needed; it was never consumed.)
df['text'] = df['text'].apply(nltk.word_tokenize)

# Matches one character followed by any number of repetitions of that same character.
pattern = re.compile(r'(.)\1*')

def reduce_sequence_word(word):
    """Shorten every run of three or more identical characters in ``word`` to two."""
    pieces = []
    for run in pattern.finditer(word):
        # Slicing to two characters leaves runs of length 1 or 2 unchanged.
        pieces.append(run.group()[:2])
    return ''.join(pieces)

def reduce_sequence_tweet(tweet):
    """Apply ``reduce_sequence_word`` to each token of ``tweet`` and return the new list."""
    return list(map(reduce_sequence_word, tweet))

# Shorten repeated characters in every token of every tweet.
df['text'] = df['text'].apply(reduce_sequence_tweet)

### Splitting data into Train and Test :

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
# astype(str) converts each row of 'text' (a token list at this point) to its string form
# so the vectoriser receives strings; the labels are cast to str as well.
X, y = df['text'].astype(str), df['target'].astype(str)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

### Transforming dataset using TF-IDF Vectorizer:

In [None]:
# Extracting features using TF-IDF (1,2) - unigrams and bigrams
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)

# Learn the vocabulary/IDF weights from the training split only and transform it in one pass.
X_train = vectoriser.fit_transform(X_train)
# `get_feature_names()` was removed in scikit-learn 1.2; the size of `vocabulary_`
# is the same feature count and is available in every version.
print('No. of feature_words: ', len(vectoriser.vocabulary_))

# Transforming the held-out data with the vocabulary learned above
X_test  = vectoriser.transform(X_test)

### Results:

In [None]:
%%time

# implemented GridSearchCV for four models using a loop and a previously created dictionary
# in the created variable 'scores', results are stored for each model such as: model, best_score and best_params.


scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, n_jobs=-1, verbose=1) # Using Cross Validation of 5 and n_jobs=-1 for fast training by using all the processors
    print(mp['model'])
    print('\nTraining the model...')
    best_model = clf.fit(X_train, y_train)                      # Training the model
    clf_pred = best_model.predict(X_test)                       # Predicting the results
    print(confusion_matrix(y_test,clf_pred))                    # Printing Confusion Matrix
    print(metrics.classification_report(y_test, clf_pred))      # Printing Classification Report
    scores.append({                                             # Appending results to 'scores' list
        'model' : model_name,
        'best_score' : best_model.score(X_test, y_test),
        'best_params' : clf.best_params_
    })
    print('\nScore is appended.\n')
    
# Creating data frame with model, best scores and best params:
res = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
res

# Setup 5: Applying Stemming and Lemmatization:

In [None]:
# Work on an independent copy so the cleaned `train` frame stays untouched for the other setups.
df = train.copy(deep=True)

### Applying Stemming: 

In [None]:
# Tokenizing tweets: the 'text' column now holds a list of tokens per row.
# (No separate token generator is needed; it was never consumed.)
df['text'] = df['text'].apply(nltk.word_tokenize)

# Stem every token with the English Snowball stemmer.
# NOTE(review): only stemming is applied here; no lemmatizer is used in this cell
# even though the setup is titled "Stemming and Lemmatization".
stemm = SnowballStemmer('english')
df['text'] = df['text'].apply(lambda x: [stemm.stem(y) for y in x])

### Splitting data into Train and Test :

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
# astype(str) converts each row of 'text' (a token list at this point) to its string form
# so the vectoriser receives strings; the labels are cast to str as well.
X, y = df['text'].astype(str), df['target'].astype(str)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

### Transforming dataset using TF-IDF Vectorizer:

In [None]:
# Extracting features using TF-IDF (1,2) - unigrams and bigrams
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)

# Learn the vocabulary/IDF weights from the training split only and transform it in one pass.
X_train = vectoriser.fit_transform(X_train)
# `get_feature_names()` was removed in scikit-learn 1.2; the size of `vocabulary_`
# is the same feature count and is available in every version.
print('No. of feature_words: ', len(vectoriser.vocabulary_))

# Transforming the held-out data with the vocabulary learned above
X_test  = vectoriser.transform(X_test)

### Results:

In [None]:
%%time

# implemented GridSearchCV for four models using a loop and a previously created dictionary
# in the created variable 'scores', results are stored for each model such as: model, best_score and best_params.


scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, n_jobs=-1, verbose=1) # Using Cross Validation of 5 and n_jobs=-1 for fast training by using all the processors
    print(mp['model'])
    print('\nTraining the model...')
    best_model = clf.fit(X_train, y_train)                      # Training the model
    clf_pred = best_model.predict(X_test)                       # Predicting the results
    print(confusion_matrix(y_test,clf_pred))                    # Printing Confusion Matrix
    print(metrics.classification_report(y_test, clf_pred))      # Printing Classification Report
    scores.append({                                             # Appending results to 'scores' list
        'model' : model_name,
        'best_score' : best_model.score(X_test, y_test),
        'best_params' : clf.best_params_
    })
    print('\nScore is appended.\n')
    
# Creating data frame with model, best scores and best params:
res = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
res

# Setup 6: Models after removing all the features:

In [None]:
# Work on an independent copy so the cleaned `train` frame stays untouched for the other setups.
df = train.copy(deep=True)

### Removing Punctuation:

In [None]:
import string

punctuations_list = string.punctuation
# Deletion table built once; previously it was rebuilt for every tweet.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuations_list)

def cleaning_punctuations(text):
    """Return ``text`` with every character of ``string.punctuation`` removed."""
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every tweet.
df['text'] = df['text'].apply(cleaning_punctuations)

### Removing Stop-words: 

In [None]:
sw = stopwords.words('english')
# Membership tests against a set are O(1); the plain list would be scanned
# for every word of every tweet.
sw_lookup = set(sw)
# Keep only the whitespace-separated words that are not English stop-words.
df['text'] = df['text'].apply(lambda x: ' '.join(word for word in x.split() if word not in sw_lookup))

### Removing Numbers:

In [None]:
def cleaning_numbers(text):
    """Return ``text`` with every run of the digits 0-9 deleted."""
    digit_runs = re.compile('[0-9]+')
    return digit_runs.sub('', text)

# Strip digits from every tweet.
df['text'] = df['text'].apply(cleaning_numbers)

### Removing repeating characters:

In [None]:
# Tokenise each tweet: the 'text' column now holds a list of tokens per row.
# (No separate token generator is needed; it was never consumed.)
df['text'] = df['text'].apply(nltk.word_tokenize)

# Matches one character followed by any number of repetitions of that same character.
pattern = re.compile(r'(.)\1*')

def reduce_sequence_word(word):
    """Shorten every run of three or more identical characters in ``word`` to two."""
    return ''.join([match.group()[:2] if len(match.group()) > 2 else match.group() for match in pattern.finditer(word)])

def reduce_sequence_tweet(tweet):
    """Apply ``reduce_sequence_word`` to each token of ``tweet`` and return the new list."""
    return [reduce_sequence_word(word) for word in tweet]

# Shorten repeated characters in every token of every tweet.
df.text = df.text.apply(reduce_sequence_tweet)

### Applying Stemming and Lemmatization:

In [None]:
# Stem every token with the English Snowball stemmer.
# NOTE(review): only stemming is performed in this cell; no lemmatizer is applied.
stemm = SnowballStemmer('english')
df['text'] = df['text'].apply(lambda token_list: [stemm.stem(token) for token in token_list])

### Splitting data into Train and Test :

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
# astype(str) converts each row of 'text' (a token list at this point) to its string form
# so the vectoriser receives strings; the labels are cast to str as well.
X, y = df['text'].astype(str), df['target'].astype(str)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

### Transforming dataset using TF-IDF Vectorizer:

In [None]:
# Extracting features using TF-IDF (1,2) - unigrams and bigrams
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)

# Learn the vocabulary/IDF weights from the training split only and transform it in one pass.
X_train = vectoriser.fit_transform(X_train)
# `get_feature_names()` was removed in scikit-learn 1.2; the size of `vocabulary_`
# is the same feature count and is available in every version.
print('No. of feature_words: ', len(vectoriser.vocabulary_))

# Transforming the held-out data with the vocabulary learned above
X_test  = vectoriser.transform(X_test)

### Results:

In [None]:
%%time

# implemented GridSearchCV for four models using a loop and a previously created dictionary
# in the created variable 'scores', results are stored for each model such as: model, best_score and best_params.


scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, n_jobs=-1, verbose=1) # Using Cross Validation of 5 and n_jobs=-1 for fast training by using all the processors
    print(mp['model'])
    print('\nTraining the model...')
    best_model = clf.fit(X_train, y_train)                      # Training the model
    clf_pred = best_model.predict(X_test)                       # Predicting the results
    print(confusion_matrix(y_test,clf_pred))                    # Printing Confusion Matrix
    print(metrics.classification_report(y_test, clf_pred))      # Printing Classification Report
    scores.append({                                             # Appending results to 'scores' list
        'model' : model_name,
        'best_score' : best_model.score(X_test, y_test),
        'best_params' : clf.best_params_
    })
    print('\nScore is appended.\n')
    
# Creating data frame with model, best scores and best params:
res = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
res

# Setup 7: Models without removing any of the above features:

In [None]:
# Work on an independent copy so the cleaned `train` frame stays untouched for the other setups.
df = train.copy(deep=True)

### Splitting data into Train and Test :

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X, y = df['text'], df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

### Transforming dataset using TF-IDF Vectorizer:

In [None]:
# Extracting features using TF-IDF (1,2) - unigrams and bigrams
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)

# Learn the vocabulary/IDF weights from the training split only and transform it in one pass.
X_train = vectoriser.fit_transform(X_train)
# `get_feature_names()` was removed in scikit-learn 1.2; the size of `vocabulary_`
# is the same feature count and is available in every version.
print('No. of feature_words: ', len(vectoriser.vocabulary_))

# Transforming the held-out data with the vocabulary learned above
X_test  = vectoriser.transform(X_test)

### Results:

In [None]:
%%time

# implemented GridSearchCV for four models using a loop and a previously created dictionary
# in the created variable 'scores', results are stored for each model such as: model, best_score and best_params.


scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, n_jobs=-1, verbose=1) # Using Cross Validation of 5 and n_jobs=-1 for fast training by using all the processors
    print(mp['model'])
    print('\nTraining the model...')
    best_model = clf.fit(X_train, y_train)                      # Training the model
    clf_pred = best_model.predict(X_test)                       # Predicting the results
    print(confusion_matrix(y_test,clf_pred))                    # Printing Confusion Matrix
    print(metrics.classification_report(y_test, clf_pred))      # Printing Classification Report
    scores.append({                                             # Appending results to 'scores' list
        'model' : model_name,
        'best_score' : best_model.score(X_test, y_test),
        'best_params' : clf.best_params_
    })
    print('\nScore is appended.\n')
    
# Creating data frame with model, best scores and best params:
res = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
res

# Creating Submission file:
It can be observed that **Setups 1 and 7** perform best for the SVM model. **Setup 1** will be used. Let's train this model on 100% of the training data; the resulting model will be used to predict the test file.

In [None]:
# Work on an independent copy so the cleaned `train` frame itself is left untouched.
df = train.copy(deep=True)

In [None]:
import string

punctuations_list = string.punctuation
# Deletion table built once; previously it was rebuilt for every tweet.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuations_list)

def cleaning_punctuations(text):
    """Return ``text`` with every character of ``string.punctuation`` removed."""
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every training tweet.
df['text'] = df['text'].apply(cleaning_punctuations)

### Preparing the full training set and the test set (no split):

In [None]:
# Not splitting: X_train and y_train are built from 100% of the training data,
# because the setup comparison above showed the SVC model with
# 'TF-IDF Vectorizer (1,2) - unigrams and bigrams' performing best for this dataset.


X_train = df['text']
y_train = df['target']
# The training text had its punctuation removed above, so the test text must go
# through the same step; otherwise the two sets are tokenised differently
# (e.g. "don't" becomes "dont" in train but stays "don't" in test).
X_test = test['text'].apply(cleaning_punctuations)

### Transforming dataset using TF-IDF Vectorizer:

In [None]:
# Extracting features using TF-IDF (1,2) - unigrams and bigrams
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)

# Learn the vocabulary/IDF weights from the training data only and transform it in one pass.
X_train = vectoriser.fit_transform(X_train)
# `get_feature_names()` was removed in scikit-learn 1.2; the size of `vocabulary_`
# is the same feature count and is available in every version.
print('No. of feature_words: ', len(vectoriser.vocabulary_))

# Transforming the test data with the vocabulary learned above
X_test  = vectoriser.transform(X_test)

### SVC model:

In [None]:
svc = SVC()
hyperParam = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01],
    'kernel': ['rbf','linear','poly','sigmoid'],
}

# 5-fold cross-validated grid search; n_jobs=-1 uses all processors for faster training.
gsv = GridSearchCV(estimator=svc, param_grid=hyperParam, cv=5, verbose=1, n_jobs=-1)
best_model = gsv.fit(X_train, y_train)    # fit() returns the fitted search object
svc_pred = best_model.predict(X_test)     # predictions for the competition test set

print("Best HyperParameter: ", gsv.best_params_)

### Submission file:

In [None]:
# Quick sanity checks on the predictions: values, container type and count.
print(svc_pred)
print(type(svc_pred))

my_array = svc_pred
print(len(my_array))

# Build the two-column submission (id, target) and write it without the index.
# The ids are attached by index alignment with `test`.
submission = (
    pd.DataFrame(my_array, columns=['target'])
    .assign(id=test['id'])
    [['id', 'target']]
)
submission.to_csv('submission.csv', index=False)
submission