In [1]:
# Importing libraries:

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
%matplotlib inline
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Pre-processing:

In [2]:
train = pd.read_csv('../input/nlp-getting-started/train.csv')
test = pd.read_csv('../input/nlp-getting-started/test.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


#### Removing Keyword and Location variables from both train and test:

In [3]:
train = train.drop(['keyword','location'], axis = 1)
test = test.drop(['keyword','location'], axis = 1)
train.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Checking Shape of Train and Test sets:
print("Shape of Train set:", train.shape)
print("Shape of Test set:", test.shape)

Shape of Train set: (7613, 3)
Shape of Test set: (3263, 2)


In [5]:
# Removing duplicates of Train set. There are few duplicates in Test set as well,
# however, duplicates of Test set can'b be removed because the final test with target has to be uploaded as a submission file

train = train.drop_duplicates(subset=['text'], keep='last')
print("Shape of Train set after removing duplicates:", train.shape)

Shape of Train set after removing duplicates: (7503, 3)


In [6]:
train[train['text'].map(lambda x: x.isascii())]
test[test['text'].map(lambda x: x.isascii())]

# Cleaning Tweets
def clean_tweets(text):
    text = re.sub(r'@[A-Za-z0-9_]+','',text)    # Removing @mentions
    text = re.sub(r'#','',text)                 # Removing #tag symbol
    text = re.sub(r'RT[\s]+',' ',text)          # Remvoing RT
    text = re.sub(r'\n','',text) 
    text = re.sub(r',','',text) 
    text = re.sub(r'.[.]+','',text) 
    text = re.sub(r'\w+:\/\/\S+','',text) 
    text = re.sub(r'https?:\/\/\S+','',text)    # Removing hyperlinks
    text = re.sub(r'/',' ',text)
    text = re.sub(r'-',' ',text)
    text = re.sub(r'_',' ',text)
    text = re.sub(r'!','',text)
    text = re.sub(r':',' ',text)
    text = re.sub(r'$','',text)
    text = re.sub(r'%','',text)
    text = re.sub(r'^','',text)
    text = re.sub(r'&','',text)
    text = re.sub(r'=',' ',text)
    text = re.sub(r' +',' ',text)               # Removing extra whitespaces

    return text

# Removing Emojis
def clean_emoji(inputString):
    return inputString.encode('ascii', 'ignore').decode('ascii')

train['text'] = train['text'].apply(clean_tweets)    # Applying function to clean tweets
train['text'] = train['text'].apply(clean_emoji)     # Applying function to remove emojis
train['text'] = train.text.str.lower()               # Making all texts to lower case
train['text'] = train['text'].str.strip()            # Removing leading and trailing whitespaces

test['text'] = test['text'].apply(clean_tweets)      # Applying function to clean tweets
test['text'] = test['text'].apply(clean_emoji)       # Applying function to remove emojis
test['text'] = test.text.str.lower()                 # Making all texts to lower case
test['text'] = test['text'].str.strip()              # Removing leading and trailing whitespaces
#pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', -1)

## Labels are as follows:
'target' -> This denotes whether a tweet is about a real disaster (1) or not (0)

In [7]:
train['target'].value_counts()

0    4307
1    3196
Name: target, dtype: int64

# Setups:

Each of our classification models (SVM, Naive Bayes, Logistic Regression, and Random Forest) were
tested on the following setups:

1. **Setup 1: Removing Punctuation:** All the models are trained and tested after removing punctuations from the corpus.
2. **Setup 2: Removing Stop-words:** All the models are trained and tested after removing stop-words from the corpus.
3. **Setup 3: Removing Numbers:** All the models are trained and tested after removing numbers from the corpus.
4. **Setup 4: Removing Repeating Characters:** All the models are trained and tested after removing repeating characters.
5. **Setup 5: Stemming and Lemmatization:** All the models are trained and tested after applying stemming and lemmatization.
6. **Setup 6: Setup 1–5:** All the models are trained and tested after removing punctuation, stop-words, numbers, repeating words, stemming and lemmatization.
7. **Setup 7: Keeping all above features:** All the models are trained and tested without eliminating any of the above special features.

# Models:
### These models with hyperparameters will be used by all setups, to find the best setup and best model:

In [8]:
# making a dictionary with four models with some parameters:

model_params = {
    
    'SVC' :{
        'model' : SVC(),
        'params' : {
            'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01], 'kernel': ['rbf','linear','poly','sigmoid']
        }
    },
    
    'MultinomialNB' :{
        'model' : MultinomialNB(),
        'params' : {
            'alpha' : np.linspace(0.5, 1.5, 6), 'fit_prior' : [True, False]
        }
    },
    
    'logistics_regression' :{
        'model' : LogisticRegression(solver = 'lbfgs', multi_class = 'auto'),
        'params' : {
            'C' : [0.1, 1, 20, 40, 60, 80, 100], 'solver' : ['lbfgs', 'liblinear']
        }
    },
    
    'random_forest' :{
        'model' : RandomForestClassifier(),
        'params' : {
            'n_estimators' : [80,85,90,95,100], 
            'max_depth':[20,30,None], 'criterion':['gini','entropy']
        }
    }
}

# Setup 1: Models after removing Punctuations:

In [9]:
# Creating a df that is copy of the train set.
df = train.copy()

### Removing Punctuations:

In [10]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [11]:
punctuations_list = string.punctuation
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

df['text'] = df['text'].apply(lambda x: cleaning_punctuations(x))

### Splitting data into Train and Test :

In [12]:
# Splitting data into Train and Test sets:
X = df['text']
y = df['target']
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size = 0.2, random_state = 3)

### Transforming dataset using TF-IDF Vectorizer:

In [13]:
# Extracting features using TF-IDF (1,2) - unigrams and bigrams
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)
vectoriser.fit(X_train)
print('No. of feature_words: ', len(vectoriser.get_feature_names()))

# Transforming the data using TD-IDF Vectorizer
X_train = vectoriser.transform(X_train)
X_test  = vectoriser.transform(X_test)

No. of feature_words:  62381


### Results:

In [14]:
%%time

# implemented GridSearchCV for four models using a loop and a previously created dictionary
# in the created variable 'scores', results are stored for each model such as: model, best_score and best_params.


scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, n_jobs=-1, verbose=1) # Using Cross Validation of 5 and n_jobs=-1 for fast training by using all the processors
    print(mp['model'])
    print('\nTraining the model...')
    best_model = clf.fit(X_train, y_train)                      # Training the model
    clf_pred = best_model.predict(X_test)                       # Predicting the results
    print(confusion_matrix(y_test,clf_pred))                    # Printing Confusion Matrix
    print(metrics.classification_report(y_test, clf_pred))      # Printing Classification Report
    scores.append({                                             # Appending results to 'scores' list
        'model' : model_name,
        'best_score' : best_model.score(X_test, y_test),
        'best_params' : clf.best_params_
    })
    print('\nScore is appended.\n')
    
# Creating data frame with model, best scores and best params:
res = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
res

SVC()

Training the model...
Fitting 5 folds for each of 48 candidates, totalling 240 fits
[[735 108]
 [185 473]]
              precision    recall  f1-score   support

           0       0.80      0.87      0.83       843
           1       0.81      0.72      0.76       658

    accuracy                           0.80      1501
   macro avg       0.81      0.80      0.80      1501
weighted avg       0.81      0.80      0.80      1501


Score is appended.

MultinomialNB()

Training the model...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[[778  65]
 [237 421]]
              precision    recall  f1-score   support

           0       0.77      0.92      0.84       843
           1       0.87      0.64      0.74       658

    accuracy                           0.80      1501
   macro avg       0.82      0.78      0.79      1501
weighted avg       0.81      0.80      0.79      1501


Score is appended.

LogisticRegression()

Training the model...
Fitting 5 folds for eac

Unnamed: 0,model,best_score,best_params
0,SVC,0.804797,"{'C': 1, 'gamma': 1, 'kernel': 'linear'}"
1,MultinomialNB,0.798801,"{'alpha': 0.5, 'fit_prior': True}"
2,logistics_regression,0.790806,"{'C': 20, 'solver': 'liblinear'}"
3,random_forest,0.775483,"{'criterion': 'entropy', 'max_depth': None, 'n_estimators': 85}"


# Setup 2: Models after removing Stop-words:

In [15]:
# Creating a df that is copy of the train set.
df = train.copy()

### Removing Stop-words:

In [16]:
sw = stopwords.words('english')
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (sw)]))

### Splitting data into Train and Test :

In [17]:
# Splitting data into Train and Test sets:
X = df['text']
y = df['target']
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size = 0.2, random_state = 3)

### Transforming dataset using TF-IDF Vectorizer:

In [18]:
# Extracting features using TF-IDF (1,2) - unigrams and bigrams
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)
vectoriser.fit(X_train)
print('No. of feature_words: ', len(vectoriser.get_feature_names()))

# Transforming the data using TD-IDF Vectorizer
X_train = vectoriser.transform(X_train)
X_test  = vectoriser.transform(X_test)

No. of feature_words:  51017


### Results:

In [19]:
%%time

# implemented GridSearchCV for four models using a loop and a previously created dictionary
# in the created variable 'scores', results are stored for each model such as: model, best_score and best_params.


scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, n_jobs=-1, verbose=1) # Using Cross Validation of 5 and n_jobs=-1 for fast training by using all the processors
    print(mp['model'])
    print('\nTraining the model...')
    best_model = clf.fit(X_train, y_train)                      # Training the model
    clf_pred = best_model.predict(X_test)                       # Predicting the results
    print(confusion_matrix(y_test,clf_pred))                    # Printing Confusion Matrix
    print(metrics.classification_report(y_test, clf_pred))      # Printing Classification Report
    scores.append({                                             # Appending results to 'scores' list
        'model' : model_name,
        'best_score' : best_model.score(X_test, y_test),
        'best_params' : clf.best_params_
    })
    print('\nScore is appended.\n')
    
# Creating data frame with model, best scores and best params:
res = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
res

SVC()

Training the model...
Fitting 5 folds for each of 48 candidates, totalling 240 fits
[[728 115]
 [182 476]]
              precision    recall  f1-score   support

           0       0.80      0.86      0.83       843
           1       0.81      0.72      0.76       658

    accuracy                           0.80      1501
   macro avg       0.80      0.79      0.80      1501
weighted avg       0.80      0.80      0.80      1501


Score is appended.

MultinomialNB()

Training the model...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[[767  76]
 [220 438]]
              precision    recall  f1-score   support

           0       0.78      0.91      0.84       843
           1       0.85      0.67      0.75       658

    accuracy                           0.80      1501
   macro avg       0.81      0.79      0.79      1501
weighted avg       0.81      0.80      0.80      1501


Score is appended.

LogisticRegression()

Training the model...
Fitting 5 folds for eac

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

[[703 140]
 [183 475]]
              precision    recall  f1-score   support

           0       0.79      0.83      0.81       843
           1       0.77      0.72      0.75       658

    accuracy                           0.78      1501
   macro avg       0.78      0.78      0.78      1501
weighted avg       0.78      0.78      0.78      1501


Score is appended.

RandomForestClassifier()

Training the model...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[[717 126]
 [224 434]]
              precision    recall  f1-score   support

           0       0.76      0.85      0.80       843
           1       0.78      0.66      0.71       658

    accuracy                           0.77      1501
   macro avg       0.77      0.76      0.76      1501
weighted avg       0.77      0.77      0.76      1501


Score is appended.

CPU times: user 19.6 s, sys: 571 ms, total: 20.1 s
Wall time: 34min 41s


Unnamed: 0,model,best_score,best_params
0,SVC,0.802132,"{'C': 1, 'gamma': 1, 'kernel': 'linear'}"
1,MultinomialNB,0.802798,"{'alpha': 0.5, 'fit_prior': True}"
2,logistics_regression,0.78481,"{'C': 40, 'solver': 'liblinear'}"
3,random_forest,0.766822,"{'criterion': 'gini', 'max_depth': None, 'n_estimators': 80}"


# Setup 3: Models after removing numbers:

In [20]:
# Creating a df that is copy of the train set.
df = train.copy()

### Removing numbers:

In [21]:
def cleaning_numbers(text):
    return re.sub('[0-9]+', '', text)

df['text'] = df['text'].apply(lambda text: cleaning_numbers(text))

### Splitting data into Train and Test :

In [22]:
# Splitting data into Train and Test sets:
X = df['text']
y = df['target']
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size = 0.2, random_state = 3)

### Transforming dataset using TF-IDF Vectorizer:

In [23]:
# Extracting features using TF-IDF (1,2) - unigrams and bigrams
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)
vectoriser.fit(X_train)
print('No. of feature_words: ', len(vectoriser.get_feature_names()))

# Transforming the data using TD-IDF Vectorizer
X_train = vectoriser.transform(X_train)
X_test  = vectoriser.transform(X_test)

No. of feature_words:  60344


### Results:

In [24]:
%%time

# implemented GridSearchCV for four models using a loop and a previously created dictionary
# in the created variable 'scores', results are stored for each model such as: model, best_score and best_params.


scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, n_jobs=-1, verbose=1) # Using Cross Validation of 5 and n_jobs=-1 for fast training by using all the processors
    print(mp['model'])
    print('\nTraining the model...')
    best_model = clf.fit(X_train, y_train)                      # Training the model
    clf_pred = best_model.predict(X_test)                       # Predicting the results
    print(confusion_matrix(y_test,clf_pred))                    # Printing Confusion Matrix
    print(metrics.classification_report(y_test, clf_pred))      # Printing Classification Report
    scores.append({                                             # Appending results to 'scores' list
        'model' : model_name,
        'best_score' : best_model.score(X_test, y_test),
        'best_params' : clf.best_params_
    })
    print('\nScore is appended.\n')
    
# Creating data frame with model, best scores and best params:
res = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
res

SVC()

Training the model...
Fitting 5 folds for each of 48 candidates, totalling 240 fits
[[733 110]
 [187 471]]
              precision    recall  f1-score   support

           0       0.80      0.87      0.83       843
           1       0.81      0.72      0.76       658

    accuracy                           0.80      1501
   macro avg       0.80      0.79      0.80      1501
weighted avg       0.80      0.80      0.80      1501


Score is appended.

MultinomialNB()

Training the model...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[[772  71]
 [236 422]]
              precision    recall  f1-score   support

           0       0.77      0.92      0.83       843
           1       0.86      0.64      0.73       658

    accuracy                           0.80      1501
   macro avg       0.81      0.78      0.78      1501
weighted avg       0.81      0.80      0.79      1501


Score is appended.

LogisticRegression()

Training the model...
Fitting 5 folds for eac

Unnamed: 0,model,best_score,best_params
0,SVC,0.802132,"{'C': 100, 'gamma': 0.01, 'kernel': 'sigmoid'}"
1,MultinomialNB,0.79547,"{'alpha': 0.5, 'fit_prior': True}"
2,logistics_regression,0.787475,"{'C': 40, 'solver': 'lbfgs'}"
3,random_forest,0.774817,"{'criterion': 'gini', 'max_depth': None, 'n_estimators': 90}"


# Setup 4: Models after removing repeating characters:

In [25]:
# Creating a df that is copy of the train set.
df = train.copy()

### Removing repeating characteres:

In [26]:
tokens = (word_tokenize(i) for i in df.text)
df['text'] = df['text'].apply(nltk.word_tokenize)

pattern = re.compile(r'(.)\1*')

def reduce_sequence_word(word):
    return ''.join([match.group()[:2] if len(match.group()) > 2 else match.group() for match in pattern.finditer(word)])

def reduce_sequence_tweet(tweet):
    return [reduce_sequence_word(word) for word in tweet]

df.text = df.text.apply(lambda tweet: reduce_sequence_tweet(tweet))

### Splitting data into Train and Test :

In [27]:
# Splitting data into Train and Test sets:
X = df['text'].astype(str)
y = df['target'].astype(str)
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size = 0.2, random_state = 3)

### Transforming dataset using TF-IDF Vectorizer:

In [28]:
# Extracting features using TF-IDF (1,2) - unigrams and bigrams
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)
vectoriser.fit(X_train)
print('No. of feature_words: ', len(vectoriser.get_feature_names()))

# Transforming the data using TD-IDF Vectorizer
X_train = vectoriser.transform(X_train)
X_test  = vectoriser.transform(X_test)

No. of feature_words:  62005


### Results:

In [29]:
%%time

# implemented GridSearchCV for four models using a loop and a previously created dictionary
# in the created variable 'scores', results are stored for each model such as: model, best_score and best_params.


scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, n_jobs=-1, verbose=1) # Using Cross Validation of 5 and n_jobs=-1 for fast training by using all the processors
    print(mp['model'])
    print('\nTraining the model...')
    best_model = clf.fit(X_train, y_train)                      # Training the model
    clf_pred = best_model.predict(X_test)                       # Predicting the results
    print(confusion_matrix(y_test,clf_pred))                    # Printing Confusion Matrix
    print(metrics.classification_report(y_test, clf_pred))      # Printing Classification Report
    scores.append({                                             # Appending results to 'scores' list
        'model' : model_name,
        'best_score' : best_model.score(X_test, y_test),
        'best_params' : clf.best_params_
    })
    print('\nScore is appended.\n')
    
# Creating data frame with model, best scores and best params:
res = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
res

SVC()

Training the model...
Fitting 5 folds for each of 48 candidates, totalling 240 fits
[[732 111]
 [187 471]]
              precision    recall  f1-score   support

           0       0.80      0.87      0.83       843
           1       0.81      0.72      0.76       658

    accuracy                           0.80      1501
   macro avg       0.80      0.79      0.80      1501
weighted avg       0.80      0.80      0.80      1501


Score is appended.

MultinomialNB()

Training the model...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[[779  64]
 [233 425]]
              precision    recall  f1-score   support

           0       0.77      0.92      0.84       843
           1       0.87      0.65      0.74       658

    accuracy                           0.80      1501
   macro avg       0.82      0.78      0.79      1501
weighted avg       0.81      0.80      0.80      1501


Score is appended.

LogisticRegression()

Training the model...
Fitting 5 folds for eac

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

[[703 140]
 [172 486]]
              precision    recall  f1-score   support

           0       0.80      0.83      0.82       843
           1       0.78      0.74      0.76       658

    accuracy                           0.79      1501
   macro avg       0.79      0.79      0.79      1501
weighted avg       0.79      0.79      0.79      1501


Score is appended.

RandomForestClassifier()

Training the model...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[[771  72]
 [266 392]]
              precision    recall  f1-score   support

           0       0.74      0.91      0.82       843
           1       0.84      0.60      0.70       658

    accuracy                           0.77      1501
   macro avg       0.79      0.76      0.76      1501
weighted avg       0.79      0.77      0.77      1501


Score is appended.

CPU times: user 29.7 s, sys: 1.83 s, total: 31.6 s
Wall time: 36min 15s


Unnamed: 0,model,best_score,best_params
0,SVC,0.801466,"{'C': 1, 'gamma': 1, 'kernel': 'linear'}"
1,MultinomialNB,0.802132,"{'alpha': 0.5, 'fit_prior': True}"
2,logistics_regression,0.792139,"{'C': 80, 'solver': 'lbfgs'}"
3,random_forest,0.774817,"{'criterion': 'entropy', 'max_depth': None, 'n_estimators': 95}"


# Setup 5: Applying Stemming and Lemmatization:

In [30]:
# Creating a df that is copy of the train set.
df = train.copy()

### Applying Stemming: 

In [31]:
# Tokenizing tweets:
tokens = (word_tokenize(i) for i in df.text)
df['text'] = df['text'].apply(nltk.word_tokenize)

stemm = SnowballStemmer('english')
df['text'] = df['text'].apply(lambda x: [stemm.stem(y) for y in x])

### Splitting data into Train and Test :

In [32]:
# Splitting data into Train and Test sets:
X = df['text'].astype(str)
y = df['target'].astype(str)
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size = 0.2, random_state = 3)

### Transforming dataset using TF-IDF Vectorizer:

In [33]:
# Extracting features using TF-IDF (1,2) - unigrams and bigrams
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)
vectoriser.fit(X_train)
print('No. of feature_words: ', len(vectoriser.get_feature_names()))

# Transforming the data using TD-IDF Vectorizer
X_train = vectoriser.transform(X_train)
X_test  = vectoriser.transform(X_test)

No. of feature_words:  57871


### Results:

In [34]:
%%time

# implemented GridSearchCV for four models using a loop and a previously created dictionary
# in the created variable 'scores', results are stored for each model such as: model, best_score and best_params.


scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, n_jobs=-1, verbose=1) # Using Cross Validation of 5 and n_jobs=-1 for fast training by using all the processors
    print(mp['model'])
    print('\nTraining the model...')
    best_model = clf.fit(X_train, y_train)                      # Training the model
    clf_pred = best_model.predict(X_test)                       # Predicting the results
    print(confusion_matrix(y_test,clf_pred))                    # Printing Confusion Matrix
    print(metrics.classification_report(y_test, clf_pred))      # Printing Classification Report
    scores.append({                                             # Appending results to 'scores' list
        'model' : model_name,
        'best_score' : best_model.score(X_test, y_test),
        'best_params' : clf.best_params_
    })
    print('\nScore is appended.\n')
    
# Creating data frame with model, best scores and best params:
res = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
res

SVC()

Training the model...
Fitting 5 folds for each of 48 candidates, totalling 240 fits
[[709 134]
 [184 474]]
              precision    recall  f1-score   support

           0       0.79      0.84      0.82       843
           1       0.78      0.72      0.75       658

    accuracy                           0.79      1501
   macro avg       0.79      0.78      0.78      1501
weighted avg       0.79      0.79      0.79      1501


Score is appended.

MultinomialNB()

Training the model...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[[774  69]
 [232 426]]
              precision    recall  f1-score   support

           0       0.77      0.92      0.84       843
           1       0.86      0.65      0.74       658

    accuracy                           0.80      1501
   macro avg       0.81      0.78      0.79      1501
weighted avg       0.81      0.80      0.79      1501


Score is appended.

LogisticRegression()

Training the model...
Fitting 5 folds for eac

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

[[695 148]
 [178 480]]
              precision    recall  f1-score   support

           0       0.80      0.82      0.81       843
           1       0.76      0.73      0.75       658

    accuracy                           0.78      1501
   macro avg       0.78      0.78      0.78      1501
weighted avg       0.78      0.78      0.78      1501


Score is appended.

RandomForestClassifier()

Training the model...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[[769  74]
 [259 399]]
              precision    recall  f1-score   support

           0       0.75      0.91      0.82       843
           1       0.84      0.61      0.71       658

    accuracy                           0.78      1501
   macro avg       0.80      0.76      0.76      1501
weighted avg       0.79      0.78      0.77      1501


Score is appended.

CPU times: user 30.3 s, sys: 1.43 s, total: 31.7 s
Wall time: 34min 36s


Unnamed: 0,model,best_score,best_params
0,SVC,0.788141,"{'C': 10, 'gamma': 0.1, 'kernel': 'sigmoid'}"
1,MultinomialNB,0.799467,"{'alpha': 0.5, 'fit_prior': True}"
2,logistics_regression,0.782811,"{'C': 20, 'solver': 'lbfgs'}"
3,random_forest,0.778148,"{'criterion': 'entropy', 'max_depth': None, 'n_estimators': 100}"


# Setup 6: Models after removing all the features:

In [35]:
# Creating a df that is copy of the train set.
df = train.copy()

### Removing Punctuation:

In [36]:
import string
string.punctuation

punctuations_list = string.punctuation
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

df['text'] = df['text'].apply(lambda x: cleaning_punctuations(x))

### Removing Stop-words: 

In [37]:
sw = stopwords.words('english')
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (sw)]))

### Removing Numbers:

In [38]:
def cleaning_numbers(text):
    return re.sub('[0-9]+', '', text)

df['text'] = df['text'].apply(lambda text: cleaning_numbers(text))

### Removing repeating characters:

In [39]:
tokens = (word_tokenize(i) for i in df.text)
df['text'] = df['text'].apply(nltk.word_tokenize)

pattern = re.compile(r'(.)\1*')

def reduce_sequence_word(word):
    return ''.join([match.group()[:2] if len(match.group()) > 2 else match.group() for match in pattern.finditer(word)])

def reduce_sequence_tweet(tweet):
    return [reduce_sequence_word(word) for word in tweet]

df.text = df.text.apply(lambda tweet: reduce_sequence_tweet(tweet))

### Applying Stemming and Lemmatization:

In [40]:
stemm = SnowballStemmer('english')
df['text'] = df['text'].apply(lambda x: [stemm.stem(y) for y in x])

### Splitting data into Train and Test :

In [41]:
# Splitting data into Train and Test sets:
X = df['text'].astype(str)
y = df['target'].astype(str)
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size = 0.2, random_state = 3)

### Transforming dataset using TF-IDF Vectorizer:

In [42]:
# Extracting features using TF-IDF (1,2) - unigrams and bigrams
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)
vectoriser.fit(X_train)
print('No. of feature_words: ', len(vectoriser.get_feature_names()))

# Transforming the data using TD-IDF Vectorizer
X_train = vectoriser.transform(X_train)
X_test  = vectoriser.transform(X_test)

No. of feature_words:  46081


### Results:

In [43]:
%%time

# implemented GridSearchCV for four models using a loop and a previously created dictionary
# in the created variable 'scores', results are stored for each model such as: model, best_score and best_params.


scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, n_jobs=-1, verbose=1) # Using Cross Validation of 5 and n_jobs=-1 for fast training by using all the processors
    print(mp['model'])
    print('\nTraining the model...')
    best_model = clf.fit(X_train, y_train)                      # Training the model
    clf_pred = best_model.predict(X_test)                       # Predicting the results
    print(confusion_matrix(y_test,clf_pred))                    # Printing Confusion Matrix
    print(metrics.classification_report(y_test, clf_pred))      # Printing Classification Report
    scores.append({                                             # Appending results to 'scores' list
        'model' : model_name,
        'best_score' : best_model.score(X_test, y_test),
        'best_params' : clf.best_params_
    })
    print('\nScore is appended.\n')
    
# Creating data frame with model, best scores and best params:
res = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
res

SVC()

Training the model...
Fitting 5 folds for each of 48 candidates, totalling 240 fits
[[713 130]
 [179 479]]
              precision    recall  f1-score   support

           0       0.80      0.85      0.82       843
           1       0.79      0.73      0.76       658

    accuracy                           0.79      1501
   macro avg       0.79      0.79      0.79      1501
weighted avg       0.79      0.79      0.79      1501


Score is appended.

MultinomialNB()

Training the model...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[[760  83]
 [222 436]]
              precision    recall  f1-score   support

           0       0.77      0.90      0.83       843
           1       0.84      0.66      0.74       658

    accuracy                           0.80      1501
   macro avg       0.81      0.78      0.79      1501
weighted avg       0.80      0.80      0.79      1501


Score is appended.

LogisticRegression()

Training the model...
Fitting 5 folds for eac

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

[[701 142]
 [180 478]]
              precision    recall  f1-score   support

           0       0.80      0.83      0.81       843
           1       0.77      0.73      0.75       658

    accuracy                           0.79      1501
   macro avg       0.78      0.78      0.78      1501
weighted avg       0.78      0.79      0.78      1501


Score is appended.

RandomForestClassifier()

Training the model...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[[728 115]
 [232 426]]
              precision    recall  f1-score   support

           0       0.76      0.86      0.81       843
           1       0.79      0.65      0.71       658

    accuracy                           0.77      1501
   macro avg       0.77      0.76      0.76      1501
weighted avg       0.77      0.77      0.77      1501


Score is appended.

CPU times: user 20.7 s, sys: 602 ms, total: 21.3 s
Wall time: 31min 42s


Unnamed: 0,model,best_score,best_params
0,SVC,0.794137,"{'C': 1, 'gamma': 1, 'kernel': 'linear'}"
1,MultinomialNB,0.796802,"{'alpha': 0.5, 'fit_prior': True}"
2,logistics_regression,0.785476,"{'C': 20, 'solver': 'liblinear'}"
3,random_forest,0.768821,"{'criterion': 'gini', 'max_depth': None, 'n_estimators': 80}"


# Setup 7: Models without removing any setup:

In [44]:
# Creating a df that is copy of the train set.
df = train.copy()

### Splitting data into Train and Test :

In [45]:
# Splitting data into Train and Test sets:
X = df['text']
y = df['target']
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size = 0.2, random_state = 3)

### Transforming dataset using TF-IDF Vectorizer:

In [46]:
# Extracting features using TF-IDF (1,2) - unigrams and bigrams
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)
vectoriser.fit(X_train)
print('No. of feature_words: ', len(vectoriser.get_feature_names()))

# Transforming the data using TD-IDF Vectorizer
X_train = vectoriser.transform(X_train)
X_test  = vectoriser.transform(X_test)

No. of feature_words:  62117


### Results:

In [47]:
%%time

# implemented GridSearchCV for four models using a loop and a previously created dictionary
# in the created variable 'scores', results are stored for each model such as: model, best_score and best_params.


scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, n_jobs=-1, verbose=1) # Using Cross Validation of 5 and n_jobs=-1 for fast training by using all the processors
    print(mp['model'])
    print('\nTraining the model...')
    best_model = clf.fit(X_train, y_train)                      # Training the model
    clf_pred = best_model.predict(X_test)                       # Predicting the results
    print(confusion_matrix(y_test,clf_pred))                    # Printing Confusion Matrix
    print(metrics.classification_report(y_test, clf_pred))      # Printing Classification Report
    scores.append({                                             # Appending results to 'scores' list
        'model' : model_name,
        'best_score' : best_model.score(X_test, y_test),
        'best_params' : clf.best_params_
    })
    print('\nScore is appended.\n')
    
# Creating data frame with model, best scores and best params:
res = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
res

SVC()

Training the model...
Fitting 5 folds for each of 48 candidates, totalling 240 fits
[[736 107]
 [187 471]]
              precision    recall  f1-score   support

           0       0.80      0.87      0.83       843
           1       0.81      0.72      0.76       658

    accuracy                           0.80      1501
   macro avg       0.81      0.79      0.80      1501
weighted avg       0.81      0.80      0.80      1501


Score is appended.

MultinomialNB()

Training the model...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[[777  66]
 [233 425]]
              precision    recall  f1-score   support

           0       0.77      0.92      0.84       843
           1       0.87      0.65      0.74       658

    accuracy                           0.80      1501
   macro avg       0.82      0.78      0.79      1501
weighted avg       0.81      0.80      0.80      1501


Score is appended.

LogisticRegression()

Training the model...
Fitting 5 folds for eac

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

[[705 138]
 [180 478]]
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       843
           1       0.78      0.73      0.75       658

    accuracy                           0.79      1501
   macro avg       0.79      0.78      0.78      1501
weighted avg       0.79      0.79      0.79      1501


Score is appended.

RandomForestClassifier()

Training the model...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[[758  85]
 [256 402]]
              precision    recall  f1-score   support

           0       0.75      0.90      0.82       843
           1       0.83      0.61      0.70       658

    accuracy                           0.77      1501
   macro avg       0.79      0.76      0.76      1501
weighted avg       0.78      0.77      0.77      1501


Score is appended.

CPU times: user 28.7 s, sys: 1.44 s, total: 30.2 s
Wall time: 36min 10s


Unnamed: 0,model,best_score,best_params
0,SVC,0.804131,"{'C': 1, 'gamma': 1, 'kernel': 'linear'}"
1,MultinomialNB,0.800799,"{'alpha': 0.5, 'fit_prior': True}"
2,logistics_regression,0.788141,"{'C': 40, 'solver': 'lbfgs'}"
3,random_forest,0.772818,"{'criterion': 'gini', 'max_depth': None, 'n_estimators': 95}"


# Creating Submission file:
It can be observed that **Setup-1 and 7** is performing best for SVM model. **Setup 1** will be used. Let's just train this model with 100% training data. This model will be used for predicting test file.

In [48]:
# Creating a df that is copy of the train set.
df = train.copy()

In [49]:
import string
string.punctuation

punctuations_list = string.punctuation
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

df['text'] = df['text'].apply(lambda x: cleaning_punctuations(x))

### Splitting data into Train and Test :

In [50]:
# Not spliiting, Creating X_train and y_train.
# Using 100% data for training SVC model to get better training. Because from Step - 7,
# it can be concluded that SVC model with 'TF-IDF Vectorizer (1,2) - unigrams and bigrams' performs best for this dataset


X_train = df['text']
y_train = df['target']    
X_test = test['text']   

### Transforming dataset using TF-IDF Vectorizer:

In [51]:
# Extracting features using TF-IDF (1,2) - unigrams and bigrams
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)
vectoriser.fit(X_train)
print('No. of feature_words: ', len(vectoriser.get_feature_names()))

# Transforming the data using TD-IDF Vectorizer
X_train = vectoriser.transform(X_train)
X_test  = vectoriser.transform(X_test)

No. of feature_words:  74470


### SVC model:

In [52]:
svc = SVC()
hyperParam = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01], 'kernel': ['rbf','linear','poly','sigmoid']}

gsv = GridSearchCV(svc,hyperParam,cv=5,verbose=1,n_jobs=-1)  # Using Cross Validation of 5 and n_jobs=-1 for fast training by using all the processors
best_model = gsv.fit(X_train, y_train)                       # Training model with X_train and y_train
svc_pred = best_model.predict(X_test)                        # Predicting the results

print("Best HyperParameter: ", gsv.best_params_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best HyperParameter:  {'C': 10, 'gamma': 0.1, 'kernel': 'sigmoid'}


### Submission file:

In [53]:
print(svc_pred)
print(type(svc_pred))

my_array = svc_pred
print(len(my_array))

submission = pd.DataFrame(my_array,columns = ['target'])
submission['id'] = test['id']
submission = submission[['id','target']]
submission.to_csv('submission.csv', index=False)
submission

[1 1 1 ... 1 1 1]
<class 'numpy.ndarray'>
3263


Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1
