## Import libraries

In [1]:
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import TfidfVectorizer as tvect
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV as GSCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

## Import data

In [3]:
original_train = pd.read_csv('train.csv')

In [4]:
train_data = original_train.copy()

In [5]:
original_train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


In [6]:
train_data.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


Check for missing values:

In [7]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15819 entries, 0 to 15818
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  15819 non-null  int64 
 1   message    15819 non-null  object
 2   tweetid    15819 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 370.9+ KB


In [8]:
train_data['sentiment'].isnull().sum()

0

In [9]:
train_data['message'].isnull().sum()

0

In [10]:
train_data['tweetid'].isnull().sum()

0

## Data clean-up

Convert everything to lower case:

In [11]:
train_data['message'] = train_data['message'].str.lower()

Remove punctuation:

In [12]:
def remove_punctuation_numbers(msg):
    punc_numbers = string.punctuation
    return ''.join([l for l in msg if l not in punc_numbers])
train_data['message'] = train_data['message'].apply(remove_punctuation_numbers)

Seperate feature and response:

In [13]:
y = train_data['sentiment']
X = train_data['message']

Remove English stop words:

In [14]:
vectorizer = tvect(ngram_range=(1,2), min_df=2, stop_words="english")
X_trans = vectorizer.fit_transform(X)

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X_trans, y, test_size=0.2, shuffle=True, stratify=y, random_state=11)

KNN hyperparameters:

In [48]:
ks = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 50, 100]

In [49]:
param_grid = {'n_neighbors': ks}

In [61]:
grid_knn = GSCV(KNeighborsClassifier(), param_grid, scoring='f1_macro')

In [62]:
grid_knn.fit(X_train, y_train)
grid_knn.best_params_

{'n_neighbors': 7}

SVC hyperparameters:

In [None]:
nfolds = 2

Cs = [10, 20, 50, 100]
gammas = [0.001, 0.01, 0.1, 1]

param_grid = {
    'C'     : Cs, 
    'gamma' : gammas
    }

grid_SVM = GSCV(SVC(), param_grid, scoring='f1_macro', cv=nfolds)
grid_SVM.fit(X_train, y_train)

In [75]:
grid_SVM.best_params_

{'C': 10, 'gamma': 0.1}

Check which model is better:

In [76]:
names = ['Logistic Regression', 'Nearest Neighbors', 
         'Linear SVM', 'RBF SVM',          
         'Decision Tree', 'Random Forest',  'AdaBoost']

In [87]:
classifiers = [
    LogisticRegression(), 
    KNeighborsClassifier(7),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=0.1, C=10),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),    
    AdaBoostClassifier()
]

In [89]:
results = []

models = {}
#confusion = {}
class_report = {}


for name, clf in zip(names, classifiers):    
    print ('Fitting {:s} model...'.format(name))
    run_time = %timeit -q -o clf.fit(X_train, y_train)
    
    print ('... predicting')
    y_pred = clf.predict(X_train)   
    y_pred_test = clf.predict(X_val)
    
    print ('... scoring')
    accuracy  = metrics.accuracy_score(y_train, y_pred)
    precision = metrics.precision_score(y_train, y_pred, average='macro')
    recall    = metrics.recall_score(y_train, y_pred, average='macro')
    
    f1        = metrics.f1_score(y_train, y_pred, average='macro')    
    f1_test   = metrics.f1_score(y_val, y_pred_test, average='macro')    
    
    # Save the results to dictionaries
    models[name] = clf    
    #confusion[name] = metrics.confusion_matrix(y_train, y_pred)
    #class_report[name] = metrics.classification_report(y_train, y_pred)
    
    results.append([name, accuracy, precision, recall, f1, f1_test])

    
results = pd.DataFrame(results, columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1 Train', 'F1 Test'])
results.set_index('Classifier', inplace= True)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

... predicting
... scoring
... predicting
... scoring
... predicting
... scoring


  _warn_prf(average, modifier, msg_start, len(result))


... predicting
... scoring
... predicting
... scoring
... predicting
... scoring


  _warn_prf(average, modifier, msg_start, len(result))


... predicting
... scoring


In [90]:
results.sort_values('F1 Train', ascending=False)

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Train,F1 Test
Classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RBF SVM,0.96776,0.974644,0.953279,0.963531,0.641618
Logistic Regression,0.853892,0.90528,0.730786,0.787555,0.592096
Nearest Neighbors,0.752746,0.723014,0.656556,0.683052,0.555943
AdaBoost,0.60964,0.602973,0.454813,0.491381,0.471619
Decision Tree,0.577242,0.650808,0.337292,0.339889,0.328961
Linear SVM,0.539234,0.134808,0.25,0.175163,0.175154
Random Forest,0.539234,0.134808,0.25,0.175163,0.175154


## Test data

In [None]:
original_test = pd.read_csv('test.csv')

In [None]:
test_data = original_test.copy()

In [None]:
X_test = test_data['message']

In [None]:
test_vect = vectorizer.transform(X_test)

In [None]:
y_pred = ......predict(test_vect)

In [None]:
test_data['sentiment'] = y_pred

In [None]:
test_data.head()

In [None]:
# Final .csv for submission
#test[['tweetid','sentiment']].to_csv('testsubmission.csv', index=False)