In [42]:
#Imports
import numpy as np
import pandas as pd
import re

from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

In [2]:
#Settings
dataset_path = "/kaggle/input/covid-19-nlp-text-classification/"

In [3]:
#Load dataset
train_dataset = pd.read_csv(dataset_path + "Corona_NLP_train.csv", encoding="ISO-8859-1")
test_dataset = pd.read_csv(dataset_path + "Corona_NLP_test.csv", encoding="ISO-8859-1")

In [4]:
train_dataset

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative
...,...,...,...,...,...,...
41152,44951,89903,"Wellington City, New Zealand",14-04-2020,Airline pilots offering to stock supermarket s...,Neutral
41153,44952,89904,,14-04-2020,Response to complaint not provided citing COVI...,Extremely Negative
41154,44953,89905,,14-04-2020,You know itÂs getting tough when @KameronWild...,Positive
41155,44954,89906,,14-04-2020,Is it wrong that the smell of hand sanitizer i...,Neutral


In [5]:
#I will use only tweet text for classification
train_X = train_dataset["OriginalTweet"]
train_y = train_dataset["Sentiment"]

test_X = test_dataset["OriginalTweet"]
test_y = test_dataset["Sentiment"]

In [6]:
#All possible values for output
print(set(train_y))

{'Positive', 'Extremely Negative', 'Negative', 'Extremely Positive', 'Neutral'}


In [7]:
#For task purpose we need only Positive, Neutral and Negative, lets exchange labels to numbers
#I will create dictionary for this
label2num = { 'Positive': 2, 'Extremely Positive': 2, 'Neutral': 1, 'Extremely Negative': 0, 'Negative': 0 }
num2label = { 0: 'Negative', 1:'Neutral', 2:'Positive' }

#Apply dict
train_y = [label2num[x] for x in train_y]
test_y = [label2num[x] for x in test_y]

In [10]:
#Example tweet
print(train_X[4])
print(train_y[4])

Me, ready to go at supermarket during the #COVID19 outbreak.

Not because I'm paranoid, but because my food stock is litteraly empty. The #coronavirus is a serious thing, but please, don't panic. It causes shortage...

#CoronavirusFrance #restezchezvous #StayAtHome #confinement https://t.co/usmuaLq72n
0


In [9]:
#Sizes of train and test sets
print("Train set size: ", len(train_X))
print("Test set size: ", len(test_X))

Train set size:  41157
Test set size:  3798


In [28]:
#Lets create some preprocessing pipeline

#Create custom one for various preprocessings with text 
class PreprocessText(BaseEstimator, TransformerMixin):    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y=None):
        X_ = X.copy()
        X_ = [re.sub(r'http\S+', '', x) for x in X_] #Delete links
        return X_

#Rest will do count vectorizer (lowercase, split, stopwords)
#Create pipeline for preprocessing
preprocessing_pipeline = Pipeline([
    ('delete_links', PreprocessText()),
    ('tfidf_vectorizer', TfidfVectorizer(ngram_range=(1,1), lowercase=True, stop_words='english')),
])

In [29]:
#List of models to check
models = []
models.append(("MultinomialNB",MultinomialNB()))
models.append(("LogisticRegression",LogisticRegression(solver='liblinear')))
models.append(("LinearSVC", LinearSVC()))
models.append(("KNeighbors",KNeighborsClassifier()))
models.append(("DecisionTree",DecisionTreeClassifier()))
models.append(("RandomForest",RandomForestClassifier()))

In [23]:
#Evaluate function which prints some crucial metrics
def evaluate_data(predicted, true, avg="weighted"):
    print("Precision: ", precision_score(predicted, true, average=avg))
    print("Recall: ", recall_score(predicted, true, average=avg))
    print("F1: ", f1_score(predicted,true, average=avg))

In [30]:
#Check diffrent models
for name, model in models:
    text_preprocess_train = Pipeline([
        ('preprocess', preprocessing_pipeline),
        (name, model),
    ])
    text_preprocess_train.fit(train_X, train_y)
    print(name)
    print("Train set")
    evaluate_data(text_preprocess_train.predict(train_X), train_y)
    print("Test set")
    evaluate_data(text_preprocess_train.predict(test_X), test_y)
    print("\n")

MultinomialNB
Train set
Precision:  0.8783144209616598
Recall:  0.7346502417571737
F1:  0.7846823123855224
Test set
Precision:  0.7909718717326698
Recall:  0.6327014218009479
F1:  0.6873768177975856


LogisticRegression
Train set
Precision:  0.8876341344139663
Recall:  0.8816969166848896
F1:  0.8829740018711892
Test set
Precision:  0.7854617546384094
Recall:  0.7696155871511322
F1:  0.7745623808395536


LinearSVC
Train set
Precision:  0.9707635500567147
Recall:  0.9704060062686785
F1:  0.9704690408647652
Test set
Precision:  0.8005391151410735
Recall:  0.7962085308056872
F1:  0.7978000942910557


KNeighbors
Train set
Precision:  0.9383891446422725
Recall:  0.23731078552858567
F1:  0.3165752443731525
Test set
Precision:  0.9876168272497111
Recall:  0.16850974196945762
F1:  0.27874452946466305


DecisionTree
Train set
Precision:  0.9999271065860657
Recall:  0.9999271083898243
F1:  0.999927106363966
Test set
Precision:  0.5917699768638598
Recall:  0.5924170616113744
F1:  0.58950961958558


In [38]:
#Based on models I will choose best one to tune hyperparameters with Grid Search
#LinearSVC
linearsvc_params = {
    'model__C': [0.1, 0.3, 1.0, 3.0, 10, 30, 100],
}
linearcsv_pipeline = Pipeline([
    ('preprocess', preprocessing_pipeline),
    ('model', LinearSVC(max_iter=3000)),
])
linearsvc = GridSearchCV(linearcsv_pipeline, linearsvc_params, cv=6)
linearsvc.fit(train_X, train_y)



GridSearchCV(cv=6,
             estimator=Pipeline(steps=[('preprocess',
                                        Pipeline(steps=[('delete_links',
                                                         PreprocessText()),
                                                        ('tfidf_vectorizer',
                                                         TfidfVectorizer(stop_words='english'))])),
                                       ('model', LinearSVC(max_iter=3000))]),
             param_grid={'model__C': [0.1, 0.3, 1.0, 3.0, 10, 30, 100]})

In [39]:
print(linearsvc.best_params_) 

{'model__C': 1.0}


In [40]:
evaluate_data(linearsvc.predict(train_X), train_y)

Precision:  0.9707635500567147
Recall:  0.9704060062686785
F1:  0.9704690408647652


In [41]:
evaluate_data(linearsvc.predict(test_X), test_y)

Precision:  0.8005391151410735
Recall:  0.7962085308056872
F1:  0.7978000942910557


In [43]:
#As you can see RandomForest and DecisionTree got overfitted, so it could be good idea to try it with capped depth
#Will use randomized search because of larger number of params
randomforest_params = {
    'model__n_estimators': [1, 3, 10, 30, 100, 300],
    'model__max_depth': [6, 8, 12, 16, 20, 24],
    'model__min_samples_split': [2, 3, 4, 5],
    'model__min_samples_leaf': [1, 2, 3, 4],
}
randomforest_pipeline = Pipeline([
    ('preprocess', preprocessing_pipeline),
    ('model', RandomForestClassifier()),
])
randomforest = RandomizedSearchCV(randomforest_pipeline, randomforest_params, cv=6, n_iter=25)
randomforest.fit(train_X, train_y)

print(randomforest.best_params_) 
evaluate_data(randomforest.predict(train_X), train_y)
evaluate_data(randomforest.predict(test_X), test_y)

{'model__n_estimators': 100, 'model__min_samples_split': 2, 'model__min_samples_leaf': 3, 'model__max_depth': 24}
Precision:  0.8711910535370904
Recall:  0.5781033603032291
F1:  0.6523295820628661


  _warn_prf(average, modifier, msg_start, len(result))


Precision:  0.8412621817067512
Recall:  0.5613480779357557
F1:  0.6238626987408531
