In [42]:
#Imports
import numpy as np
import pandas as pd
import re

from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

In [2]:
#Settings
#Directory holding the dataset CSV files. The trailing slash is required because the
#file names are appended to this value by plain string concatenation when loading.
dataset_path = "/kaggle/input/covid-19-nlp-text-classification/"

In [3]:
#Load dataset
#Both CSVs share the same location and encoding, so read them in one pass:
#first the train split, then the test split
train_dataset, test_dataset = [
    pd.read_csv(dataset_path + file_name, encoding="ISO-8859-1")
    for file_name in ("Corona_NLP_train.csv", "Corona_NLP_test.csv")
]

In [4]:
#Preview the training frame read from Corona_NLP_train.csv (shown as the cell's output)
train_dataset

In [5]:
#Only the tweet text is used as model input; the Sentiment column is the target
train_X, train_y = train_dataset["OriginalTweet"], train_dataset["Sentiment"]
test_X, test_y = test_dataset["OriginalTweet"], test_dataset["Sentiment"]

In [6]:
#Distinct values that occur in the training target
print({*train_y})

In [7]:
#For task purpose we need only Positive, Neutral and Negative, let's exchange labels to numbers
#I will create dictionary for this
#The two "Extremely ..." classes are folded into their plain counterparts
label2num = { 'Positive': 2, 'Extremely Positive': 2, 'Neutral': 1, 'Extremely Negative': 0, 'Negative': 0 }
#Reverse lookup from class number back to a readable name
num2label = { 0: 'Negative', 1:'Neutral', 2:'Positive' }

#Apply dict
#Map from the raw Sentiment columns rather than from train_y/test_y, so re-running this cell
#does not try to look up already-converted integers (which would raise KeyError).
#A label missing from label2num still raises KeyError instead of being silently dropped.
train_y = [label2num[x] for x in train_dataset["Sentiment"]]
test_y = [label2num[x] for x in test_dataset["Sentiment"]]

In [10]:
#Example tweet followed by its numeric label, one per line
print(train_X[4], train_y[4], sep="\n")

In [9]:
#Sizes of train and test sets
for size_report in (("Train set size: ", len(train_X)), ("Test set size: ", len(test_X))):
    print(*size_report)

In [28]:
#Lets create some preprocessing pipeline

#Create custom one for various preprocessings with text
class PreprocessText(BaseEstimator, TransformerMixin):
    """Stateless transformer that strips links from raw text documents.

    It has no hyperparameters and learns nothing from the data; ``fit`` exists
    only so the object can be used as a pipeline step.
    """

    #Compiled once at class creation: 'http' plus every non-whitespace character that follows it
    _LINK_PATTERN = re.compile(r'http\S+')

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Return a new list with links removed from every document in ``X``.

        ``X`` may be any iterable of strings (list, tuple, pandas Series,
        generator). The input is only read, never modified, so no defensive
        copy is taken. ``y`` is accepted for API compatibility and ignored.
        """
        return [self._LINK_PATTERN.sub('', text) for text in X] #Delete links

#Everything else (lowercasing, tokenising, English stop-word removal) is left to the TF-IDF vectorizer
#Preprocessing pipeline: strip links first, then turn the cleaned text into TF-IDF features
preprocessing_pipeline = Pipeline(steps=[
    ('delete_links', PreprocessText()),
    (
        'tfidf_vectorizer',
        TfidfVectorizer(stop_words='english', lowercase=True, ngram_range=(1, 1)),
    ),
])

In [29]:
#Candidate classifiers to compare, as (display name, estimator) pairs
models = [
    ("MultinomialNB", MultinomialNB()),
    ("LogisticRegression", LogisticRegression(solver='liblinear')),
    ("LinearSVC", LinearSVC()),
    ("KNeighbors", KNeighborsClassifier()),
    ("DecisionTree", DecisionTreeClassifier()),
    ("RandomForest", RandomForestClassifier()),
]

In [23]:
#Evaluate function which prints some crucial metrics
def evaluate_data(predicted, true, avg="weighted"):
    """Print precision, recall and F1 of predictions against the ground truth.

    Parameters
    ----------
    predicted : array-like
        Labels produced by the model.
    true : array-like
        Ground-truth labels, aligned with ``predicted``.
    avg : str, default "weighted"
        Forwarded as ``average`` to the scikit-learn metric functions.
    """
    #scikit-learn metrics expect (y_true, y_pred). This function receives the predictions
    #first, so the arguments are passed by keyword: handing them over in the wrong order
    #would exchange precision with recall and weight classes by predicted counts.
    print("Precision: ", precision_score(y_true=true, y_pred=predicted, average=avg))
    print("Recall: ", recall_score(y_true=true, y_pred=predicted, average=avg))
    print("F1: ", f1_score(y_true=true, y_pred=predicted, average=avg))

In [30]:
#Check different models: fit each candidate behind the shared preprocessing and report both splits
for name, model in models:
    #Pipeline.fit returns the pipeline itself, so it can be built and fitted in one statement
    text_preprocess_train = Pipeline(steps=[
        ('preprocess', preprocessing_pipeline),
        (name, model),
    ]).fit(train_X, train_y)
    print(name, "Train set", sep="\n")
    evaluate_data(text_preprocess_train.predict(train_X), train_y)
    print("Test set")
    evaluate_data(text_preprocess_train.predict(test_X), test_y)
    print("\n")

In [38]:
#Tune the hyperparameters of the model picked from the comparison with Grid Search
#LinearSVC: search over the regularisation parameter C of the 'model' pipeline step
linearsvc_params = dict(model__C=[0.1, 0.3, 1.0, 3.0, 10, 30, 100])
linearcsv_pipeline = Pipeline(steps=[
    ('preprocess', preprocessing_pipeline),
    ('model', LinearSVC(max_iter=3000)),
])
linearsvc = GridSearchCV(estimator=linearcsv_pipeline, param_grid=linearsvc_params, cv=6)
linearsvc.fit(train_X, train_y)

In [39]:
#Parameter combination selected by the LinearSVC grid search
print(linearsvc.best_params_) 

In [40]:
#Metrics of the fitted grid search on the training tweets
evaluate_data(linearsvc.predict(train_X), train_y)

In [41]:
#Metrics of the fitted grid search on the test tweets
evaluate_data(linearsvc.predict(test_X), test_y)

In [43]:
#As you can see RandomForest and DecisionTree got overfitted, so it could be a good idea to try it with capped depth
#Will use randomized search because of larger number of params

#Fixed seed: without it both the sampled parameter candidates and the forests themselves
#change on every run, so the reported best parameters and metrics would not be reproducible
RANDOM_STATE = 42

#Search space; the 'model__' prefix routes each entry to the 'model' step of the pipeline
randomforest_params = {
    'model__n_estimators': [1, 3, 10, 30, 100, 300],
    'model__max_depth': [6, 8, 12, 16, 20, 24],
    'model__min_samples_split': [2, 3, 4, 5],
    'model__min_samples_leaf': [1, 2, 3, 4],
}
randomforest_pipeline = Pipeline([
    ('preprocess', preprocessing_pipeline),
    ('model', RandomForestClassifier(random_state=RANDOM_STATE)),
])
#Try 25 sampled combinations with 6-fold cross-validation
randomforest = RandomizedSearchCV(
    randomforest_pipeline,
    randomforest_params,
    cv=6,
    n_iter=25,
    random_state=RANDOM_STATE,
)
randomforest.fit(train_X, train_y)

print(randomforest.best_params_)
#Label the two reports the same way the model comparison loop does
print("Train set")
evaluate_data(randomforest.predict(train_X), train_y)
print("Test set")
evaluate_data(randomforest.predict(test_X), test_y)