# Binary classification of COVID-19 tweets using tf-idf vectorization & PCA

## Import modules & data

In [1]:
import pandas as pd
import numpy as np
import re
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, PCA, TruncatedSVD, FastICA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, KFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\Sunday
[nltk_data]     Okechukwu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
tweets = pd.read_csv('COVID19_Dataset-text_labels_only.csv')

In [3]:
tweets

Unnamed: 0,Is_Unreliable,Category,Tweet
0,1,"1, 3, 6, 9",We are living in scary times in Canada. Gov’t ...
1,1,"1, 6, 8, 9","Just as bad in Canada. In fact, our government..."
2,1,"1, 4, 9",It was only a matter of time before the mainst...
3,1,"6, 8",Russia's taking no chances: Foreigners infecte...
4,1,"6, 8, 9",Although there is now a presumptive confirmed ...
...,...,...,...
555,0,,BREAKING: Harvard classes will move online sta...
556,0,,Singularity University is hosting a FREE Virtu...
557,0,,Coronavirus: how does it spread and what are t...
558,0,,Stanford just cancelled classes for the rest o...


## Clean & normalize text

In [4]:
# This is a standard pre-processing function
def clean_text(str_list, lemmatize = True):
    clean_list = []
    
    for text in str_list:
        # to drop pound sign from hash tags
        text = re.sub(r'#', '', text)
        words = word_tokenize(text)
        clean_words = []
        
        for word in words:            
            # drop words with fewer than 2 characters; drop any punctuation "words"
            if (len(word) > 1) and (re.match(r'^\w+$', word)):

                if lemmatize==True:
                    lemmatizer=WordNetLemmatizer()
                    word1 = lemmatizer.lemmatize(word)
                    #if word!=word1:
                        #print(word1)
                        #print(word)
                    
                clean_words.append(word1)
        clean_text = ' '.join(clean_words)
        clean_list.append(clean_text)
    
    return clean_list

In [5]:
# run on all tweets
tweets['clean_tweet'] = clean_text(tweets['Tweet'])

In [6]:
tweets

Unnamed: 0,Is_Unreliable,Category,Tweet,clean_tweet
0,1,"1, 3, 6, 9",We are living in scary times in Canada. Gov’t ...,We are living in scary time in Canada Gov refu...
1,1,"1, 6, 8, 9","Just as bad in Canada. In fact, our government...",Just a bad in Canada In fact our government is...
2,1,"1, 4, 9",It was only a matter of time before the mainst...,It wa only matter of time before the mainstrea...
3,1,"6, 8",Russia's taking no chances: Foreigners infecte...,Russia taking no chance Foreigners infected wi...
4,1,"6, 8, 9",Although there is now a presumptive confirmed ...,Although there is now presumptive confirmed ca...
...,...,...,...,...
555,0,,BREAKING: Harvard classes will move online sta...,BREAKING Harvard class will move online starti...
556,0,,Singularity University is hosting a FREE Virtu...,Singularity University is hosting FREE Virtual...
557,0,,Coronavirus: how does it spread and what are t...,Coronavirus how doe it spread and what are the...
558,0,,Stanford just cancelled classes for the rest o...,Stanford just cancelled class for the rest of ...


In [7]:
tweets['clean_tweet'][5]

'hooray finally there is propaganda banner telling people they can make baby if getting bored staying home coronavirus'

## Instantiate vectorizers - tf-idf

In [8]:
tfidf = TfidfVectorizer(lowercase = True,
                        stop_words = 'english',
                        ngram_range = (1,1))

## Instantiate topic modelers - NMF, truncatedSVD, PCA, and ICA

In [9]:
pca = PCA()

# number of components (topics) to try:
ncomps = [50, 75, 100]

## Create sparse-to-dense transformer

In [10]:
# inspiration taken from this StackOverflow post:
# https://stackoverflow.com/questions/28384680/scikit-learns-pipeline-a-sparse-matrix-was-passed-but-dense-data-is-required
# and this TowardsDataScience post:
# https://towardsdatascience.com/custom-transformers-and-ml-data-pipelines-with-python-20ea2a7adb65

from sklearn.base import TransformerMixin

In [11]:
class SparseToDense(TransformerMixin):
    
    def fit(self, X, y = None, **fit_params):
        return self
    
    def transform(self, X, y = None, **fit_params):
        return X.toarray()

## Set up CV

In [12]:
# pull out relevant data
X = tweets['clean_tweet']
y = tweets['Is_Unreliable']

X_count = tfidf.fit_transform(X)
#print(X_count)

In [13]:
# create pipeline
pipe = Pipeline([
    ('vectorize', tfidf),
    ('densify', SparseToDense()),
    ('scale', StandardScaler()),
    ('dim_red', pca),
    ('classify', SVC())
])

In [14]:
# SVC hyperparams to optimize
kernel = ['rbf', 'linear', 'poly', 'sigmoid']
C = [0.001, 0.01, 0.1, 1, 10]
# set up parameter grid
params = {
    'dim_red__n_components': ncomps,
    'classify__kernel': kernel,
    'classify__C': C
}

In [15]:
# Set CV scheme for inner and outer loops
inner_cv = KFold(n_splits = 3, shuffle = True, random_state = 1)
outer_cv = KFold(n_splits = 5, shuffle = True, random_state = 1)

# Set up GridSearch for inner loop
grid_SVC = GridSearchCV(pipe, params, cv = inner_cv)
#grid_SVC.fit(X, y)

# Nested CV scores
scores = cross_validate(grid_SVC,
                        X = X,
                        y = y,
                        cv = outer_cv,
                        scoring = ['roc_auc', 'accuracy', 'f1', 'precision', 'recall'],
                        return_estimator = True)
auc = scores['test_roc_auc']
accuracy = scores['test_accuracy']
f1 = scores['test_f1']
precision = scores['test_precision']
recall = scores['test_recall']
estimators = scores['estimator']

In [16]:
print(accuracy)
accuracy.mean()

print(precision)
precision.mean()

print(recall)
recall.mean()

print(f1)
f1.mean()

[0.72321429 0.77678571 0.70535714 0.74107143 0.69642857]
[0.73584906 0.7254902  0.71428571 0.81632653 0.70491803]
[0.69642857 0.77083333 0.70175439 0.66666667 0.72881356]
[0.71559633 0.74747475 0.7079646  0.73394495 0.71666667]


0.724329460062999

In [17]:
for i in estimators:
    print(i.best_params_)
    print('\n')

{'classify__C': 0.01, 'classify__kernel': 'linear', 'dim_red__n_components': 75}


{'classify__C': 10, 'classify__kernel': 'linear', 'dim_red__n_components': 50}


{'classify__C': 10, 'classify__kernel': 'linear', 'dim_red__n_components': 75}


{'classify__C': 1, 'classify__kernel': 'sigmoid', 'dim_red__n_components': 100}


{'classify__C': 0.1, 'classify__kernel': 'linear', 'dim_red__n_components': 50}


