In [181]:
import pandas as pd
import numpy as np
import spacy
import re
import emoji

EXTERNAL_DATA_COUNT = 30000
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

#### Read labeled data from Stanford sentiment140 dataset

In [182]:
df = pd.read_csv("sentiment140.csv", 
                 names=['target', 'id', 'date', 'flag', 'user', 'text'],
                 encoding='latin-1')
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


This shows that values 0 and 4 exists for target, which symbolizes sentiment. 
From dataset description, 0 is neg and 4 is pos

In [183]:
df = df[['target','text']].copy()

The original dataset is very big (1.6 million entries). We have decided to reduce the dataset to <insert number> entries, with equal number of positive and negative entries. 

In [184]:
df1 = pd.DataFrame(df)
neg_df = df[df['target'] == 0].head(EXTERNAL_DATA_COUNT)
pos_df = df1[df1['target'] == 4].head(EXTERNAL_DATA_COUNT)
frames = [neg_df, pos_df]
df = pd.concat(frames)

Now, read manually labeled data from COVID topic

In [185]:
df_manual = pd.read_csv("manually_label_samples_labeled.csv")

#label the polarity of each entry for choosing entries with sentimentt
df_manual.loc[df_manual.sentiment==-1,'sentiment']=4
df_manual.loc[df_manual.sentiment==0,'polarity']=1
df_manual.loc[df_manual.sentiment==4,'polarity']=1
df_manual.loc[df_manual.sentiment==2,'polarity']=0
df_manual = df_manual.loc[df_manual.polarity == 1]

df_manual = df_manual[['text', 'sentiment']].copy()
frames = [df, df_manual]
df = pd.concat(frames)

df.loc[df.sentiment==0,'target']=0
df.loc[df.sentiment==4,'target']=4
df.loc[df.sentiment==2,'target']=2

df = df.drop('sentiment', axis = 1)
df.head()

Unnamed: 0,target,text
0,0.0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0.0,is upset that he can't update his Facebook by ...
2,0.0,@Kenichan I dived many times for the ball. Man...
3,0.0,my whole body feels itchy and like its on fire
4,0.0,"@nationwideclass no, it's not behaving at all...."


### Preprocessing

First, regular expression to remove links and @s

Next, spelling reduction (as stanford group proj, reduce huuuungry to huungry)

Then, microtext normalization using netlinguo dictionary and emoji library

Finally, lemmatization

In [186]:
netlingo = pd.read_csv("netlingo.csv")
#This netlingo csv is crawled from the netlinguo webpage

netlingo_dict = dict(zip(netlingo.abbr, netlingo.meaning))

In [187]:
#lemmatization using spaCY
spacy_nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
#Try to use GPU for faster processing
gpu_preference = spacy.prefer_gpu()

In [188]:
def preprocess(text):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    prev_char = ' '
    curr_char = ' '
    consecutive_char_count = 0
    position = 0
    
    text = emoji.demojize(text)
    text = text.replace(":"," ")
    text = ' '.join(text.split())
    
    for token in text.split():
        if token in netlingo_dict:
            token = netlingo_dict.get(token)
            #lemma here
            for t in token: 
                doc = spacy_nlp(t)
                tokens.append([t.lemma_ for t in doc][0])
        else:
            for char in token: 
                curr_char = char
                if curr_char == prev_char:
                    consecutive_char_count += 1
                else:
                    consecutive_char_count = 0
                position += 1
                prev_char = char
                if consecutive_char_count > 1:
                    position -= 1
                    token =  token[:position] + token[position+1:] #remove char
                    continue
            #lemma here
            doc = spacy_nlp(token)
            tokens.append([token.lemma_ for token in doc][0])
            
            
    return " ".join(tokens)

In [189]:
%%time
df.text = df.text.apply(preprocess)

Wall time: 16min 16s


In [190]:
df

Unnamed: 0,target,text
0,0.0,aww that s a bummer you shoulda get david carr...
1,0.0,be upset that he can t update his facebook by ...
2,0.0,I dive many time for the ball manage to save 5...
3,0.0,my whole body feel itchy and like its on fire
4,0.0,no it s not behave at all I m mad why be I her...
...,...,...
989,4.0,he s active again lol finally tapo na quaranti...
992,0.0,I just write this sentence when stalin say a m...
994,0.0,work from home be fun until you re generate yo...
998,4.0,africafactszone I m defend the corruption sa p...


### Split training data and test data

In [231]:
from sklearn.model_selection import train_test_split

X = list(df['text'])
y = list(df['target'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

### Create vectorizer using n-grams (n = 1, 2)

In [232]:
from sklearn.feature_extraction.text import CountVectorizer

# create a bag of words for only unigrams and bigrams
cv = CountVectorizer(analyzer = 'word',ngram_range=(1,2))

# convert training data to bag of words
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

### Multinomial Naive Bayes classifier training

In [233]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

# train naive bayes classifier
clf_nb = MultinomialNB()
clf_nb.fit(X_train_cv, y_train)

# create predictions
y_pred = clf_nb.predict(X_test_cv)

# find f-1 score
score = f1_score(y_test, y_pred, average='micro')
print('F-1 score : {}'.format(np.round(score,4)))

F-1 score : 0.7695


In [234]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(y_test, y_pred, average='macro')

(0.7726536262362811, 0.7691330779480346, 0.7686461435793841, None)

### Logistic Regression (MaxEnt) classifier training

In [235]:
from sklearn.linear_model import LogisticRegression

# train maxent (logistic)
clf_lr = LogisticRegression(random_state=0, max_iter=300)
clf_lr.fit(X_train_cv, y_train)

# create predictions
y_pred = clf_lr.predict(X_test_cv)

# find f-1 score
score = f1_score(y_test, y_pred, average='micro')
print('F-1 score : {}'.format(np.round(score,4)))

F-1 score : 0.7801


In [236]:
precision_recall_fscore_support(y_test, y_pred, average='macro')

(0.7803899124815148, 0.7802281812125249, 0.7801201381993648, None)

Now that training of both classifiers are done, we should move on to test the classifiers

### Test using labeled data that are used in training

To examine the classifiers on sentiment classification only, we only choose the data entries with sentiment. Subjectivity classifiers are examined in another notebook

In [237]:
#Apply preprocessing on test data
df_manual.text = df_manual.text.apply(preprocess)

In [238]:
df_manual

Unnamed: 0,text,sentiment
0,yes in the right circumstance there be a case ...,0
3,currently down with covid admin two neighbour ...,0
6,why be that wretche woman dlaminizuma on tv bl...,0
7,while the number of people hospitalize with co...,4
9,tomw18105328 kamvtv laugh in pandemic shut down,0
...,...,...
989,he s active again lol finally tapo na quaranti...,4
992,I just write this sentence when stalin say a m...,0
994,work from home be fun until you re generate yo...,0
998,africafactszone I m defend the corruption sa p...,4


#### Multinomial NB test

In [239]:
test = list(df_manual['text'])
ref = list(df_manual['sentiment'])
test_cv = cv.transform(test)
pred = clf_nb.predict(test_cv)

#df_test['predicted_polarity'] = pred
score = f1_score(ref, pred, average='micro')
print('F-1 score : {}'.format(np.round(score,4)))

F-1 score : 0.9139


In [240]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
print(classification_report(ref, pred))

              precision    recall  f1-score   support

           0       0.90      0.97      0.93       266
           4       0.94      0.82      0.87       152

    accuracy                           0.91       418
   macro avg       0.92      0.89      0.90       418
weighted avg       0.92      0.91      0.91       418



#### Logistic Regression test

In [241]:
pred = clf_lr.predict(test_cv)

#df_test['predicted_polarity'] = pred
score = f1_score(ref, pred, average='micro')
print('F-1 score : {}'.format(np.round(score,4)))

F-1 score : 0.9282


In [242]:
print(classification_report(ref, pred))

              precision    recall  f1-score   support

           0       0.93      0.96      0.94       266
           4       0.92      0.88      0.90       152

    accuracy                           0.93       418
   macro avg       0.93      0.92      0.92       418
weighted avg       0.93      0.93      0.93       418



### Test using labeled data that are not used in training

In [243]:
df5 = pd.read_json("manually_label_samples_test_labeled.json", orient='record')

df5.loc[df5.sentiment==-1,'sentiment']=4
df5.loc[df5.sentiment==0,'polarity']=1
df5.loc[df5.sentiment==4,'polarity']=1
df5.loc[df5.sentiment==2,'polarity']=0

df5 = df5.loc[df5.polarity == 1]
df5.text = df5.text.apply(preprocess)

#### Multinomial NB test

In [244]:
test = list(df5['text'])
ref = list(df5['sentiment'])
test_cv = cv.transform(test)
pred = clf_nb.predict(test_cv)

#df5['predicted_polarity'] = pred
score = f1_score(ref, pred, average='micro')
print('F-1 score : {}'.format(np.round(score,4)))

F-1 score : 0.7244


In [245]:
print(classification_report(ref, pred))

              precision    recall  f1-score   support

           0       0.78      0.82      0.80       104
           4       0.60      0.54      0.57        52

    accuracy                           0.72       156
   macro avg       0.69      0.68      0.68       156
weighted avg       0.72      0.72      0.72       156



#### Logistic Regression test

In [246]:
pred = clf_lr.predict(test_cv)

#df5['predicted_polarity'] = pred
score = f1_score(ref, pred, average='micro')
print('F-1 score : {}'.format(np.round(score,4)))

F-1 score : 0.7115


In [247]:
print(classification_report(ref, pred))

              precision    recall  f1-score   support

           0       0.80      0.75      0.78       104
           4       0.56      0.63      0.59        52

    accuracy                           0.71       156
   macro avg       0.68      0.69      0.69       156
weighted avg       0.72      0.71      0.72       156



### Test using every labeled data in COVID topic

In [248]:
frames = [df_manual, df5]
df_test = pd.concat(frames)
df_test

Unnamed: 0,text,sentiment,created_at,lang,author_id,id,keyword,geo,polarity
0,yes in the right circumstance there be a case ...,0,NaT,,,,,,
3,currently down with covid admin two neighbour ...,0,NaT,,,,,,
6,why be that wretche woman dlaminizuma on tv bl...,0,NaT,,,,,,
7,while the number of people hospitalize with co...,4,NaT,,,,,,
9,tomw18105328 kamvtv laugh in pandemic shut down,0,NaT,,,,,,
...,...,...,...,...,...,...,...,...,...
377,greet stupid play with my human be a extreme e...,0,2022-04-02 15:52:57+00:00,en,1.429572e+18,1.510284e+18,[lockdown],,1.0
380,dailymailuk live with covid mean accept some s...,0,2022-03-30 08:20:01+00:00,en,1.482627e+18,1.509083e+18,[social distancing],,1.0
387,happy april avoid allfool standwithukraine sto...,4,2022-04-01 10:59:29+00:00,en,6.172872e+08,1.509848e+18,[#wearamask],,1.0
388,if there be more frequent access to testing me...,0,2022-04-02 06:18:24+00:00,en,1.427558e+18,1.510140e+18,[lockdown],,1.0


#### Multinomial NB test

In [249]:
test = list(df_test['text'])
ref = list(df_test['sentiment'])
test_cv = cv.transform(test)
pred = clf_nb.predict(test_cv)

score = f1_score(ref, pred, average='micro')
print('F-1 score : {}'.format(np.round(score,4)))

F-1 score : 0.8624


In [250]:
print(classification_report(ref, pred))

              precision    recall  f1-score   support

           0       0.87      0.93      0.90       370
           4       0.85      0.75      0.79       204

    accuracy                           0.86       574
   macro avg       0.86      0.84      0.85       574
weighted avg       0.86      0.86      0.86       574



#### Logistic Regression test

In [251]:
pred = clf_lr.predict(test_cv)

#df5['predicted_polarity'] = pred
score = f1_score(ref, pred, average='micro')
print('F-1 score : {}'.format(np.round(score,4)))

F-1 score : 0.8693


In [252]:
print(classification_report(ref, pred))

              precision    recall  f1-score   support

           0       0.90      0.90      0.90       370
           4       0.82      0.81      0.82       204

    accuracy                           0.87       574
   macro avg       0.86      0.86      0.86       574
weighted avg       0.87      0.87      0.87       574



### Now, applying this model to the set for labelling

In [253]:
clean_df = pd.read_csv("classfied_data_pol_final.csv")
clean_df

  clean_df = pd.read_csv("classfied_data_pol_final.csv")


Unnamed: 0,created_at,author_id,text,id,lang,keyword,geo,withheld,polarity
0,2022-04-02 17:54:23+00:00,15066159,march#to#june#will#be#a#bit#sparse#for#sightin...,1510314716178046976,en,"['#lockdown', 'lockdown']",,,1.0
1,2022-04-02 17:47:49+00:00,292619181,alan#interview#prime#minister#boris#johnson#an...,1510313065589481472,en,"['#lockdown', 'lockdown']",,,0.0
2,2022-04-02 17:44:12+00:00,15720519,how#to#talk#to#your#kid#about#school#shooting#...,1510312154284630016,en,"['#lockdown', 'lockdown']",,,0.0
3,2022-04-02 17:42:46+00:00,1280441028064018432,fat#loss#in#a#week#weight#loss#for#free#traini...,1510311792698101760,en,"['#lockdown', 'lockdown']",,,0.0
4,2022-04-02 17:34:58+00:00,1449377983211450368,make#a#name#get#a#check#merchandise#royality#d...,1510309831005069312,en,"['#lockdown', 'lockdown']",,,0.0
...,...,...,...,...,...,...,...,...,...
66857,2022-04-02 05:00:19+00:00,1307968835694125056,goddessphotosau#sure#do#a#new#hobby#come#out#o...,1510119915801309184,en,['lockdown'],,,1.0
66858,2022-04-02 05:00:01+00:00,2269577210,5#determine#the#strategy#for#pandemic#manageme...,1510119841281228800,en,['lockdown'],,,0.0
66859,2022-04-02 04:59:59+00:00,3493412003,an#old#story#but#think#about#this#again#now#th...,1510119833366675456,en,['lockdown'],,,1.0
66860,2022-04-02 04:59:53+00:00,1301866054839185408,tax#oz#even#then#his#wa#covid#0#forever#lockdo...,1510119806930145280,en,['lockdown'],,,1.0


In [254]:
clean_df.loc[clean_df.polarity==0,'sentiment']=2
clean_df.loc[clean_df.polarity==1,'sentiment']=-1

input_df = clean_df.loc[clean_df.polarity == 1]
neutral_df = clean_df.loc[clean_df.polarity == 0]

input_list = list(input_df['text'])
input_cv = cv.transform(input_list)
pred = clf_nb.predict(input_cv)

sentiment = pred.tolist()
input_df = input_df.assign(sentiment = sentiment)
input_df

Unnamed: 0,created_at,author_id,text,id,lang,keyword,geo,withheld,polarity,sentiment
0,2022-04-02 17:54:23+00:00,15066159,march#to#june#will#be#a#bit#sparse#for#sightin...,1510314716178046976,en,"['#lockdown', 'lockdown']",,,1.0,4.0
6,2022-04-02 17:20:37+00:00,1193612999803723776,I#recommend#everyone#to#watch#v#for#vendetta#i...,1510306220040761344,en,"['#lockdown', '#quarantine', 'coronavirus', 'q...",,,1.0,0.0
8,2022-04-02 17:01:41+00:00,1327891543596953600,shanghai#begin#second#stage#of#citywide#lockdo...,1510301454657110016,en,"['#lockdown', 'lockdown']",,,1.0,0.0
9,2022-04-02 16:52:31+00:00,77123276,this#be#beyond#outrageous#those#poor#baby#shan...,1510299149236940800,en,"['#lockdown', 'lockdown']",,,1.0,0.0
10,2022-04-02 16:47:03+00:00,1025960188057214976,child#concentration#camp#in#shanghai#these#chi...,1510297771739430912,en,"['#lockdown', 'lockdown']",,,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
66854,2022-04-02 05:00:59+00:00,26940008,the#crisis#be#the#direct#result#of#covid#measu...,1510120083724673024,en,['lockdown'],,,1.0,0.0
66856,2022-04-02 05:00:24+00:00,1464991420561899520,big#april#fool#day#joke#be#the#6ixbuzz#post#ab...,1510119938492710912,en,['lockdown'],,,1.0,0.0
66857,2022-04-02 05:00:19+00:00,1307968835694125056,goddessphotosau#sure#do#a#new#hobby#come#out#o...,1510119915801309184,en,['lockdown'],,,1.0,0.0
66859,2022-04-02 04:59:59+00:00,3493412003,an#old#story#but#think#about#this#again#now#th...,1510119833366675456,en,['lockdown'],,,1.0,0.0


In [255]:
frame = [input_df, neutral_df]
output_df = pd.concat(frame)
output_df

Unnamed: 0,created_at,author_id,text,id,lang,keyword,geo,withheld,polarity,sentiment
0,2022-04-02 17:54:23+00:00,15066159,march#to#june#will#be#a#bit#sparse#for#sightin...,1510314716178046976,en,"['#lockdown', 'lockdown']",,,1.0,4.0
6,2022-04-02 17:20:37+00:00,1193612999803723776,I#recommend#everyone#to#watch#v#for#vendetta#i...,1510306220040761344,en,"['#lockdown', '#quarantine', 'coronavirus', 'q...",,,1.0,0.0
8,2022-04-02 17:01:41+00:00,1327891543596953600,shanghai#begin#second#stage#of#citywide#lockdo...,1510301454657110016,en,"['#lockdown', 'lockdown']",,,1.0,0.0
9,2022-04-02 16:52:31+00:00,77123276,this#be#beyond#outrageous#those#poor#baby#shan...,1510299149236940800,en,"['#lockdown', 'lockdown']",,,1.0,0.0
10,2022-04-02 16:47:03+00:00,1025960188057214976,child#concentration#camp#in#shanghai#these#chi...,1510297771739430912,en,"['#lockdown', 'lockdown']",,,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
66844,2022-04-02 05:02:50+00:00,350772690,more#than#l#o#c#a#t#I#o#n#000#banker#and#trade...,1510120548398891008,en,['lockdown'],,,0.0,2.0
66851,2022-04-02 05:01:15+00:00,1280478739793448960,how#the#key#of#sushant#s#room#go#miss#how#they...,1510120151978369024,en,['lockdown'],,,0.0,2.0
66855,2022-04-02 05:00:48+00:00,1314347140450193408,nmhrkssr#itsselfy#itsssr#as#per#I#know#1#ssr#p...,1510120039143276544,en,['lockdown'],,,0.0,2.0
66858,2022-04-02 05:00:01+00:00,2269577210,5#determine#the#strategy#for#pandemic#manageme...,1510119841281228800,en,['lockdown'],,,0.0,2.0


In [256]:
#output_df.to_json("classified_twitter_data.json", orient='records', indent=2)

In [257]:
sentiment_list = output_df.sentiment.tolist()

In [258]:
clean_df = pd.read_json("cleaned_data.json", orient='record')
clean_df = clean_df.assign(sentiment = sentiment_list)
clean_df

Unnamed: 0,created_at,author_id,text,id,lang,keyword,geo,withheld,sentiment
0,2022-04-02 17:54:23+00:00,15066159,@andy_goodey March to June will be a bit spars...,1510314716178046976,en,"[#lockdown, lockdown]",,,4.0
1,2022-04-02 17:47:49+00:00,292619181,Alan interviews Prime Minister Boris Johnson. ...,1510313065589481472,en,"[#lockdown, lockdown]",,,0.0
2,2022-04-02 17:44:12+00:00,15720519,How To Talk To Your Kids About School Shooting...,1510312154284630016,en,"[#lockdown, lockdown]",,,0.0
3,2022-04-02 17:42:46+00:00,1280441028064018432,FAT LOSS IN a WEEK weight loss\n FOR FREE TRA...,1510311792698101760,en,"[#lockdown, lockdown]",,,0.0
4,2022-04-02 17:34:58+00:00,1449377983211450368,Making a name get a check merchandise ROYALITY...,1510309831005069312,en,"[#lockdown, lockdown]",,,0.0
...,...,...,...,...,...,...,...,...,...
66857,2022-04-02 05:00:19+00:00,1307968835694125056,@RainMorgan33 @Goddessphotosau Sure did! A new...,1510119915801309184,en,[lockdown],,,2.0
66858,2022-04-02 05:00:01+00:00,2269577210,@CBSNews 5/ determining the strategies for pan...,1510119841281228800,en,[lockdown],,,2.0
66859,2022-04-02 04:59:59+00:00,3493412003,"An old story, but thinking about this again no...",1510119833366675456,en,[lockdown],,,2.0
66860,2022-04-02 04:59:53+00:00,1301866054839185408,@barbara32805432 @tax_oz Even then his wa covi...,1510119806930145280,en,[lockdown],,,2.0


In [259]:
pd.set_option('display.max_columns', None)
clean_df.text[2]

'How To Talk To Your Kids About School Shootings - Advice Offered By Psychologist And Award Winning Selfie Film Maker Barbara Becker Holstein https://t.co/OCPJq1Rset #bullying #lockdown https://t.co/k5WEyGv2kQ'

In [260]:
clean_df.to_json("classified_twitter_data_final.json", orient='records', indent=2)