In [185]:
import pandas as pd
import numpy as np
import spacy
import re
import emoji

EXTERNAL_DATA_COUNT = 750
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

#### Read labeled data from IMDB rotten tomato movie review dataset

In [186]:
df = pd.read_csv("complete10000.csv")
df

Unnamed: 0,text,polarity
0,b'skip work to see it at the first opportunity...,1
1,when raj's family move to england to get invol...,0
2,the clues are few and time is running out for ...,0
3,"b""fessenden continues to do interesting work ,...",1
4,"throughout this time , their lives have been e...",0
...,...,...
9995,"b""has enough wit , energy and geniality to ple...",1
9996,patricia confides to her good-hearted maid mar...,0
9997,today the mighty trident ssbns form a deterren...,0
9998,"this first film , a deliberately allegorical v...",0


In [187]:
df1 = df
neg_df = df[df['polarity'] == 0].head(EXTERNAL_DATA_COUNT)
pos_df = df1[df1['polarity'] == 1].head(EXTERNAL_DATA_COUNT)
pos_df

Unnamed: 0,text,polarity
0,b'skip work to see it at the first opportunity...,1
3,"b""fessenden continues to do interesting work ,...",1
5,"b""as predictable as the outcome of a globetrot...",1
7,"b'together , miller , kuras and the actresses ...",1
10,"b""a lousy movie that's not merely unwatchable ...",1
...,...,...
1494,"b""it's a lovely film with lovely performances ...",1
1496,"b"" . . . with the candy-like taste of it fadin...",1
1497,"b""i loved looking at this movie . i just didn'...",1
1498,b'graced with the kind of social texture and r...,1


In [188]:
def remove_b(text):
    text = str(text)[2:]
    return text
pos_df.text = pos_df.text.apply(remove_b)
pos_df

Unnamed: 0,text,polarity
0,skip work to see it at the first opportunity ....,1
3,"fessenden continues to do interesting work , a...",1
5,as predictable as the outcome of a globetrotte...,1
7,"together , miller , kuras and the actresses ma...",1
10,"a lousy movie that's not merely unwatchable , ...",1
...,...,...
1494,it's a lovely film with lovely performances by...,1
1496,. . . with the candy-like taste of it fading ...,1
1497,i loved looking at this movie . i just didn't ...,1
1498,graced with the kind of social texture and rea...,1


In [189]:
frames = [neg_df, pos_df]
df = pd.concat(frames)

df

Unnamed: 0,text,polarity
1,when raj's family move to england to get invol...,0
2,the clues are few and time is running out for ...,0
4,"throughout this time , their lives have been e...",0
6,"a dark psychological drama , i love your work ...",0
8,after finding the plane and its crew torn to s...,0
...,...,...
1494,it's a lovely film with lovely performances by...,1
1496,. . . with the candy-like taste of it fading ...,1
1497,i loved looking at this movie . i just didn't ...,1
1498,graced with the kind of social texture and rea...,1


Now, read manually labeled data from COVID topic

In [190]:
df_manual = pd.read_csv("manually_label_samples_labeled.csv")

#label the polarity of each entry for choosing entries with sentimentt
df_manual.loc[df_manual.sentiment==-1,'sentiment']=4
df_manual.loc[df_manual.sentiment==0,'polarity']=1
df_manual.loc[df_manual.sentiment==4,'polarity']=1
df_manual.loc[df_manual.sentiment==2,'polarity']=0

df_manual = df_manual[['text', 'polarity']].copy()
frames = [df, df_manual]
df = pd.concat(frames)

df.head()

Unnamed: 0,text,polarity
1,when raj's family move to england to get invol...,0.0
2,the clues are few and time is running out for ...,0.0
4,"throughout this time , their lives have been e...",0.0
6,"a dark psychological drama , i love your work ...",0.0
8,after finding the plane and its crew torn to s...,0.0


### Preprocessing

First, regular expression to remove links and @s

Next, spelling reduction (as stanford group proj, reduce huuuungry to huungry)

Then, microtext normalization using netlinguo dictionary and emoji library

Finally, lemmatization

In [191]:
netlingo = pd.read_csv("netlingo.csv")
#This netlingo csv is crawled from the netlinguo webpage

netlingo_dict = dict(zip(netlingo.abbr, netlingo.meaning))

In [192]:
#lemmatization using spaCY
spacy_nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
#Try to use GPU for faster processing
gpu_preference = spacy.prefer_gpu()

In [193]:
def preprocess(text):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    prev_char = ' '
    curr_char = ' '
    consecutive_char_count = 0
    position = 0
    
    text = emoji.demojize(text)
    text = text.replace(":"," ")
    text = ' '.join(text.split())
    
    for token in text.split():
        if token in netlingo_dict:
            token = netlingo_dict.get(token)
            #lemma here
            for t in token: 
                doc = spacy_nlp(t)
                tokens.append([t.lemma_ for t in doc][0])
        else:
            for char in token: 
                curr_char = char
                if curr_char == prev_char:
                    consecutive_char_count += 1
                else:
                    consecutive_char_count = 0
                position += 1
                prev_char = char
                if consecutive_char_count > 1:
                    position -= 1
                    token =  token[:position] + token[position+1:] #remove char
                    continue
            #lemma here
            doc = spacy_nlp(token)
            tokens.append([token.lemma_ for token in doc][0])
            
            
    return "#".join(tokens)

In [194]:
%%time
df.text = df.text.apply(preprocess)

Wall time: 1min 12s


In [195]:
df

Unnamed: 0,text,polarity
1,when#raj#s#family#move#to#england#to#get#invol...,0.0
2,the#clue#be#few#and#time#be#run#out#for#the#st...,0.0
4,throughout#this#time#their#live#have#be#entwin...,0.0
6,a#dark#psychological#drama#I#love#your#work#ex...,0.0
8,after#find#the#plane#and#its#crew#tear#to#shre...,0.0
...,...,...
995,for#all#those#who#be#new#to#this#work#from#hom...,0.0
996,full#stack#web#developer#remotework#remotejob#wfh,0.0
997,where#will#collaboration#tool#for#hybridwork#g...,0.0
998,africafactszone#I#m#defend#the#corruption#sa#p...,1.0


def whitespace_indicator(text):
    tokens = []
    for token in text.split():
        tokens.append(token)
    
    return "#".join(tokens)
df = df_storage
df.text = df.text.apply(whitespace_indicator)
df

### Split training data and test data

In [196]:
from sklearn.model_selection import train_test_split

X = list(df['text'])
y = list(df['polarity'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

### Create vectorizer using n-grams (n = 2 ~ 5)

Note that we do not remove stop words, as previous experiments show that removing stop words (even with a customized stopword list that preserves negation words) has a small but harmful effect on the results

In [197]:
from sklearn.feature_extraction.text import CountVectorizer

# create a bag of words for n = 2 - 5
cv = CountVectorizer(analyzer = 'char',ngram_range=(2,5))

# convert training data to bag of words
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

### Multinomial Naive Bayes classifier training

In [198]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

# train naive bayes classifier
clf_nb = MultinomialNB()
clf_nb.fit(X_train_cv, y_train)

# create predictions
y_pred = clf_nb.predict(X_test_cv)

# find f-1 score
score = f1_score(y_test, y_pred, average='micro')
print('F-1 score : {}'.format(np.round(score,4)))

F-1 score : 0.7648


In [199]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(y_test, y_pred, average='macro')

(0.7744017066937035, 0.7661108607480847, 0.7632850241545894, None)

### Logistic Regression (MaxEnt) classifier training

In [200]:
from sklearn.linear_model import LogisticRegression

# train maxent (logistic)
clf_lr = LogisticRegression(random_state=0, max_iter=250)
clf_lr.fit(X_train_cv, y_train)

# create predictions
y_pred = clf_lr.predict(X_test_cv)

# find f-1 score
score = f1_score(y_test, y_pred, average='micro')
print('F-1 score : {}'.format(np.round(score,4)))

F-1 score : 0.7776


In [201]:
precision_recall_fscore_support(y_test, y_pred, average='macro')

(0.7776753712237583, 0.7777151870211807, 0.7775977226006794, None)

Now that training of both classifiers are done, we should move on to test the classifiers

### Test using labeled data that are used in training

To examine the classifiers on sentiment classification only, we only choose the data entries with sentiment. Subjectivity classifiers are examined in another notebook

In [202]:
#Apply preprocessing on test data
df_manual.text = df_manual.text.apply(preprocess)

In [203]:
df_manual

Unnamed: 0,text,polarity
0,yes#in#the#right#circumstance#there#be#a#case#...,1.0
1,governance#and#executive#assistant#remotejob#r...,0.0
2,cnbcmakeit#for#all#those#who#be#new#to#this#wo...,0.0
3,currently#down#with#covid#admin#two#neighbour#...,1.0
4,when#hire#talent#especially#remote#talent#your...,0.0
...,...,...
995,for#all#those#who#be#new#to#this#work#from#hom...,0.0
996,full#stack#web#developer#remotework#remotejob#wfh,0.0
997,where#will#collaboration#tool#for#hybridwork#g...,0.0
998,africafactszone#I#m#defend#the#corruption#sa#p...,1.0


df_manual = df_store_manual
df_manual.text = df_manual.text.apply(whitespace_indicator)

#### Multinomial NB test

In [204]:
test = list(df_manual['text'])
ref = list(df_manual['polarity'])
test_cv = cv.transform(test)
pred = clf_nb.predict(test_cv)

#df_test['predicted_polarity'] = pred
score = f1_score(ref, pred, average='micro')
print('F-1 score : {}'.format(np.round(score,4)))

F-1 score : 0.879


In [205]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
print(classification_report(ref, pred))

              precision    recall  f1-score   support

         0.0       0.92      0.87      0.89       582
         1.0       0.83      0.89      0.86       418

    accuracy                           0.88      1000
   macro avg       0.87      0.88      0.88      1000
weighted avg       0.88      0.88      0.88      1000



#### Logistic Regression test

In [206]:
pred = clf_lr.predict(test_cv)

#df_test['predicted_polarity'] = pred
score = f1_score(ref, pred, average='micro')
print('F-1 score : {}'.format(np.round(score,4)))

F-1 score : 0.9


In [207]:
print(classification_report(ref, pred))

              precision    recall  f1-score   support

         0.0       0.92      0.91      0.91       582
         1.0       0.88      0.89      0.88       418

    accuracy                           0.90      1000
   macro avg       0.90      0.90      0.90      1000
weighted avg       0.90      0.90      0.90      1000



It seems that both classifiers score high in this test. 

Previous tests have shown that the scores may vary +/- 0.005

### Test using labeled data that are not used in training

In [208]:
df5 = pd.read_json("manually_label_samples_test_labeled.json", orient='record')

df5.loc[df5.sentiment==-1,'sentiment']=4
df5.loc[df5.sentiment==0,'polarity']=1
df5.loc[df5.sentiment==4,'polarity']=1
df5.loc[df5.sentiment==2,'polarity']=0

df5 = df5[['text', 'polarity']].copy()
df5.text = df5.text.apply(preprocess)

df5 = df_store_5
df5.text = df5.text.apply(whitespace_indicator)

#### Multinomial NB test

In [209]:
test = list(df5['text'])
ref = list(df5['polarity'])
test_cv = cv.transform(test)
pred = clf_nb.predict(test_cv)

#df5['predicted_polarity'] = pred
score = f1_score(ref, pred, average='micro')
print('F-1 score : {}'.format(np.round(score,4)))

F-1 score : 0.7125


In [210]:
print(classification_report(ref, pred))

              precision    recall  f1-score   support

         0.0       0.78      0.74      0.76       244
         1.0       0.62      0.67      0.65       156

    accuracy                           0.71       400
   macro avg       0.70      0.71      0.70       400
weighted avg       0.72      0.71      0.71       400



#### Logistic Regression test

In [211]:
pred = clf_lr.predict(test_cv)

#df5['predicted_polarity'] = pred
score = f1_score(ref, pred, average='micro')
print('F-1 score : {}'.format(np.round(score,4)))

F-1 score : 0.71


In [212]:
print(classification_report(ref, pred))

              precision    recall  f1-score   support

         0.0       0.75      0.79      0.77       244
         1.0       0.64      0.59      0.61       156

    accuracy                           0.71       400
   macro avg       0.69      0.69      0.69       400
weighted avg       0.71      0.71      0.71       400



### Test using every labeled data in COVID topic

In [213]:
frames = [df_manual, df5]
df_test = pd.concat(frames)
df_test

Unnamed: 0,text,polarity
0,yes#in#the#right#circumstance#there#be#a#case#...,1.0
1,governance#and#executive#assistant#remotejob#r...,0.0
2,cnbcmakeit#for#all#those#who#be#new#to#this#wo...,0.0
3,currently#down#with#covid#admin#two#neighbour#...,1.0
4,when#hire#talent#especially#remote#talent#your...,0.0
...,...,...
395,do#you#know#1#3#of#the#we#population#be#age#50...,0.0
396,I#wait#for#bee#to#get#home#from#work#so#I#can#...,0.0
397,n#ppl#really#boutta#debate#where#covid#23#come...,0.0
398,least#we#dinnae#droon#folk#fir#mess#aboot#with...,0.0


#### Multinomial NB test

In [214]:
test = list(df_test['text'])
ref = list(df_test['polarity'])
test_cv = cv.transform(test)
pred = clf_nb.predict(test_cv)

score = f1_score(ref, pred, average='micro')
print('F-1 score : {}'.format(np.round(score,4)))

F-1 score : 0.8314


In [215]:
print(classification_report(ref, pred))

              precision    recall  f1-score   support

         0.0       0.88      0.83      0.85       826
         1.0       0.78      0.83      0.80       574

    accuracy                           0.83      1400
   macro avg       0.83      0.83      0.83      1400
weighted avg       0.83      0.83      0.83      1400



#### Logistic Regression test

In [216]:
pred = clf_lr.predict(test_cv)

#df5['predicted_polarity'] = pred
score = f1_score(ref, pred, average='micro')
print('F-1 score : {}'.format(np.round(score,4)))

F-1 score : 0.8457


In [217]:
print(classification_report(ref, pred))

              precision    recall  f1-score   support

         0.0       0.87      0.87      0.87       826
         1.0       0.82      0.81      0.81       574

    accuracy                           0.85      1400
   macro avg       0.84      0.84      0.84      1400
weighted avg       0.85      0.85      0.85      1400



### Now, classify the crawled data and output

In [218]:
clean_df = pd.read_json("cleaned_data.json", orient='record')
clean_df.text= clean_df.text.apply(preprocess)
X = list(clean_df['text'])
X_cv = cv.transform(X)
pred = clf_nb.predict(X_cv)
polarity = pred.tolist()
clean_df_1 = clean_df
clean_df_1 = clean_df_1.assign(polarity = polarity)
clean_df_1.to_csv("classfied_data_pol_final.csv", index=False)

In [219]:
clean_df_1

Unnamed: 0,created_at,author_id,text,id,lang,keyword,geo,withheld,polarity
0,2022-04-02 17:54:23+00:00,15066159,march#to#june#will#be#a#bit#sparse#for#sightin...,1510314716178046976,en,"[#lockdown, lockdown]",,,1.0
1,2022-04-02 17:47:49+00:00,292619181,alan#interview#prime#minister#boris#johnson#an...,1510313065589481472,en,"[#lockdown, lockdown]",,,0.0
2,2022-04-02 17:44:12+00:00,15720519,how#to#talk#to#your#kid#about#school#shooting#...,1510312154284630016,en,"[#lockdown, lockdown]",,,0.0
3,2022-04-02 17:42:46+00:00,1280441028064018432,fat#loss#in#a#week#weight#loss#for#free#traini...,1510311792698101760,en,"[#lockdown, lockdown]",,,0.0
4,2022-04-02 17:34:58+00:00,1449377983211450368,make#a#name#get#a#check#merchandise#royality#d...,1510309831005069312,en,"[#lockdown, lockdown]",,,0.0
...,...,...,...,...,...,...,...,...,...
66857,2022-04-02 05:00:19+00:00,1307968835694125056,goddessphotosau#sure#do#a#new#hobby#come#out#o...,1510119915801309184,en,[lockdown],,,1.0
66858,2022-04-02 05:00:01+00:00,2269577210,5#determine#the#strategy#for#pandemic#manageme...,1510119841281228800,en,[lockdown],,,0.0
66859,2022-04-02 04:59:59+00:00,3493412003,an#old#story#but#think#about#this#again#now#th...,1510119833366675456,en,[lockdown],,,1.0
66860,2022-04-02 04:59:53+00:00,1301866054839185408,tax#oz#even#then#his#wa#covid#0#forever#lockdo...,1510119806930145280,en,[lockdown],,,1.0
