In [35]:
# create a tweet covid-19 classifier using fasttext

In [36]:
# import libraries
import fasttext
import pandas as pd
import numpy as np
import re

In [37]:
# load the data
data = pd.read_csv('../Learning_ML/NLP/datasets/Corona_NLP_train.csv', encoding='latin1')

In [38]:
data.shape

(41157, 6)

In [39]:
data.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [40]:
data['Sentiment'].value_counts()

Sentiment
Positive              11422
Negative               9917
Neutral                7713
Extremely Positive     6624
Extremely Negative     5481
Name: count, dtype: int64

In [41]:
df_to_use = data.sample(5000)

In [42]:
df_to_use['Sentiment'].value_counts()

Sentiment
Positive              1379
Negative              1262
Neutral                940
Extremely Positive     813
Extremely Negative     606
Name: count, dtype: int64

In [43]:
# replace the sentiment values labels eg -> Extremely Positive -> __label__extremely_positive
df_to_use.replace({'Sentiment': {'Extremely Positive': 'Extremely_Positive', 'Extremely Negative': 'Extremely_Negative'}}, inplace=True)


In [44]:
df_to_use['Sentiment'].value_counts()

Sentiment
Positive              1379
Negative              1262
Neutral                940
Extremely_Positive     813
Extremely_Negative     606
Name: count, dtype: int64

In [45]:
df_to_use.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
13520,17319,62271,"Puerto Rico, USA",21-03-2020,How to Protect Yourself Against #Coronavirus W...,Positive
14031,17830,62782,Plymouth,21-03-2020,Well done to the if you ve seen any goods at i...,Extremely_Positive
20672,24471,69423,Canada,25-03-2020,koka Thank you to everyone who is working du...,Positive
24264,28063,73015,"Bath, England & Westminster, CO",26-03-2020,The price of some Spanish vegetables has risen...,Negative
25164,28963,73915,,30-03-2020,Could Supermarket Runs get worse? #coronavirus...,Extremely_Negative


In [46]:
# replace the sentiment values labels eg -> Extremely Positive -> __label__extremely_positive
df_to_use['Sentiment'] = '__label__' + df_to_use['Sentiment'].str.lower()
# make lowercas

In [47]:
df_to_use.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
13520,17319,62271,"Puerto Rico, USA",21-03-2020,How to Protect Yourself Against #Coronavirus W...,__label__positive
14031,17830,62782,Plymouth,21-03-2020,Well done to the if you ve seen any goods at i...,__label__extremely_positive
20672,24471,69423,Canada,25-03-2020,koka Thank you to everyone who is working du...,__label__positive
24264,28063,73015,"Bath, England & Westminster, CO",26-03-2020,The price of some Spanish vegetables has risen...,__label__negative
25164,28963,73915,,30-03-2020,Could Supermarket Runs get worse? #coronavirus...,__label__extremely_negative


In [50]:
import spacy

nlp = spacy.load('en_core_web_sm')
def preprocess_text(text):
    text = text.lower()
    # remove non-alphabetic characters
    text = re.sub(r'[^a-zA-A]', ' ', text)
    # remove whitespaces
    text = re.sub(r' +', ' ', text)
    # remove stop words
    doc = nlp(text)
    doc = ' '.join([token.text for token in doc if not token.is_stop])
    return doc.strip()

In [53]:
preprocess_text("I can not wait to visit the zoo! #fun 232")

'wait visit zoo fun'

In [54]:
df_to_use['tweet'] = df_to_use['Sentiment'] + ' ' + df_to_use['OriginalTweet'].apply(preprocess_text)

In [55]:
df_to_use.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,tweet
13520,17319,62271,"Puerto Rico, USA",21-03-2020,How to Protect Yourself Against #Coronavirus W...,__label__positive,__label__positive protect coronavirus pumping ...
14031,17830,62782,Plymouth,21-03-2020,Well done to the if you ve seen any goods at i...,__label__extremely_positive,__label__extremely_positive ve seen goods infl...
20672,24471,69423,Canada,25-03-2020,koka Thank you to everyone who is working du...,__label__positive,__label__positive koka thank working scary pan...
24264,28063,73015,"Bath, England & Westminster, CO",26-03-2020,The price of some Spanish vegetables has risen...,__label__negative,__label__negative price spanish vegetables ris...
25164,28963,73915,,30-03-2020,Could Supermarket Runs get worse? #coronavirus...,__label__extremely_negative,__label__extremely_negative supermarket runs w...


In [56]:
# split into train and test
from sklearn.model_selection import train_test_split

train, test = train_test_split(df_to_use, test_size=0.2, random_state=42)

In [57]:
train.shape

(4000, 7)

In [58]:
test.shape

(1000, 7)

In [59]:
# save the train and test data
train.to_csv('train.txt', index=False, header=False, columns=['tweet'])
test.to_csv('test.txt', index=False, header=False, columns=['tweet'])

In [69]:
# train the model
model = fasttext.train_supervised(input='train.txt', lr=0.02, epoch=1000, wordNgrams=2, verbose=2, minCount=1, loss='hs')

Read 0M words
Number of words:  16436
Number of labels: 5
Progress: 100.0% words/sec/thread: 2309307 lr:  0.000000 avg.loss:  0.123681 ETA:   0h 0m 0s


In [70]:
# test the model
model.test('test.txt') # -> (number of samples, precision, recall)
# accuracy

(1000, 0.432, 0.432)

In [71]:
# save the model
model.save_model('model.bin')

In [85]:
# load the model
model = fasttext.load_model('model.bin')



In [86]:
text = preprocess_text('The covid-19 pandemic has affected the world in many ways, it has caused a lot of deaths and economic crisis. We need to take it seriously. #covid19')

model.predict(text)

(('__label__negative',), array([0.70393723]))

In [87]:

text = preprocess_text('I am happy that the covid-19 pandemic is over. I can now go out and have fun with my friends. #covid19')

model.predict(text)

(('__label__extremely_positive',), array([0.57035691]))

In [88]:
model.get_nearest_neighbors('sad')

[(0.9975319504737854, 'scams'),
 (0.9968896508216858, 'crash'),
 (0.996627151966095, 'problem'),
 (0.9957302212715149, 'wrong'),
 (0.9955958127975464, 'police'),
 (0.995547890663147, 'unemployment'),
 (0.9941920042037964, 'cuts'),
 (0.9941766262054443, 'oilandgas'),
 (0.9933328628540039, 'midnight'),
 (0.9927522540092468, 'worse')]

In [89]:
model.get_nearest_neighbors('happy')

[(0.9991841316223145, 'heroes'),
 (0.9988672137260437, 'ensure'),
 (0.9988539814949036, 'free'),
 (0.9987916946411133, 'amazing'),
 (0.9987616539001465, 'safe'),
 (0.997303307056427, 'bonus'),
 (0.9968224167823792, 'strong'),
 (0.9967992305755615, 'beautiful'),
 (0.9964870810508728, 'wisdom'),
 (0.9964268803596497, 'assure')]

In [90]:
model.get_nearest_neighbors('coronavirus')

[(0.9492374062538147, 'buying'),
 (0.9093716144561768, 'covid'),
 (0.8786273002624512, 'coronacrisis'),
 (0.8765828013420105, 'limits'),
 (0.8214386105537415, 'companies'),
 (0.8096031546592712, 'hardest'),
 (0.7893620133399963, 'left'),
 (0.7832221388816833, 'price'),
 (0.769260585308075, 'people'),
 (0.753709077835083, 'prices')]