In [1]:
import pandas as pd
import importlib
import seaborn as sns
import preprocessing
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from models import vader_sa as vader
from models import afinn_sa as afinn
from models import hyperopt_pt as optimizer
import utils
# Re-import models.hyperopt_pt so edits made to that file are picked up without restarting the kernel.
importlib.reload(optimizer)

<module 'models.hyperopt_pt' from '/Users/venugopalbhatia/Documents/Computational Methods for Informatics/Assignments/Project/models/hyperopt_pt.py'>

# Agenda
<ol>

<li> Create a method for basic cleaning (removing retweet tags, etc.). </li>
<li> Create a method for POS tagging and lemmatization. </li>
<li> Run sentiment analysis on the basic-cleaned data using VADER and AFINN. </li>
<li> Train multiple classifiers, including Naive Bayes and SVMs, on the data. Use HyperOpt for hyperparameter tuning, and select the best model using HyperOpt-sklearn. </li>

</ol>

In [2]:
# Single Preprocessor instance; its cleaning, tokenizing and lemmatizing methods are applied to the tweet frame in later cells.
preprocessor = preprocessing.Preprocessor()

## Inspecting the re-annotated data and preparing the training data

In [2]:
# Load the raw training tweets; the path is relative to the notebook's working directory.
df_tweetData = pd.read_csv("./data/Corona_NLP_train.csv")
# Bare last expression -> rich (truncated) display of the frame.
df_tweetData

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative
...,...,...,...,...,...,...
41152,44951,89903,"Wellington City, New Zealand",14-04-2020,Airline pilots offering to stock supermarket s...,Neutral
41153,44952,89904,,14-04-2020,Response to complaint not provided citing COVI...,Extremely Negative
41154,44953,89905,,14-04-2020,You know its getting tough when @KameronWilds...,Positive
41155,44954,89906,,14-04-2020,Is it wrong that the smell of hand sanitizer i...,Neutral


In [3]:
# Show the column labels of the raw frame to confirm the schema before any preprocessing columns are added.
df_tweetData.columns

Index(['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet',
       'Sentiment'],
      dtype='object')

In [14]:
# Frequency of each distinct Location string, most common first (value_counts ignores missing values by default).
df_tweetData['Location'].value_counts()

London                            540
United States                     528
London, England                   520
New York, NY                      395
Washington, DC                    373
                                 ... 
Loxahatchee, FL                     1
The American Riviera                1
Lawrence MA                         1
Priced out of the East Bay, 19      1
Gloucestershire, UK                 1
Name: Location, Length: 12220, dtype: int64

In [4]:
# Preprocessing pipeline. Every stage is called as stage(frame, source_column, target_column)
# and reads the column written by the stage before it, so the order below matters.
# The results are read back from df_tweetData in the following cells, i.e. the frame is
# modified in place rather than returned.
pipeline_stages = (
    (preprocessor.simple_clean, 'OriginalTweet', 'message_cleaned'),
    (preprocessor.clean_tweets, 'message_cleaned', 'message_tokenized'),
    (preprocessor.lemmatize_tweets, 'message_tokenized', 'message_tokens_lemmatized'),
)
for run_stage, source_col, target_col in pipeline_stages:
    run_stage(df_tweetData, source_col, target_col)

In [5]:
# Spot-check one tweet (positional row 41154) at every preprocessing stage:
# raw text -> lightly cleaned text -> tokens -> lemmatized tokens.
sample_row = df_tweetData.iloc[41154]
for stage_column in (
    'OriginalTweet',
    'message_cleaned',
    'message_tokenized',
    'message_tokens_lemmatized',
):
    print(sample_row[stage_column])

You know its getting tough when @KameronWilds  is rationing toilet paper #coronavirus #toiletpaper @kroger martinsville, help us out!!
You know it s getting tough when   is rationing toilet paper  coronavirus  toiletpaper   martinsville  help us out
['know', 'getting', 'tough', 'rationing', 'toilet', 'paper', 'coronavirus', 'toiletpaper', 'martinsville', 'help', 'us']
['know', 'get', 'tough', 'rationing', 'toilet', 'paper', 'coronavirus', 'toiletpaper', 'martinsville', 'help', 'u']


In [6]:
# Distinct sentiment labels, in order of first appearance in the data.
sentiment_classes = [label for label in df_tweetData['Sentiment'].unique()]
sentiment_classes

['Neutral', 'Positive', 'Extremely Negative', 'Negative', 'Extremely Positive']

In [7]:
# Collapse the five annotated sentiment labels into three numeric classes:
# both negative labels -> -1, neutral -> 0, both positive labels -> 1.
numerical_sentiment = {
    **dict.fromkeys(("Extremely Negative", "Negative"), -1),
    "Neutral": 0,
    **dict.fromkeys(("Positive", "Extremely Positive"), 1),
}

# Element-wise lookup; any label missing from the mapping would become NaN.
sentiment_labels = df_tweetData['Sentiment']
df_tweetData['Sentiment_numerical'] = sentiment_labels.map(numerical_sentiment)

In [8]:
# Persist the preprocessed frame (including the numeric label) to CSV.
# The index is written as well (pandas default), so it comes back as an unnamed first column on re-read.
df_tweetData.to_csv("./data/TweetData_train_processed_ml.csv")

## VADER- and AFINN-based sentiment analysis on the training data

In [9]:
# VADER: score the lightly cleaned text; the helper returns a separate frame of scores.
df_vader = vader.sentiment_scores(df_tweetData,'message_cleaned')
# Prefix every score column with 'vader_' so the origin of each score stays clear after joining.
df_vader.columns = 'vader_' + df_vader.columns
# Index-aligned join onto the tweet frame. NOTE: this rebinds df_tweetData, and re-running the
# cell on an already-joined frame will raise because the vader_* columns would overlap.
df_tweetData = df_tweetData.join(df_vader)
# AFINN: the return value is discarded; judging by the displayed frame the helper appears to
# add 'afinn_score' to df_tweetData in place -- TODO confirm in models/afinn_sa.py.
afinn.sentiment_scores(df_tweetData,'message_cleaned')
df_tweetData


Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,retweeted_from,mentions,hashtags,links,message_cleaned,message_tokenized,message_tokens_lemmatized,Sentiment_numerical,vader_dict,vader_neg,vader_neu,vader_pos,vader_compound,afinn_score
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,[],"[MeNyrbie, Phil_Gahan, Chrisitv]",[],"[https://t.co/iFz9FAn2Pa, https://t.co/xX6ghGF...",and and,[],[],0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.000,1.000,0.000,0.0000,0.0
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive,[],[],[],[],advice Talk to your neighbours family to excha...,"[advice, Talk, neighbours, family, exchange, p...","[advice, talk, neighbour, family, exchange, ph...",1,"{'neg': 0.0, 'neu': 0.923, 'pos': 0.077, 'comp...",0.000,0.923,0.077,0.2500,1.0
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive,[],[],[],[https://t.co/bInCA9Vp8P],Coronavirus Australia Woolworths to give elde...,"[Coronavirus, Australia, Woolworths, give, eld...","[coronavirus, australia, woolworths, give, eld...",1,"{'neg': 0.0, 'neu': 0.812, 'pos': 0.188, 'comp...",0.000,0.812,0.188,0.4588,0.0
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive,[],[],"[COVID19france, COVID_19, COVID19, coronavirus...",[https://t.co/zrlG0Z520j],My food stock is not the only one which is emp...,"[food, stock, one, empty, PLEASE, panic, ENOUG...","[food, stock, one, empty, please, panic, enoug...",1,"{'neg': 0.101, 'neu': 0.735, 'pos': 0.164, 'co...",0.101,0.735,0.164,0.4824,0.0
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative,[],[],"[COVID19, coronavirus, CoronavirusFrance, rest...",[https://t.co/usmuaLq72n],Me ready to go at supermarket during the COV...,"[ready, go, supermarket, COVID19, outbreak, pa...","[ready, go, supermarket, covid19, outbreak, pa...",-1,"{'neg': 0.238, 'neu': 0.669, 'pos': 0.093, 'co...",0.238,0.669,0.093,-0.7506,-7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41152,44951,89903,"Wellington City, New Zealand",14-04-2020,Airline pilots offering to stock supermarket s...,Neutral,[],[],"[NZ, COVID]",[https://t.co/cz89uA0HNp],Airline pilots offering to stock supermarket s...,"[Airline, pilots, offering, stock, supermarket...","[airline, pilot, offer, stock, supermarket, sh...",0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.000,1.000,0.000,0.0000,0.0
41153,44952,89904,,14-04-2020,Response to complaint not provided citing COVI...,Extremely Negative,[],[],[],[],Response to complaint not provided citing COVI...,"[Response, complaint, provided, citing, COVID,...","[response, complaint, provide, cite, covid, 19...",-1,"{'neg': 0.198, 'neu': 0.802, 'pos': 0.0, 'comp...",0.198,0.802,0.000,-0.6369,-3.0
41154,44953,89905,,14-04-2020,You know its getting tough when @KameronWilds...,Positive,[],"[KameronWilds, kroger]","[coronavirus, toiletpaper]",[],You know it s getting tough when is rationin...,"[know, getting, tough, rationing, toilet, pape...","[know, get, tough, rationing, toilet, paper, c...",1,"{'neg': 0.078, 'neu': 0.781, 'pos': 0.141, 'co...",0.078,0.781,0.141,0.2960,2.0
41155,44954,89906,,14-04-2020,Is it wrong that the smell of hand sanitizer i...,Neutral,[],[],"[coronavirus, COVID19, coronavirus]",[],Is it wrong that the smell of hand sanitizer i...,"[wrong, smell, hand, sanitizer, starting, turn...","[wrong, smell, hand, sanitizer, start, turn, c...",0,"{'neg': 0.139, 'neu': 0.717, 'pos': 0.143, 'co...",0.139,0.717,0.143,0.0258,-2.0


In [10]:
# Re-assemble each lemmatized token list into one space-separated string.
lemma_tokens = df_tweetData["message_tokens_lemmatized"]
df_tweetData["lemmatized_message"] = lemma_tokens.apply(" ".join)

# Tweet_Length is the character count of the lemmatized string, not of the original tweet.
df_tweetData['Tweet_Length'] = df_tweetData['lemmatized_message'].map(len)

# Save the fully processed frame, now including the lexicon scores added earlier.
df_tweetData.to_csv("./data/Corona_Tweet_Data_processed_ml.csv")

## Bayesian hyperparameter tuning for various ML models
### Models included: 'svm', 'naive_bayes', 'knn', 'random_forest', 'decision_tree'
### Models are trained using 5-fold stratified CV.
### The number of features is also treated as a hyperparameter.


In [13]:
# Naive Bayes search: 50 TPE-suggested evaluations over the lemmatized tokens,
# with 'Sentiment_numerical' as the target.
NB = optimizer.optimize_model(
    model='naive_bayes',
    algo=tpe.suggest,
    max_evals=50,
    df=df_tweetData,
    message_tokenized='message_tokens_lemmatized',
    x='feature',
    y='Sentiment_numerical',
)
# Displays a (best hyperparameters, hyperopt Trials) pair.
NB.get_optimum_hyperparameters()

100%|██████████| 50/50 [02:54<00:00,  3.49s/trial, best loss: -0.6854965093800853]


({'alpha': 0.3871070316515677, 'numFeatures': 1930},
 <hyperopt.base.Trials at 0x7fc95d20a3d0>)

In [14]:
# Random forest search: 50 TPE-suggested evaluations over the lemmatized tokens,
# with 'Sentiment_numerical' as the target.
randomForest = optimizer.optimize_model(
    model='random_forest',
    algo=tpe.suggest,
    max_evals=50,
    df=df_tweetData,
    message_tokenized='message_tokens_lemmatized',
    x='feature',
    y='Sentiment_numerical',
)
# Displays a (best hyperparameters, hyperopt Trials) pair.
randomForest.get_optimum_hyperparameters()

100%|██████████| 50/50 [02:02<00:00,  2.46s/trial, best loss: -0.5240417339506397]


({'criterion': 0,
  'max_depth': 13,
  'max_features': 3,
  'n_estimators': 12,
  'numFeatures': 586},
 <hyperopt.base.Trials at 0x7fc90afde700>)

In [15]:
# SVM search: up to 50 TPE-suggested evaluations over the lemmatized tokens,
# with 'Sentiment_numerical' as the target.
SVC = optimizer.optimize_model(
    model='svm',
    algo=tpe.suggest,
    max_evals=50,
    df=df_tweetData,
    message_tokenized='message_tokens_lemmatized',
    x='feature',
    y='Sentiment_numerical',
)
SVC.get_optimum_hyperparameters()

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

In [None]:
# Decision tree search: up to 50 TPE-suggested evaluations over the lemmatized tokens,
# with 'Sentiment_numerical' as the target.
decision_tree = optimizer.optimize_model(
    model='decision_tree',
    algo=tpe.suggest,
    max_evals=50,
    df=df_tweetData,
    message_tokenized='message_tokens_lemmatized',
    x='feature',
    y='Sentiment_numerical',
)
decision_tree.get_optimum_hyperparameters()

In [None]:
# K-nearest-neighbours search: up to 50 TPE-suggested evaluations over the lemmatized tokens.
# The target is 'Sentiment_numerical', the only numeric label column this notebook creates and
# the one every other model search uses. The previous value 'Sentiment_num_cl' is not created
# anywhere in the notebook.
knn = optimizer.optimize_model(
    model='knn',
    algo=tpe.suggest,
    max_evals=50,
    df=df_tweetData,
    message_tokenized='message_tokens_lemmatized',
    x='feature',
    y='Sentiment_numerical',
)
knn.get_optimum_hyperparameters()