## Machine Learning for Predicting Sentiment Intensity

In [2]:
#Importing required Libraries

import numpy as np
import pandas as pd
import nltk
import string
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings("ignore")

#### Working on Training Set

In [3]:
anger_train=pd.read_csv("anger.csv")
fear_train=pd.read_csv("fear.csv")
joy_train=pd.read_csv("joy.csv")
sadness_train=pd.read_csv("sadness.csv")

#### Combining all training set

In [4]:
dataset=pd.concat((anger_train, fear_train, joy_train, sadness_train))
dataset

Unnamed: 0,id,tweet,emotion,score
0,10000,How the fu*k! Who the heck! moved my fridge!.....,anger,0.938
1,10001,So my Indian Uber driver just called someone t...,anger,0.896
2,10002,@DPD_UK I asked for my parcel to be delivered ...,anger,0.896
3,10003,so ef whichever butt wipe pulled the fire alar...,anger,0.896
4,10004,Don't join @BTCare they put the phone down on ...,anger,0.896
...,...,...,...,...
781,40781,@VivienLloyd Thank you so much! Just home - st...,sadness,0.104
782,40782,Just put the winter duvet on â˜ƒï¸â„ï¸ðŸŒ¬â...,sadness,0.104
783,40783,@SilkInSide @TommyJoeRatliff that's so pretty!...,sadness,0.088
784,40784,@BluesfestByron second artist announcement loo...,sadness,0.083


In [5]:
dataset.reset_index(level=None, drop=True, inplace=True)
dataset

Unnamed: 0,id,tweet,emotion,score
0,10000,How the fu*k! Who the heck! moved my fridge!.....,anger,0.938
1,10001,So my Indian Uber driver just called someone t...,anger,0.896
2,10002,@DPD_UK I asked for my parcel to be delivered ...,anger,0.896
3,10003,so ef whichever butt wipe pulled the fire alar...,anger,0.896
4,10004,Don't join @BTCare they put the phone down on ...,anger,0.896
...,...,...,...,...
3642,40781,@VivienLloyd Thank you so much! Just home - st...,sadness,0.104
3643,40782,Just put the winter duvet on â˜ƒï¸â„ï¸ðŸŒ¬â...,sadness,0.104
3644,40783,@SilkInSide @TommyJoeRatliff that's so pretty!...,sadness,0.088
3645,40784,@BluesfestByron second artist announcement loo...,sadness,0.083


In [6]:
for i in range(0,4):
    print(dataset['emotion'].value_counts().index[i],"is ", 
          np.round((dataset['emotion'].value_counts()[i]/dataset.shape[0]), 4)*100,"%")

fear is  31.45 %
anger is  24.43 %
joy is  22.57 %
sadness is  21.55 %


In [7]:
remove_punctuation = dict((ord(char), None) for char in string.punctuation)
remove_numbers = dict((ord(str(num)), None) for num in np.arange(0,10))

#### Text Preprocessing

In [8]:
def clean_text(data):
        import re
        rem1=r'#[A-Za-z0-9_]+'
        rem2=r'@[A-Za-z0-9_]+'
        rem3=r'https?://[A-Za-z0-9./]+'
        pair=r'|'.join((rem1,rem2,rem3))
        tweet_=re.sub(pair,'', data)
        letters_only=re.sub('[^a-zA-Z]', ' ',tweet_)
        lower_case=letters_only.lower()
        string_=(lower_case.translate(remove_punctuation)).translate(remove_numbers)
        stoplist = stopwords.words('english')
        clean=[x for x in nltk.word_tokenize(string_) if x not in stoplist]
        cleaned= ' '.join(clean)
        
        return cleaned

In [9]:
dataset['tweet'][2]

'@DPD_UK I asked for my parcel to be delivered to a pick up store not my address #fuming #poorcustomerservice'

In [10]:
clean_text(dataset['tweet'][2])

'asked parcel delivered pick store address'

In [11]:
dataset.shape

(3647, 4)

In [12]:
for i in range(0, dataset.shape[0]):
    dataset['tweet'][i]=clean_text(dataset['tweet'][i])
dataset

Unnamed: 0,id,tweet,emotion,score
0,10000,fu k heck moved fridge knock landlord door,anger,0.938
1,10001,indian uber driver called someone n word movin...,anger,0.896
2,10002,asked parcel delivered pick store address,anger,0.896
3,10003,ef whichever butt wipe pulled fire alarm davis...,anger,0.896
4,10004,join put phone talk rude taking money acc will...,anger,0.896
...,...,...,...,...
3642,40781,thank much home stunned happy think sunk yet wow,sadness,0.104
3643,40782,put winter duvet,sadness,0.104
3644,40783,pretty love sky background purple highlights d...,sadness,0.088
3645,40784,second artist announcement looking good,sadness,0.083


#### Converting to Vector for ML model

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [12]:
x=vect.fit_transform(dataset['tweet'])
x

<3647x6993 sparse matrix of type '<class 'numpy.int64'>'
	with 26278 stored elements in Compressed Sparse Row format>

#### Loading Developement Set

In [13]:
anger_dev=pd.read_csv("anger_dev.csv")
anger_dev

Unnamed: 0,id,tweet,emotion,score
0,10857,@ZubairSabirPTI pls dont insult the word 'Molna',anger,0.479
1,10858,@ArcticFantasy I would have almost took offens...,anger,0.458
2,10859,@IllinoisLoyalty that Rutgers game was an abom...,anger,0.562
3,10860,@CozanGaming that's what lisa asked before she...,anger,0.500
4,10861,Sometimes I get mad over something so minuscul...,anger,0.708
...,...,...,...,...
79,10936,@Jen_ny69 People will always get offended ever...,anger,0.562
80,10937,@gayla_weeks1 I try not to let my anger seep i...,anger,0.625
81,10938,I hope my hustle don't offend nobody,anger,0.292
82,10939,"Just watched Django Unchained, Other people ma...",anger,0.229


In [14]:
for i in range(0, anger_dev.shape[0]):
    anger_dev['tweet'][i]=clean_text(anger_dev['tweet'][i])
anger_dev

Unnamed: 0,id,tweet,emotion,score
0,10857,pls dont insult word molna,anger,0.479
1,10858,would almost took offense actually snapped,anger,0.458
2,10859,rutgers game abomination affront god man must ...,anger,0.562
3,10860,lisa asked started raging call heh,anger,0.500
4,10861,sometimes get mad something minuscule try ruin...,anger,0.708
...,...,...,...,...
79,10936,people always get offended everyone situation ...,anger,0.562
80,10937,try let anger seep reviews resent time wasted ...,anger,0.625
81,10938,hope hustle offend nobody,anger,0.292
82,10939,watched django unchained people may frown titt...,anger,0.229


In [15]:
dev=vect.transform(anger_dev['tweet'])
dev

<84x6993 sparse matrix of type '<class 'numpy.int64'>'
	with 500 stored elements in Compressed Sparse Row format>

#### Model Selection

**Random Forest**

In [23]:
from sklearn.ensemble import RandomForestRegressor
forest=RandomForestRegressor(n_estimators=160)

forest.fit(x,dataset['score'])

RandomForestRegressor(n_estimators=160)

##### Prediction on Developement Set

In [24]:
y_predf=forest.predict(dev)

In [25]:
from sklearn import metrics

In [30]:
# MSE
metrics.mean_squared_error(anger_dev['score'], y_predf)

0.02176336866035388

In [27]:
score=np.array(anger_dev['score'])

In [28]:
data=pd.DataFrame({'Actual':score, 'Predicted':y_predf})
data

Unnamed: 0,Actual,Predicted
0,0.479,0.361573
1,0.458,0.390236
2,0.562,0.537890
3,0.500,0.518528
4,0.708,0.430963
...,...,...
79,0.562,0.479443
80,0.625,0.410225
81,0.292,0.433305
82,0.229,0.482423


In [29]:
# Checking Correlation
data.corr()

Unnamed: 0,Actual,Predicted
Actual,1.0,0.362916
Predicted,0.362916,1.0


**XGBoost**

In [118]:
from xgboost import XGBRegressor
xgb=XGBRegressor(n_estimators=180, learning_rate=0.2)

xgb.fit(x,dataset['score'])

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.2, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=180, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [119]:
y_predx=xgb.predict(dev)

In [120]:
from sklearn import metrics

In [121]:
metrics.mean_squared_error(anger_dev['score'], y_predx)

0.021102320400045366

In [122]:
data1=pd.DataFrame({'Actual':score, 'Predicted':y_predx})
data1

Unnamed: 0,Actual,Predicted
0,0.479,0.483262
1,0.458,0.416259
2,0.562,0.508033
3,0.500,0.501005
4,0.708,0.430512
...,...,...
79,0.562,0.527304
80,0.625,0.430830
81,0.292,0.485638
82,0.229,0.432587


In [123]:
data1.corr()

Unnamed: 0,Actual,Predicted
Actual,1.0,0.366358
Predicted,0.366358,1.0


#### Choosing XGBoost for Predicting Test Set

In [135]:
anger_test=pd.read_csv("anger_test.csv")
fear_test=pd.read_csv("fear_test.csv")
joy_test=pd.read_csv("joy_test.csv")
sadness_test=pd.read_csv("sadness_test.csv")

In [136]:
anger_test.head()

Unnamed: 0,id,tweet,emotion,score
0,10941,At the point today where if someone says somet...,anger,NONE
1,10942,@CorningFootball IT'S GAME DAY!!!! T MIN...,anger,NONE
2,10943,This game has pissed me off more than any othe...,anger,NONE
3,10944,@spamvicious I've just found out it's Candice ...,anger,NONE
4,10945,@moocowward @mrsajhargreaves @Melly77 @GaryBar...,anger,NONE


In [137]:
fear_test.head()

Unnamed: 0,id,tweet,emotion,score
0,21257,#Matthew 25; 1-13\nCould somebody shoot a #vid...,fear,NONE
1,21258,@bkero @whispersystems Which really sucks beca...,fear,NONE
2,21259,Be #afraid of the #quiet ones they are the one...,fear,NONE
3,21260,@riinkanei he's a horrible person and now i ga...,fear,NONE
4,21261,What we fear doing most is usually what we mos...,fear,NONE


In [138]:
def change(data):
    for i in range(0, data.shape[0]):
        data['tweet'][i]=clean_text(data['tweet'][i])
    return vect.transform(data['tweet'])

In [139]:
a=change(anger_test)
a

<760x6993 sparse matrix of type '<class 'numpy.int64'>'
	with 4504 stored elements in Compressed Sparse Row format>

#### *Anger-test*

In [140]:
anger_test['score']=xgb.predict(a)
anger_test

Unnamed: 0,id,tweet,emotion,score
0,10941,point today someone says something remotely ki...,anger,0.473742
1,10942,game day minus,anger,0.394090
2,10943,game pissed game year blood boiling time turn,anger,0.679387
3,10944,found candice candace pout likes,anger,0.483262
4,10945,come mum th k tweets,anger,0.483262
...,...,...,...,...
755,11696,supposed animosity bullshit con iranians,anger,0.483262
756,11697,byu offense score vs wvu,anger,0.418885
757,11698,id love c gyimah action coach holding grudge,anger,0.416081
758,11699,forgiving means operating god spirit amp god u...,anger,0.402183


In [144]:
anger_test.to_csv("anger_p.csv", index=False, header=True)

#### *Fear-test*

In [141]:
f=change(fear_test)
fear_test['score']=xgb.predict(f)
fear_test

Unnamed: 0,id,tweet,emotion,score
0,21257,ncould somebody shoot nit could videos time tu...,fear,0.630587
1,21258,really sucks typing mobile device always horri...,fear,0.709014
2,21259,ones ones actually,fear,0.445982
3,21260,horrible person gag see people quote,fear,0.517090
4,21261,fear usually need tim ferriss,fear,0.483262
...,...,...,...,...
990,22247,vs atlanta yr vs rockies dbacks yr combined vs...,fear,0.437248
991,22248,shaking,fear,0.483262
992,22249,guys dating yet dating mack north ohio bewdley,fear,0.483262
993,22250,listening eurythmics nme polish gothic metal b...,fear,0.483262


In [145]:
fear_test.to_csv("fear_p.csv", index=False, header=True)

#### *Joy-test*

In [142]:
j=change(joy_test)
joy_test['score']=xgb.predict(j)
joy_test

Unnamed: 0,id,tweet,emotion,score
0,30902,must knowing means adj happy cheerful,joy,0.511956
1,30903,old saying shared one gained another day,joy,0.494778
2,30904,bridget jones baby bloody hilarious,joy,0.625841
3,30905,sparkling water makes life sparkly,joy,0.309209
4,30906,tired everybody telling chill everythings ok f...,joy,0.630141
...,...,...,...,...
709,31611,tired body mind sparkling teeth say followers ...,joy,0.345593
710,31612,refuse chirp chirp girl,joy,0.483262
711,31613,hard stifle laughter overheard comment really ...,joy,0.669452
712,31614,walking little boy red shirt years age shoutin...,joy,0.317993


In [146]:
joy_test.to_csv("joy_p.csv", index=False, header=True)

#### *Sadness-test*

In [143]:
s=change(sadness_test)
sadness_test['score']=xgb.predict(s)
sadness_test

Unnamed: 0,id,tweet,emotion,score
0,40860,teens sons left car get haircuts praying storm...,sadness,0.483262
1,40861,teens sons left car get haircuts praying storm...,sadness,0.483262
2,40862,hartramsey suplift still discouraged means lis...,sadness,0.437686
3,40863,nearly dropped phone sink hahahaha,sadness,0.536457
4,40864,whenever feeling sad listen monsta x hug teddy...,sadness,0.759841
...,...,...,...,...
668,41528,candice constantly pout,sadness,0.483262
669,41529,cc talked week still initiate refund transaction,sadness,0.528793
670,41530,pull afew weeks ago sadly theres game audio so...,sadness,0.477793
671,41531,buying art supplies debating serious buy acryl...,sadness,0.343752


In [147]:
sadness_test.to_csv("sadness_p.csv", index=False, header=True)