## Deep Learning for Predicting Sentiment Intensity

In [1]:
#Importing required Libraries

import numpy as np
import pandas as pd
import nltk
import string
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings("ignore")

#### Working on Training Set

In [2]:
anger_train=pd.read_csv("anger.csv")
fear_train=pd.read_csv("fear.csv")
joy_train=pd.read_csv("joy.csv")
sadness_train=pd.read_csv("sadness.csv")

In [3]:
dataset=pd.concat((anger_train, fear_train, joy_train, sadness_train))
dataset

Unnamed: 0,id,tweet,emotion,score
0,10000,How the fu*k! Who the heck! moved my fridge!.....,anger,0.938
1,10001,So my Indian Uber driver just called someone t...,anger,0.896
2,10002,@DPD_UK I asked for my parcel to be delivered ...,anger,0.896
3,10003,so ef whichever butt wipe pulled the fire alar...,anger,0.896
4,10004,Don't join @BTCare they put the phone down on ...,anger,0.896
...,...,...,...,...
781,40781,@VivienLloyd Thank you so much! Just home - st...,sadness,0.104
782,40782,Just put the winter duvet on â˜ƒï¸â„ï¸ðŸŒ¬â...,sadness,0.104
783,40783,@SilkInSide @TommyJoeRatliff that's so pretty!...,sadness,0.088
784,40784,@BluesfestByron second artist announcement loo...,sadness,0.083


In [4]:
dataset.isnull().sum()

id         0
tweet      0
emotion    0
score      0
dtype: int64

In [5]:
dataset.reset_index(level=None, drop=True, inplace=True)
dataset

Unnamed: 0,id,tweet,emotion,score
0,10000,How the fu*k! Who the heck! moved my fridge!.....,anger,0.938
1,10001,So my Indian Uber driver just called someone t...,anger,0.896
2,10002,@DPD_UK I asked for my parcel to be delivered ...,anger,0.896
3,10003,so ef whichever butt wipe pulled the fire alar...,anger,0.896
4,10004,Don't join @BTCare they put the phone down on ...,anger,0.896
...,...,...,...,...
3642,40781,@VivienLloyd Thank you so much! Just home - st...,sadness,0.104
3643,40782,Just put the winter duvet on â˜ƒï¸â„ï¸ðŸŒ¬â...,sadness,0.104
3644,40783,@SilkInSide @TommyJoeRatliff that's so pretty!...,sadness,0.088
3645,40784,@BluesfestByron second artist announcement loo...,sadness,0.083


In [6]:
dataset['emotion'].value_counts()

fear       1147
anger       891
joy         823
sadness     786
Name: emotion, dtype: int64

In [7]:
for i in range(0,4):
    print(dataset['emotion'].value_counts().index[i],"is ", 
          np.round((dataset['emotion'].value_counts()[i]/dataset.shape[0]), 4)*100,"%")

fear is  31.45 %
anger is  24.43 %
joy is  22.57 %
sadness is  21.55 %


* Data is quite balanced

#### Text Preprocessing (Cleaning)

In [8]:
remove_punctuation = dict((ord(char), None) for char in string.punctuation)
remove_numbers = dict((ord(str(num)), None) for num in np.arange(0,10))

In [9]:
def clean_text(data):
        import re
        rem1=r'#[A-Za-z0-9_]+'
        rem2=r'@[A-Za-z0-9_]+'
        rem3=r'https?://[A-Za-z0-9./]+'
        pair=r'|'.join((rem1,rem2,rem3))
        tweet_=re.sub(pair,'', data)
        letters_only=re.sub('[^a-zA-Z]', ' ',tweet_)
        lower_case=letters_only.lower()
        string_=(lower_case.translate(remove_punctuation)).translate(remove_numbers)
        stoplist = stopwords.words('english')
        clean=[x for x in nltk.word_tokenize(string_) if x not in stoplist]
        cleaned= ' '.join(clean)
        
        return cleaned

In [10]:
for i in range(0, dataset.shape[0]):
    dataset['tweet'][i]=clean_text(dataset['tweet'][i])
dataset

Unnamed: 0,id,tweet,emotion,score
0,10000,fu k heck moved fridge knock landlord door,anger,0.938
1,10001,indian uber driver called someone n word movin...,anger,0.896
2,10002,asked parcel delivered pick store address,anger,0.896
3,10003,ef whichever butt wipe pulled fire alarm davis...,anger,0.896
4,10004,join put phone talk rude taking money acc will...,anger,0.896
...,...,...,...,...
3642,40781,thank much home stunned happy think sunk yet wow,sadness,0.104
3643,40782,put winter duvet,sadness,0.104
3644,40783,pretty love sky background purple highlights d...,sadness,0.088
3645,40784,second artist announcement looking good,sadness,0.083


#### Converting to array to list to perform operations

In [11]:
data = dataset['tweet'].values.tolist()

In [12]:
data

['fu k heck moved fridge knock landlord door',
 'indian uber driver called someone n word moving vehicle jumped',
 'asked parcel delivered pick store address',
 'ef whichever butt wipe pulled fire alarm davis bc sound asleep',
 'join put phone talk rude taking money acc willynilly',
 'blood boiling',
 'still got whole season wentworth watch stupid cunt work ruins us',
 'tracking show equipment delivered service suddenly delayed already weeks',
 'legit furious people fucking idiots',
 'suppose work wtf dude thanks pissing',
 'im mad power rangers im incensed im furious',
 'wont use using guys cant get nothing right',
 'bitches aggravate like inspires biggest cunt known man kind',
 'come glasgow night working fucking gutted waiting appearance ages',
 'fuking fuming',
 'zero help customer service pushing buck back forth promising callbacks happen',
 'mention gra guy stops let ppl front go wtf blood boiling',
 'hate lawn mower soul condemn fiery pits hell',
 'people offended kendall ends p

In [18]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

max_words = 5000
max_len = 200
def to_vectors(data):
    
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(data)

    # converts into 2D array
    sequences = tokenizer.texts_to_sequences(data)
    return pad_sequences(sequences, maxlen=max_len)


In [14]:
tweets=to_vectors(data)
tweets

array([[   0,    0,    0, ..., 2059, 2060,  962],
       [   0,    0,    0, ...,  429, 2061, 1168],
       [   0,    0,    0, ...,  430,  829,  963],
       ...,
       [   0,    0,    0, ...,  347, 1082,   63],
       [   0,    0,    0, ..., 3500,  187,   10],
       [   0,    0,    0, ...,  712,    7,   34]])

In [15]:
tweets.shape

(3647, 200)

#### Using LSTM for training

In [19]:
from keras.models import Sequential
from keras import layers


rnn = Sequential()
rnn.add(layers.Embedding(max_words, 20)) # using embedding layer to convert the vectors into tensors
rnn.add(layers.LSTM(15,dropout=0.5))
#rnn.add(layers.LSTM(15))
rnn.add(layers.Dense(1,activation='sigmoid'))


rnn.compile(optimizer='RMSProp',loss='mse', metrics=['mse'])
rnn.fit(tweets, dataset['score'].ravel(), epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1745f795400>

#### Predicting on developement set

In [20]:
anger_dev=pd.read_csv("anger_dev.csv")
anger_dev

Unnamed: 0,id,tweet,emotion,score
0,10857,@ZubairSabirPTI pls dont insult the word 'Molna',anger,0.479
1,10858,@ArcticFantasy I would have almost took offens...,anger,0.458
2,10859,@IllinoisLoyalty that Rutgers game was an abom...,anger,0.562
3,10860,@CozanGaming that's what lisa asked before she...,anger,0.500
4,10861,Sometimes I get mad over something so minuscul...,anger,0.708
...,...,...,...,...
79,10936,@Jen_ny69 People will always get offended ever...,anger,0.562
80,10937,@gayla_weeks1 I try not to let my anger seep i...,anger,0.625
81,10938,I hope my hustle don't offend nobody,anger,0.292
82,10939,"Just watched Django Unchained, Other people ma...",anger,0.229


In [21]:
for i in range(0, anger_dev.shape[0]):
    anger_dev['tweet'][i]=clean_text(anger_dev['tweet'][i])
anger_dev

Unnamed: 0,id,tweet,emotion,score
0,10857,pls dont insult word molna,anger,0.479
1,10858,would almost took offense actually snapped,anger,0.458
2,10859,rutgers game abomination affront god man must ...,anger,0.562
3,10860,lisa asked started raging call heh,anger,0.500
4,10861,sometimes get mad something minuscule try ruin...,anger,0.708
...,...,...,...,...
79,10936,people always get offended everyone situation ...,anger,0.562
80,10937,try let anger seep reviews resent time wasted ...,anger,0.625
81,10938,hope hustle offend nobody,anger,0.292
82,10939,watched django unchained people may frown titt...,anger,0.229


In [22]:
dev = anger_dev['tweet'].values.tolist()

In [23]:
tweet=to_vectors(dev)
tweet

array([[  0,   0,   0, ...,   7, 149, 150],
       [  0,   0,   0, ...,  39,  40, 153],
       [  0,   0,   0, ...,  19,   9, 157],
       ...,
       [  0,   0,   0, ..., 436,  16, 437],
       [  0,   0,   0, ...,  32, 441, 442],
       [  0,   0,   0, ..., 100, 145, 444]])

In [24]:
rnn.evaluate(tweet, anger_dev['score'].ravel())



[0.02353881672024727, 0.02353881672024727]

In [25]:
y_pred=rnn.predict(tweet)

In [26]:
y_pred_out=y_pred.ravel()
y_pred_out

array([0.48647508, 0.63652277, 0.57361966, 0.5775033 , 0.5414392 ,
       0.5414392 , 0.5057754 , 0.5674544 , 0.48366922, 0.48366922,
       0.4777313 , 0.55303586, 0.541138  , 0.3740271 , 0.3740271 ,
       0.52019095, 0.52019095, 0.5085964 , 0.5348288 , 0.4176253 ,
       0.5953248 , 0.59689534, 0.5196039 , 0.5196039 , 0.6479106 ,
       0.3281256 , 0.46466494, 0.44786423, 0.4074757 , 0.6386478 ,
       0.33453524, 0.44475102, 0.58538336, 0.48101154, 0.2676575 ,
       0.5833416 , 0.5833416 , 0.44911736, 0.40173432, 0.5436402 ,
       0.40403384, 0.36344764, 0.49279392, 0.49279392, 0.55915743,
       0.3523501 , 0.6583074 , 0.4882983 , 0.33785796, 0.42259562,
       0.42259562, 0.30356526, 0.35168743, 0.36780006, 0.36780006,
       0.48504406, 0.6432126 , 0.55165124, 0.37005463, 0.45679215,
       0.40559   , 0.4377302 , 0.42999542, 0.42999542, 0.33697698,
       0.47454044, 0.4482162 , 0.4159259 , 0.4159259 , 0.6459558 ,
       0.26712853, 0.26712853, 0.51643676, 0.635412  , 0.49807

In [27]:
score=np.array(anger_dev['score'])

In [28]:
data1=pd.DataFrame({'Actual':score, 'Predicted':y_pred_out})
data1

Unnamed: 0,Actual,Predicted
0,0.479,0.486475
1,0.458,0.636523
2,0.562,0.573620
3,0.500,0.577503
4,0.708,0.541439
...,...,...
79,0.562,0.549163
80,0.625,0.423595
81,0.292,0.365766
82,0.229,0.436410


In [29]:
data1.corr()

Unnamed: 0,Actual,Predicted
Actual,1.0,0.358029
Predicted,0.358029,1.0


#### Prediction on Test Set

In [30]:
anger_test=pd.read_csv("anger_test.csv")
fear_test=pd.read_csv("fear_test.csv")
joy_test=pd.read_csv("joy_test.csv")
sadness_test=pd.read_csv("sadness_test.csv")

In [31]:
anger_test.head()

Unnamed: 0,id,tweet,emotion,score
0,10941,At the point today where if someone says somet...,anger,NONE
1,10942,@CorningFootball IT'S GAME DAY!!!! T MIN...,anger,NONE
2,10943,This game has pissed me off more than any othe...,anger,NONE
3,10944,@spamvicious I've just found out it's Candice ...,anger,NONE
4,10945,@moocowward @mrsajhargreaves @Melly77 @GaryBar...,anger,NONE


In [32]:
joy_test

Unnamed: 0,id,tweet,emotion,score
0,30902,You must be knowing #blithe means (adj.) Happ...,joy,NONE
1,30903,Old saying 'A #smile shared is one gained for ...,joy,NONE
2,30904,Bridget Jones' Baby was bloody hilarious ðŸ˜… ...,joy,NONE
3,30905,@Elaminova sparkling water makes your life spa...,joy,NONE
4,30906,I'm tired of everybody telling me to chill out...,joy,NONE
...,...,...,...,...
709,31611,With a very tired body and mind and sparkling ...,joy,NONE
710,31612,I refuse to be a chirp chirp girl,joy,NONE
711,31613,It was very hard to stifle my laughter after I...,joy,NONE
712,31614,"While I was walking, a little boy in a red shi...",joy,NONE


In [37]:
def change(data):
    for i in range(0, data.shape[0]):
        data['tweet'][i]=clean_text(data['tweet'][i])
    lis = data['tweet'].values.tolist()
    return to_vectors(lis)

#### *Anger-test*

In [38]:
a=change(anger_test)
a

array([[   0,    0,    0, ..., 1088,   49,  259],
       [   0,    0,    0, ...,   50,   26, 1089],
       [   0,    0,    0, ...,   51,   27,  260],
       ...,
       [   0,    0,    0, ..., 2401,  252,   58],
       [   0,    0,    0, ...,  105,   58,  312],
       [   0,    0,    0, ...,   38,  131,  170]])

In [39]:
a.shape

(760, 200)

In [41]:
anger_test['score']=(rnn.predict(a)).ravel()
anger_test

Unnamed: 0,id,tweet,emotion,score
0,10941,point today someone says something remotely ki...,anger,0.514672
1,10942,game day minus,anger,0.466170
2,10943,game pissed game year blood boiling time turn,anger,0.452610
3,10944,found candice candace pout likes,anger,0.410202
4,10945,come mum th k tweets,anger,0.567684
...,...,...,...,...
755,11696,supposed animosity bullshit con iranians,anger,0.413249
756,11697,byu offense score vs wvu,anger,0.437063
757,11698,id love c gyimah action coach holding grudge,anger,0.317028
758,11699,forgiving means operating god spirit amp god u...,anger,0.533166


#### *Fear-test*

In [47]:
f=change(fear_test)
fear_test['score']=(rnn.predict(f)).ravel()
fear_test.head()

Unnamed: 0,id,tweet,emotion,score
0,21257,ncould somebody shoot nit could videos time tu...,fear,0.470751
1,21258,really sucks typing mobile device always horri...,fear,0.626901
2,21259,ones ones actually,fear,0.40736
3,21260,horrible person gag see people quote,fear,0.5899
4,21261,fear usually need tim ferriss,fear,0.392631


#### *Joy-test*

In [48]:
j=change(joy_test)
joy_test['score']=(rnn.predict(j)).ravel()
joy_test.head()

Unnamed: 0,id,tweet,emotion,score
0,30902,must knowing means adj happy cheerful,joy,0.3887
1,30903,old saying shared one gained another day,joy,0.410518
2,30904,bridget jones baby bloody hilarious,joy,0.373866
3,30905,sparkling water makes life sparkly,joy,0.487773
4,30906,tired everybody telling chill everythings ok f...,joy,0.554609


#### *Sadness-test*

In [49]:
s=change(sadness_test)
sadness_test['score']=(rnn.predict(s)).ravel()
sadness_test.head()

Unnamed: 0,id,tweet,emotion,score
0,40860,teens sons left car get haircuts praying storm...,sadness,0.435554
1,40861,teens sons left car get haircuts praying storm...,sadness,0.435554
2,40862,hartramsey suplift still discouraged means lis...,sadness,0.318882
3,40863,nearly dropped phone sink hahahaha,sadness,0.46994
4,40864,whenever feeling sad listen monsta x hug teddy...,sadness,0.443424
