In [53]:
import numpy as np
import pandas as pd
import nltk
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SUJATA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [54]:
data = pd.read_csv('./Sentiment.csv')
data.head(5)

Unnamed: 0,id,candidate,candidate_confidence,relevant_yn,relevant_yn_confidence,sentiment,sentiment_confidence,subject_matter,subject_matter_confidence,candidate_gold,...,relevant_yn_gold,retweet_count,sentiment_gold,subject_matter_gold,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,1,No candidate mentioned,1.0,yes,1.0,Neutral,0.6578,None of the above,1.0,,...,,5,,,RT @NancyLeeGrahn: How did everyone feel about...,,2015-08-07 09:54:46 -0700,629697200650592256,,Quito
1,2,Scott Walker,1.0,yes,1.0,Positive,0.6333,None of the above,1.0,,...,,26,,,RT @ScottWalker: Didn't catch the full #GOPdeb...,,2015-08-07 09:54:46 -0700,629697199560069120,,
2,3,No candidate mentioned,1.0,yes,1.0,Neutral,0.6629,None of the above,0.6629,,...,,27,,,RT @TJMShow: No mention of Tamir Rice and the ...,,2015-08-07 09:54:46 -0700,629697199312482304,,
3,4,No candidate mentioned,1.0,yes,1.0,Positive,1.0,None of the above,0.7039,,...,,138,,,RT @RobGeorge: That Carly Fiorina is trending ...,,2015-08-07 09:54:45 -0700,629697197118861312,Texas,Central Time (US & Canada)
4,5,Donald Trump,1.0,yes,1.0,Positive,0.7045,None of the above,1.0,,...,,156,,,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,,2015-08-07 09:54:45 -0700,629697196967903232,,Arizona


In [68]:
data = data[['text','sentiment']]
data.head()

Unnamed: 0,text,sentiment
0,RT @ScottWalker: Didn't catch the full #GOPdeb...,1
1,RT @RobGeorge: That Carly Fiorina is trending ...,1
2,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,1
3,"RT @GregAbbott_TX: @TedCruz: ""On my first day ...",1
4,RT @warriorwoman91: I liked her and was happy ...,0


In [56]:
data = data[data.sentiment != "Neutral"]
data['sentiment']= pd.get_dummies(data['sentiment'], drop_first = True)
data = data.reset_index(drop=True)
data.head()

Unnamed: 0,text,sentiment
0,RT @ScottWalker: Didn't catch the full #GOPdeb...,1
1,RT @RobGeorge: That Carly Fiorina is trending ...,1
2,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,1
3,"RT @GregAbbott_TX: @TedCruz: ""On my first day ...",1
4,RT @warriorwoman91: I liked her and was happy ...,0


In [57]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [58]:
corpus = []

for i in range(0, data.shape[0]):
    tweet = re.sub('[^a-zA-Z]', ' ', data['text'][i])
    tweet = tweet.lower()
    tweet = word_tokenize(tweet)
    
    # Reduce words to their root form
    tweet = [WordNetLemmatizer().lemmatize(w) for w in tweet if not w in set(stopwords.words('english'))]
    
    # Lemmatize verbs by specifying pos
    tweet = [WordNetLemmatizer().lemmatize(w, pos='v') for w in tweet if not w in set(stopwords.words('english'))]
    tweet = ' '.join(tweet)
    corpus.append(tweet)

print(corpus[0:3])

['rt scottwalker catch full gopdebate last night scott best line second walker http co zsff', 'rt robgeorge carly fiorina trend hour debate men complete gopdebate say', 'rt danscavino gopdebate w realdonaldtrump deliver highest rat history presidential debate trump http co']


In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfIdfVectorizer=TfidfVectorizer(use_idf=True)
tfIdf = tfIdfVectorizer.fit_transform(corpus)
df = pd.DataFrame(tfIdf[0].T.todense(), index=tfIdfVectorizer.get_feature_names(), columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)
pd.set_option('display.max_rows', None)
df.head(20)



Unnamed: 0,TF-IDF
zsff,0.453518
catch,0.3446
full,0.330405
second,0.313011
line,0.299403
scottwalker,0.282078
scott,0.274517
best,0.249607
walker,0.23592
last,0.182394


In [60]:
for i in range(0, len(corpus)):
    corpus[i] = re.sub('co','', corpus[i])
    corpus[i] = re.sub('rt','', corpus[i])
    corpus[i] = re.sub('http','', corpus[i])
    

In [61]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=None)
tokenizer.fit_on_texts(corpus)
encoded_docs = tokenizer.texts_to_sequences(corpus)
padded_sequence = pad_sequences(encoded_docs,maxlen=25)

In [62]:
padded_sequence.shape

(10729, 25)

In [63]:
print(tokenizer.word_index['trump'])

4


In [64]:
print(corpus[0])
print(encoded_docs[0])

 sttwalker catch full gopdebate last night stt best line send walker   zsff
[252, 779, 564, 1, 15, 13, 204, 114, 354, 224, 72, 5278]


In [65]:
print(padded_sequence[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0  252
  779  564    1   15   13  204  114  354  224   72 5278]


In [66]:
# Build the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense, Dropout
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.layers import Embedding
from keras.initializers import Constant

vocab_size = len(tokenizer.word_index) + 1
embedding_vector_length = 200


model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_length,
                    input_length=25) )
model.add(SpatialDropout1D(0.2))
model.add(LSTM(3, dropout=0.2, recurrent_dropout=0.2))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 25, 200)           2598000   
                                                                 
 spatial_dropout1d_1 (Spatia  (None, 25, 200)          0         
 lDropout1D)                                                     
                                                                 
 lstm_1 (LSTM)               (None, 3)                 2448      
                                                                 
 dropout_1 (Dropout)         (None, 3)                 0         
                                                                 
 dense_1 (Dense)             (None, 1)                 4         
                                                                 
Total params: 2,600,452
Trainable params: 2,600,452
Non-trainable params: 0
____________________________________________

In [67]:
# converting the targets to numpy array to feed it into the model
target = np.asarray(data['sentiment'])
print(target)

[1 1 1 ... 1 0 1]


In [49]:
MODEL = model.fit(padded_sequence,target,validation_split=0.2, epochs=6, batch_size=256)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [52]:
test_word ='Pune is a good city'
tw = tokenizer.texts_to_sequences([test_word])
tw = pad_sequences(tw,maxlen=25)
sentiment = int(model.predict(tw).round().item())
#print(sentiment)
if sentiment==0:
    print("Negative")
else:
    print("Positive")


Negative
