In [156]:
!pip install nltk



In [1]:
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.datasets import load_files
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import KeyedVectors
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences 
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SimpleRNN
from keras.layers import Dense, Dropout, Activation
from keras.layers import Conv1D, MaxPooling1D
from keras.utils.np_utils import to_categorical
from keras.models import Model
from keras.engine import Input

In [2]:
from google.colab import files
uploaded = files.upload()

Saving sa_data.csv to sa_data (4).csv


In [3]:
#Reading the data
data = pd.read_csv('sa_data.csv')

In [4]:
data.head()

Unnamed: 0,text,sentiment
0,RT @NancyLeeGrahn: How did everyone feel about...,Neutral
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive
2,RT @TJMShow: No mention of Tamir Rice and the ...,Neutral
3,RT @RobGeorge: That Carly Fiorina is trending ...,Positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive


In [5]:
#Preprocessing - removing unwanted characters, tokenization, stop-word removal
def clean_data(tweet):
    x = re.sub("[^a-zA-Z0-9]", " ",tweet) 
    x = x.lower().split()                             
    stops = set(stopwords.words("english"))                  
    words = [w for w in x if not w in stops]    
    return( " ".join(words))

In [6]:
data['text'] = data['text'].apply(lambda x: clean_data(x))
print(data['text'])

0        rt nancyleegrahn everyone feel climate change ...
1        rt scottwalker catch full gopdebate last night...
2        rt tjmshow mention tamir rice gopdebate held c...
3        rt robgeorge carly fiorina trending hours deba...
4        rt danscavino gopdebate w realdonaldtrump deli...
                               ...                        
13866    rt cappy yarbrough love see men never faced pr...
13867    rt georgehenryw thought huckabee exceeded expe...
13868    rt lrihendry tedcruz president always tell tru...
13869    rt jrehling gopdebate donald trump says time p...
13870    rt lrihendry tedcruz headed presidential debat...
Name: text, Length: 13871, dtype: object


In [7]:
for i in range(len(data)):
    data['text'][i] = data['text'][i].replace('rt ','')

print(data['text'])

0        nancyleegrahn everyone feel climate change que...
1        scottwalker catch full gopdebate last night sc...
2        tjmshow mention tamir rice gopdebate held clev...
3        robgeorge carly fiorina trending hours debate ...
4        danscavino gopdebate w realdonaldtrump deliver...
                               ...                        
13866    cappy yarbrough love see men never faced pregn...
13867    georgehenryw thought huckabee exceeded expecta...
13868    lrihendry tedcruz president always tell truth ...
13869    jrehling gopdebate donald trump says time poli...
13870    lrihendry tedcruz headed presidential debates ...
Name: text, Length: 13871, dtype: object


In [8]:
#Converting the dataframe into list
reviews = data['text'].tolist()
sentiment = data['sentiment'].tolist()

corpus = []
for i in range(len(reviews)):
    corpus.append(word_tokenize(reviews[i]))

In [9]:
corpus

[['nancyleegrahn',
  'everyone',
  'feel',
  'climate',
  'change',
  'question',
  'last',
  'night',
  'exactly',
  'gopdebate'],
 ['scottwalker',
  'catch',
  'full',
  'gopdebate',
  'last',
  'night',
  'scott',
  'best',
  'lines',
  '90',
  'seconds',
  'walker16',
  'http',
  'co',
  'zsff'],
 ['tjmshow',
  'mention',
  'tamir',
  'rice',
  'gopdebate',
  'held',
  'cleveland',
  'wow'],
 ['robgeorge',
  'carly',
  'fiorina',
  'trending',
  'hours',
  'debate',
  'men',
  'completed',
  'gopdebate',
  'says'],
 ['danscavino',
  'gopdebate',
  'w',
  'realdonaldtrump',
  'delivered',
  'highest',
  'ratings',
  'history',
  'presidential',
  'debates',
  'trump2016',
  'http',
  'co'],
 ['gregabbott',
  'tx',
  'tedcruz',
  'first',
  'day',
  'rescind',
  'every',
  'illegal',
  'executive',
  'action',
  'taken',
  'barack',
  'obama',
  'gopdebate',
  'foxnews'],
 ['warriorwoman91',
  'liked',
  'happy',
  'heard',
  'going',
  'moderator',
  'anymore',
  'gopdebate',
  'meg

In [10]:
#feature extraction using word2vec
model = Word2Vec(corpus, size=100, window=5, min_count=1)

In [11]:
#vocabulary list
model.wv.vocab

{'nancyleegrahn': <gensim.models.keyedvectors.Vocab at 0x7fcbd2c9c748>,
 'everyone': <gensim.models.keyedvectors.Vocab at 0x7fcbd2c9c7b8>,
 'feel': <gensim.models.keyedvectors.Vocab at 0x7fcb78c90f28>,
 'climate': <gensim.models.keyedvectors.Vocab at 0x7fcb78c90e48>,
 'change': <gensim.models.keyedvectors.Vocab at 0x7fcb78c90d68>,
 'question': <gensim.models.keyedvectors.Vocab at 0x7fcb78c90c50>,
 'last': <gensim.models.keyedvectors.Vocab at 0x7fcb78c90da0>,
 'night': <gensim.models.keyedvectors.Vocab at 0x7fcb78ca9048>,
 'exactly': <gensim.models.keyedvectors.Vocab at 0x7fcb78ca9080>,
 'gopdebate': <gensim.models.keyedvectors.Vocab at 0x7fcb78ca90b8>,
 'scottwalker': <gensim.models.keyedvectors.Vocab at 0x7fcb78ca90f0>,
 'catch': <gensim.models.keyedvectors.Vocab at 0x7fcb78ca9128>,
 'full': <gensim.models.keyedvectors.Vocab at 0x7fcb78ca9160>,
 'scott': <gensim.models.keyedvectors.Vocab at 0x7fcb78ca9198>,
 'best': <gensim.models.keyedvectors.Vocab at 0x7fcb78ca91d0>,
 'lines': <gens

In [12]:
model.wv["level"]

array([-0.00663869,  0.03270048, -0.04573761,  0.00846171, -0.00091493,
        0.03525607, -0.05422912, -0.07259817,  0.05107541,  0.02563571,
        0.0249094 , -0.04198046,  0.06163323,  0.03075265,  0.0076672 ,
        0.04028037,  0.031223  ,  0.08223183, -0.02456796,  0.03904928,
        0.08412605, -0.00568389,  0.01450918, -0.07859356, -0.03413889,
       -0.01645894, -0.04313166,  0.00285511, -0.00964164,  0.005817  ,
        0.01314514, -0.04669484, -0.03674173, -0.03681422, -0.02586983,
        0.01178649,  0.00884517,  0.04247189,  0.03906308,  0.05868449,
       -0.00012229,  0.00185551, -0.00137145, -0.0148484 ,  0.0151591 ,
        0.02467288,  0.05637284,  0.0679157 ,  0.00124927, -0.05265822,
       -0.03457583,  0.05236578,  0.02799514, -0.07372238, -0.01889307,
        0.00101287,  0.03686753, -0.02493149, -0.00147225, -0.01760936,
       -0.019974  ,  0.00358625,  0.0068832 , -0.04210886,  0.0249658 ,
        0.05130089,  0.03025015,  0.00349097, -0.02725302, -0.04

In [13]:
model.wv.most_similar("level")

  if np.issubdtype(vec.dtype, np.int):


[('articulate', 0.9950860738754272),
 ('word', 0.9950450658798218),
 ('race', 0.9949134588241577),
 ('boy', 0.9946513772010803),
 ('decisions', 0.9945707321166992),
 ('floor', 0.9944626092910767),
 ('die', 0.9944047927856445),
 ('body', 0.994365930557251),
 ('looks', 0.9943300485610962),
 ('brain', 0.9943031072616577)]

In [14]:
model.wv.most_similar(positive=['king','woman'], negative= ['man'], topn=5)

  if np.issubdtype(vec.dtype, np.int):


[('fu', 0.979304850101471),
 ('direct', 0.9784883260726929),
 ('would', 0.9766030311584473),
 ('hits', 0.9728028774261475),
 ('answer', 0.972744345664978)]

In [15]:
model.wv.doesnt_match(['woman','man','queen','movie'])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
  if np.issubdtype(vec.dtype, np.int):


'man'

In [12]:
#tokenization
max_features = 1200 #number of the unique words in the corpus
tokenizer = Tokenizer(nb_words=max_features, split = ' ')
tokenizer.fit_on_texts(data['text'].values) 



In [16]:
#Creating the input data
X = np.zeros((len(corpus),100)) #Initializing the X matrix with zeros
for i in range(len(corpus)):
    emb = [model.wv[w] for w in corpus[i]] #Create a list of word embeddings of the words in each sentence
    X[i] = np.mean(emb, axis=0)

In [17]:
le = preprocessing.LabelEncoder()
Y = le.fit_transform(sentiment)

In [18]:
print(X.shape,Y.shape)

(13871, 100) (13871,)


In [20]:
#Splitting the data into train data and test data
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(11096, 100) (11096,)
(2775, 100) (2775,)


In [23]:
#Printing the size of the train data, train label, test data and test label
print("Shape train data = ",np.shape(X_train))
print("Shape of train label = ",np.shape(Y_train))
print("Shape of test data = ",np.shape(X_test))
print("Shape of test label = ",np.shape(Y_test))

Shape train data =  (11096, 100)
Shape of train label =  (11096,)
Shape of test data =  (2775, 100)
Shape of test label =  (2775,)


In [25]:
#Tokenization
max_features = 1200 #number of words to keep. 1200 is the number of unique words in the corpus.
tokenizer = Tokenizer(nb_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'].values)



In [26]:
#unique words and thier count
tokenizer.word_counts

OrderedDict([('nancyleegrahn', 6),
             ('everyone', 109),
             ('feel', 62),
             ('climate', 72),
             ('change', 84),
             ('question', 351),
             ('last', 879),
             ('night', 903),
             ('exactly', 22),
             ('gopdebate', 9048),
             ('scottwalker', 94),
             ('catch', 9),
             ('full', 45),
             ('scott', 101),
             ('best', 185),
             ('lines', 14),
             ('90', 12),
             ('seconds', 16),
             ('walker16', 29),
             ('http', 3018),
             ('co', 3633),
             ('zsff', 1),
             ('tjmshow', 1),
             ('mention', 57),
             ('tamir', 2),
             ('rice', 2),
             ('held', 10),
             ('cleveland', 37),
             ('wow', 55),
             ('robgeorge', 1),
             ('carly', 120),
             ('fiorina', 144),
             ('trending', 2),
             ('hours', 34),
       

In [27]:
#dictionary index of a word
tokenizer.word_index["moderators"]

185

In [28]:
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X, padding = 'post') #Zero padding at the end of the sequence

In [29]:
Y = to_categorical(Y)
print(Y)

[[0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 ...
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]]


In [30]:
#splitting the data as train and test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

In [31]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
import warnings
warnings.filterwarnings('ignore')

In [68]:
embed_dim = 500
hidden_layer = 100
model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
model.add(Dropout(0.2))
model.add(SimpleRNN(hidden_layer))
model.add(Dense(50))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 24, 500)           600000    
_________________________________________________________________
dropout_13 (Dropout)         (None, 24, 500)           0         
_________________________________________________________________
simple_rnn_11 (SimpleRNN)    (None, 100)               60100     
_________________________________________________________________
dense_26 (Dense)             (None, 50)                5050      
_________________________________________________________________
dense_27 (Dense)             (None, 3)                 153       
Total params: 665,303
Trainable params: 665,303
Non-trainable params: 0
_________________________________________________________________
None


In [63]:
model.fit(X_train,Y_train,epochs = 20, batch_size = 32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fcb6c37d6d8>

In [64]:
score = model.evaluate(X_test, Y_test, verbose = 1, batch_size = 32)
print("Accuracy: %.2f" % (score[1]*100))

Accuracy: 63.06


In [65]:
test = data['text'][0]
test = clean_data(test)
test = test.replace('rt ','')
test = [test]
test = tokenizer.texts_to_sequences(test)
test = pad_sequences(test, maxlen=24, padding = 'post')

In [66]:
class_label = model.predict_classes(test)
print(le.inverse_transform(class_label))

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
['Neutral']


**observation:**

i have added more dense layers for increasing the accuracy but after sometime i didnt find any increase in accuracy when compared with the reference code's accuracy.

so. i have added an extra dense layer of 50 neurons and i got an increased accuracy 63.06% which is 1% more than the reference code's accuracy.

CNN LSTM

In [35]:
vocabulary_size = len(tokenizer.word_counts)
vocabulary_size

18553

In [36]:
embed_dim = 500
model = Sequential()
model.add(Embedding(vocabulary_size, embed_dim,input_length = X.shape[1]))
model.add(Dropout(0.2))
model.add(Conv1D(filters=64,kernel_size=5,padding='valid',activation='relu',strides=1))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(50)) #hidden layer neurons = 50
model.add(Dense(vocabulary_size, activation='softmax'))
model.add(Dense(3))
model.add(Activation('softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 24, 500)           9276500   
_________________________________________________________________
dropout_1 (Dropout)          (None, 24, 500)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 20, 64)            160064    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 5, 64)             0         
_________________________________________________________________
lstm (LSTM)                  (None, 50)                23000     
_________________________________________________________________
dense_1 (Dense)              (None, 18553)             946203    
_________________________________________________________________
dense_2 (Dense)              (None, 3)                

In [40]:
model.fit(X_train,Y_train,epochs = 20, batch_size = 32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fcb6868c080>

In [41]:
score = model.evaluate(X_train, Y_train, verbose = 1, batch_size = 32)
print("Accuracy: %.2f" % (score[1]*100))

Accuracy: 88.10


In [42]:
score = model.evaluate(X_test, Y_test, verbose = 1, batch_size = 32)
print("Accuracy: %.2f" % (score[1]*100))

Accuracy: 65.37


In [69]:
test = data['text'][0]
test = clean_data(test)
test = test.replace('rt ','')
test = [test]
test = tokenizer.texts_to_sequences(test)
test = pad_sequences(test, maxlen=24, padding = 'post')

In [70]:
class_label = model.predict_classes(test)
print(le.inverse_transform(class_label))

['Negative']


**observation:**

here i have taken the length of the token_words and added it as an layer "vocabulary size" and used "vocabulary_size" in place of max_features, which i took example from the TRAC2020 D.L model 

and i got an accuracy of 65.37% which is 5% more than the refernce code's accuracy.

for more accuracy i experimented with the vocablury size using a reference.

i even tried adding more dense layers but it declined the accuracy to less than 60%.
