 # TP1. Fully Connected Networks
 
 #### Sciences U, 2019-2020

## Part 3. Classification on Text Data(Sentiment Analysis)

In [1]:
import numpy as np
import pandas as pd 

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

Using TensorFlow backend.


**Question 1: Load the Sentiment.csv file, only keep columns `text` and `sentiment` and print out first 10 rows.**

In [2]:
file_path = '/home/matt/EII/Archis des applications (UC8-A.2)/w1-a/data/Sentiment.csv'
sentiments = pd.read_csv(file_path, usecols=['text', 'sentiment'])
sentiments.head(n=10)

Unnamed: 0,sentiment,text
0,Neutral,RT @NancyLeeGrahn: How did everyone feel about...
1,Positive,RT @ScottWalker: Didn't catch the full #GOPdeb...
2,Neutral,RT @TJMShow: No mention of Tamir Rice and the ...
3,Positive,RT @RobGeorge: That Carly Fiorina is trending ...
4,Positive,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...
5,Positive,"RT @GregAbbott_TX: @TedCruz: ""On my first day ..."
6,Negative,RT @warriorwoman91: I liked her and was happy ...
7,Neutral,Going on #MSNBC Live with @ThomasARoberts arou...
8,Negative,Deer in the headlights RT @lizzwinstead: Ben C...
9,Negative,RT @NancyOsborne180: Last night's debate prove...


**Question 2. Remove all rows with label Neutral**

In [3]:
sentiments = sentiments[sentiments.sentiment != 'Neutral']
sentiments.head(n=10)

Unnamed: 0,sentiment,text
1,Positive,RT @ScottWalker: Didn't catch the full #GOPdeb...
3,Positive,RT @RobGeorge: That Carly Fiorina is trending ...
4,Positive,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...
5,Positive,"RT @GregAbbott_TX: @TedCruz: ""On my first day ..."
6,Negative,RT @warriorwoman91: I liked her and was happy ...
8,Negative,Deer in the headlights RT @lizzwinstead: Ben C...
9,Negative,RT @NancyOsborne180: Last night's debate prove...
10,Negative,@JGreenDC @realDonaldTrump In all fairness #Bi...
11,Positive,RT @WayneDupreeShow: Just woke up to tweet thi...
12,Negative,Me reading my family's comments about how grea...


**Question 3. Print the number of Positive and Negative rows**

In [4]:
print(sentiments[sentiments.sentiment == 'Positive'].shape)
print(sentiments[sentiments.sentiment == 'Negative'].shape)

(2236, 2)
(8493, 2)


We see that the number of Negative rows is higher than the number of Positive rows. Today, we only forcus on balanced data, and so we would like to make the two equal.

**Question 4. Remove some Negative rows so that #Positive and Negative rows are equal**

In [5]:
# Number of values to delete
pos_rows_count, _ = sentiments[sentiments.sentiment == 'Positive'].shape
neg_rows_count, _ = sentiments[sentiments.sentiment == 'Negative'].shape
remove_n = neg_rows_count - pos_rows_count

# Index to drop
indexes = sentiments[sentiments.sentiment == 'Negative'].index
drop_indices = np.random.choice(indexes.values, remove_n, replace=False)

# Drop random Negative sentiment
sentiments = sentiments.drop(drop_indices)

# Check
print(sentiments[sentiments.sentiment == 'Positive'].shape)
print(sentiments[sentiments.sentiment == 'Negative'].shape)

(2236, 2)
(2236, 2)


In [6]:
data = sentiments

We convert all data into lower case and remove all special characters

In [7]:
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
for idx,row in data.iterrows():
    row[0] = row[0].replace('rt',' ')

Here are two examples of the texts

In [8]:
print(data.iloc[1].text)
print(data.iloc[2].text)

rt robgeorge that carly fiorina is trending  hours after her debate  above any of the men in justcompleted gopdebate says shes on 
rt danscavino gopdebate w realdonaldtrump delivered the highest ratings in the history of presidential debates trump2016 httptco


We would like to map each sentence to an array of tokens, each word is a token. To make the array having fixed length, we pad enough 0 to the begining of each array

In [9]:
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

In [10]:
X.shape

(4472, 28)

So the length of each array is 29. Here is the array corresponding to the two sentences above

In [11]:
print(X[0])
print(X[1])

[   0    0    0    0    0    0    0    0    0    0    3  346  134    2
  751    1   29   38  310   49  183    7  115 1087   14 1088 1766  802]
[   0    0    0    0    0    0    0    0    0    3   20  182  172    9
  625  100  223   25 1460  162    7    2  249   14    1  197  566   17]


**Question 5. Make label data corresponding to X**

In [12]:
# Replace sentiment by integer
sentiments['sentiment'].replace(['Positive','Negative'],[1,0],inplace=True)

In [13]:
y = sentiments['sentiment']
yc = to_categorical(y)
print(yc.shape)

(4472, 2)


**Question 6. Split train/test sets randomly with ratio 2:1**

In [14]:
x_train, x_test, y_train, y_test = train_test_split(X, yc, test_size=0.33, random_state=42)

print('train data:\t', x_train.shape, y_train.shape)
print('test data:\t', x_test.shape, y_test.shape)
# Expect 
# (2996, 28) (2996, 2)
# (1476, 28) (1476, 2)

train data:	 (2996, 28) (2996, 2)
test data:	 (1476, 28) (1476, 2)


**Question 7. Build a quick Fully Connected network to obtain 55\% accuracy on test data**

In [15]:
model = Sequential()
model.add(Dense(64, input_dim=28, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_train, y_train,
          epochs=50,
          batch_size=64)

_, train_acc = model.evaluate(x_train, y_train, batch_size=64)
_, test_acc = model.evaluate(x_test, y_test, batch_size=64)
print('train_acc', train_acc)
print('test_acc', test_acc)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
train_acc 0.9105473756790161
test_acc 0.6111111044883728


**Question 8. Improve the architecture to achieve 65\% accuracy**

In [16]:
# Normalize
x_train = (x_train - np.mean(x_train))/np.std(x_train)
x_test = (x_test - np.mean(x_test))/np.std(x_test)

In [18]:
model = Sequential()
model.add(Dense(64, input_dim=28, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_train, y_train,
          epochs=50,
          batch_size=64)

_, train_acc = model.evaluate(x_train, y_train, batch_size=64)
_, test_acc = model.evaluate(x_test, y_test, batch_size=64)
print('train_acc', train_acc)
print('test_acc', test_acc)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
train_acc 0.9469292163848877
test_acc 0.6219512224197388
