# Exercise 3

In [15]:
import numpy as np 
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras import regularizers
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
%pylab inline
warnings.simplefilter('ignore')

Populating the interactive namespace from numpy and matplotlib


# Neural Network Parameters

In [16]:
max_features = 2000
maxlen = 28
batch_size = 32
embedding_dims = 2000
dropout_a = 0.2
dropout_b = 0.2
filters = 200
kernel_size = 10
hidden_dims = 2000
epochs = 4
random_st=4222
verbose = 2

# Import Data and tokenize

In [17]:
df = pd.read_csv('dataset_sentiment.csv')
df = df[['text','sentiment']]

df = df[df.sentiment != "Neutral"]
df['text'] = df['text'].apply(lambda x: x.lower())
df['text'] = df['text'].apply(lambda x: x.replace('rt',' '))
df['text'] = df['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
    
tok = Tokenizer(num_words=max_features, split=' ')
tok.fit_on_texts(df['text'].values)
X = tok.texts_to_sequences(df['text'].values)
X = pad_sequences(X)



# Build Neural Network Model

In [18]:
nn = Sequential()

nn.add(Embedding(max_features,
                 embedding_dims))
nn.add(Dropout(dropout_a))

# Convolution1D
nn.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
nn.add(GlobalMaxPooling1D())


# hidden layer
nn.add(Dense(hidden_dims))
nn.add(Dropout(dropout_b))
nn.add(Activation('relu'))



# output layer
nn.add(Dense(2))
nn.add(Activation('sigmoid'))

nn.compile(loss='binary_crossentropy',
              optimizer='adam',
metrics=['accuracy'])
print(nn.summary())



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 2000)        4000000   
_________________________________________________________________
dropout_5 (Dropout)          (None, None, 2000)        0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, None, 200)         4000200   
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 200)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 2000)              402000    
_________________________________________________________________
dropout_6 (Dropout)          (None, 2000)              0         
_________________________________________________________________
activation_5 (Activation)    (None, 2000)              0         
__________

# Create Test/Train Split
The performance of the baseline model was very good for the prediction of negative tweets but classified ~50% of the positve tweets as negative. The reason for that was the unbalanced training set in which ~80% of the tweets were negative. To get a balanced training set I tried (a) to take only a subset of the negative examples and (b) to add duplicate positive examples while adding regularization to avoid overfitting. In my case (a) performed better while I suspect that (b) with a more sophisticated/better approach to avoid overfitting would lead to a better classification.

In [19]:
Y = pd.get_dummies(df['sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.30, random_state = random_st)

X_train_neg=[X_train[i] for i in range(0, len(X_train)) if Y_train[i][1]==0]
X_train_pos=[X_train[i] for i in range(0, len(X_train)) if Y_train[i][1]==1]
Y_train_neg=[Y_train[i] for i in range(0, len(Y_train)) if Y_train[i][1]==0]
Y_train_pos=[Y_train[i] for i in range(0, len(Y_train)) if Y_train[i][1]==1]

X_train_neg=X_train_neg[:len(X_train_pos)]
Y_train_neg=Y_train_neg[:len(Y_train_pos)]

X_train=np.concatenate((X_train_pos, X_train_neg), axis=0)
Y_train=np.concatenate((Y_train_pos, Y_train_neg), axis=0)

np.random.seed(seed=random_st)
np.random.shuffle(X_train)

np.random.seed(seed=random_st)
np.random.shuffle(Y_train)



# Train the model

In [20]:
nn.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, verbose=verbose)

Epoch 1/4
 - 83s - loss: 0.5619 - acc: 0.6847
Epoch 2/4
 - 85s - loss: 0.3304 - acc: 0.8668
Epoch 3/4
 - 86s - loss: 0.1749 - acc: 0.9367
Epoch 4/4
 - 104s - loss: 0.1031 - acc: 0.9606


<keras.callbacks.History at 0x7fd378b3b080>

# Evaluate the model

In [21]:
score, accuracy = nn.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (accuracy))

pos_cnt, neg_cnt, pos_ok, neg_ok = 0, 0, 0, 0
for x in range(len(X_test)):
    result = nn.predict(X_test[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
    if np.argmax(result) == np.argmax(Y_test[x]):
        if np.argmax(Y_test[x]) == 0: neg_ok += 1
        else: pos_ok += 1
    if np.argmax(Y_test[x]) == 0: neg_cnt += 1
    else: pos_cnt += 1

print("pos_acc", pos_ok/pos_cnt*100, "%")
print("neg_acc", neg_ok/neg_cnt*100, "%")

X2 = ['Dear Mr. President, you are an asshole!']
X2 = tok.texts_to_sequences(X2)
X2 = pad_sequences(X2, maxlen=28, dtype='int32', value=0)
print(X2)
print(nn.predict(X2, batch_size=1, verbose = 2)[0])

score: 0.98
acc: 0.73
pos_acc 79.48717948717949 %
neg_acc 70.98265895953757 %
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0  318  853   79   11   37   99 1156]]
[9.9992824e-01 7.6610864e-05]
