In [1]:
"""Dataset reference: 
Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank, Socher et al., 
Conference on Empirical Methods in Natural Language Processing (EMNLP, 2013)."""

import pandas as pd
import numpy as np
from scipy import sparse
import re
import os
import sys

import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

# import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Input, GlobalMaxPooling1D
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.preprocessing import sequence, text
from tensorflow.keras.initializers import Constant
from tensorflow.keras.callbacks import EarlyStopping
from keras.utils import np_utils

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Using TensorFlow backend.


In [91]:
# preprocess given documents
def preprocess_data(data):
    """Normalise a document into lower-case, letters-only, single-spaced text.

    The input is stringified and lower-cased. A literal backslash followed by
    ``n`` becomes a space, colons are deleted outright (no space is inserted),
    and every other non-letter character is turned into a space. Runs of
    whitespace are then collapsed to a single space.

    Args:
        data: Any object; it is converted with ``str()`` before cleaning.

    Returns:
        The cleaned string (empty if no letters remain).
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r"\\n", " "),
        (r":", ""),
        (r"\.", " "),
        (r"\,", " "),
        (r"\!", " "),
        (r"\;", " "),
        (r"\/", " "),
        (r"[^A-Za-z]", " "),
    )

    cleaned = str(data).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # split on any whitespace and re-join so words are separated by one space
    words = cleaned.split()
    return str(" ".join(words).rstrip())

In [82]:
# textfile path
dir_path = 'a4-data/'


def _read_lines(filename):
    """Read `filename` from `dir_path` as UTF-8 and return its content split on newlines."""
    with open(os.path.join(dir_path, filename), 'r', encoding='utf-8') as f:
        return f.read().split('\n')


def _first_two_fields(lines, sep):
    """Return a (field0, field1) tuple for every line containing `sep`; other lines are skipped."""
    return [tuple(line.split(sep)[:2]) for line in lines if sep in line]


# Every matching row is kept here, including the first row of each file; the
# frames are trimmed in the cells that follow.

# sentence_id <TAB> sentence
# (the original comprehension iterated an undefined name instead of sentData)
sentData = _read_lines("datasetSentences.txt")
doc_list = _first_two_fields(sentData, '\t')
all_data_df = pd.DataFrame(data=np.array(doc_list), columns=['sentence_id', 'sentence'])

# sentence_id , splitset_label
splitData = _read_lines("datasetSplit.txt")
splitset_labels = _first_two_fields(splitData, ',')
split_labels_df = pd.DataFrame(data=np.array(splitset_labels), columns=['sentence_id', 'splitset_label'])

# phrase_id | sentiment_value
labelsData = _read_lines("sentiment_labels.txt")
sentiment_values = _first_two_fields(labelsData, '|')
sentiment_labels_df = pd.DataFrame(data=np.array(sentiment_values), columns=['phrase_id', 'sentiment_value'])

# phrase | phrase_id
dictData = _read_lines("dictionary.txt")
phrase_list = _first_two_fields(dictData, '|')
phrase_list_df = pd.DataFrame(data=np.array(phrase_list), columns=['phrase', 'phrase_id'])

In [83]:
# Drop the first row of the sentences frame (presumably the file's header line - confirm), then preview.
# NOTE(review): the frame is reassigned from itself, so every re-run of this cell drops one more row.
all_data_df = all_data_df.iloc[1:]
all_data_df.head()

Unnamed: 0,sentence_id,sentence
1,1,The Rock is destined to be the 21st Century 's...
2,2,The gorgeously elaborate continuation of `` Th...
3,3,Effective but too-tepid biopic
4,4,If you sometimes like to go to the movies to h...
5,5,"Emerges as something rare , an issue movie tha..."


In [84]:
# Drop the first row of the split-label frame (presumably the file's header line - confirm), then preview.
# NOTE(review): the frame is reassigned from itself, so every re-run of this cell drops one more row.
split_labels_df = split_labels_df.iloc[1:]
split_labels_df.head()

Unnamed: 0,sentence_id,splitset_label
1,1,1
2,2,1
3,3,2
4,4,2
5,5,2


In [85]:
# Drop the first row of the sentiment-label frame (presumably the file's header line - confirm), then preview.
# NOTE(review): the frame is reassigned from itself, so every re-run of this cell drops one more row.
sentiment_labels_df = sentiment_labels_df.iloc[1:]
sentiment_labels_df.head()

Unnamed: 0,phrase_id,sentiment_value
1,0,0.5
2,1,0.5
3,2,0.44444
4,3,0.5
5,4,0.42708


In [86]:
# Preview the phrase -> phrase_id frame (unlike the other three frames, no first row is dropped here).
phrase_list_df.head()

Unnamed: 0,phrase,phrase_id
0,!,0
1,! ',22935
2,! '',18235
3,! Alas,179257
4,! Brilliant,22936


In [94]:
# Pair every sentiment value with its phrase text by matching on phrase_id.
full_phrase_df = pd.merge(sentiment_labels_df, phrase_list_df, on='phrase_id')
full_phrase_df.head()

Unnamed: 0,phrase_id,sentiment_value,phrase
0,0,0.5,!
1,1,0.5,'
2,2,0.44444,' (
3,3,0.5,' ( the cockettes
4,4,0.42708,' ( the cockettes )


In [95]:
# Attach the train/test/dev split label to every sentence by matching on sentence_id.
full_sentence_splitlabel_df = pd.merge(all_data_df, split_labels_df, on='sentence_id')
full_sentence_splitlabel_df.head()

Unnamed: 0,sentence_id,sentence,splitset_label
0,1,The Rock is destined to be the 21st Century 's...,1
1,2,The gorgeously elaborate continuation of `` Th...,1
2,3,Effective but too-tepid biopic,2
3,4,If you sometimes like to go to the movies to h...,2
4,5,"Emerges as something rare , an issue movie tha...",2


In [96]:
# Clean every sentence with preprocess_data (lower-case, letters only, single spaces).
full_sentence_splitlabel_df['sentence'] = full_sentence_splitlabel_df['sentence'].apply(preprocess_data)

In [97]:
# Preview the sentences after preprocessing.
full_sentence_splitlabel_df.head()

Unnamed: 0,sentence_id,sentence,splitset_label
0,1,the rock is destined to be the st century s ne...,1
1,2,the gorgeously elaborate continuation of the l...,1
2,3,effective but too tepid biopic,2
3,4,if you sometimes like to go to the movies to h...,2
4,5,emerges as something rare an issue movie that ...,2


In [109]:
# List the distinct split labels; they were read from text, so they are strings rather than ints.
full_sentence_splitlabel_df['splitset_label'].unique()

array(['1', '2', '3'], dtype=object)

In [98]:
# Clean every phrase the same way as the sentences so the two can be matched on text later.
full_phrase_df['phrase'] = full_phrase_df['phrase'].apply(preprocess_data)
full_phrase_df.head()

Unnamed: 0,phrase_id,sentiment_value,phrase
0,0,0.5,
1,1,0.5,
2,2,0.44444,
3,3,0.5,the cockettes
4,4,0.42708,the cockettes


In [105]:
# Report the size, keep the first row for each distinct cleaned phrase and
# discard rows with missing values, then report the size again.
print(full_phrase_df.shape)

full_phrase_df = full_phrase_df.drop_duplicates(subset='phrase').dropna()

print(full_phrase_df.shape)
full_phrase_df.head()

(197875, 3)
(197875, 3)


Unnamed: 0,phrase_id,sentiment_value,phrase
0,0,0.5,
3,3,0.5,the cockettes
5,5,0.375,the cockettes provides a window into a subcult...
8,8,0.33333,a nightmare on elm street
10,10,0.47222,a nightmare on elm street or


In [106]:
# Remove the phrase whose text became empty after preprocessing (punctuation-only phrases).
# Filtering on the text itself, rather than dropping the first row by position, does not depend
# on row order and is safe to re-run: a second execution removes nothing.
full_phrase_df = full_phrase_df[full_phrase_df['phrase'] != '']
full_phrase_df.head()

Unnamed: 0,phrase_id,sentiment_value,phrase
3,3,0.5,the cockettes
5,5,0.375,the cockettes provides a window into a subcult...
8,8,0.33333,a nightmare on elm street
10,10,0.47222,a nightmare on elm street or
12,12,0.33333,a nightmare on elm street or the hills


In [124]:
# Look each phrase up among the preprocessed sentences: index the sentence frame by its text and
# join it onto the phrase frame (DataFrame.join keeps every left row by default), so
# sentence_id / splitset_label stay empty for phrases that are not a complete sentence.
# NOTE(review): if two sentences preprocess to the same text, the join would presumably emit one
# row per match - verify that 'sentence' is unique after preprocessing.
t_split = full_sentence_splitlabel_df.set_index('sentence')
final_df = full_phrase_df.join(t_split, on='phrase')
final_df.head()

Unnamed: 0,phrase_id,sentiment_value,phrase,sentence_id,splitset_label
3,3,0.5,the cockettes,,
5,5,0.375,the cockettes provides a window into a subcult...,,
8,8,0.33333,a nightmare on elm street,,
10,10,0.47222,a nightmare on elm street or,,
12,12,0.33333,a nightmare on elm street or the hills,,


In [230]:
# retain only the phrases made up of at least three whitespace-separated words
final_df_1 = final_df.loc[final_df['phrase'].map(lambda phrase: len(phrase.split()) >= 3)]

In [231]:
# Partition the rows by split label (stored as strings): '1' -> train, '2' -> test, '3' -> dev.
# Rows without a split label (phrases that matched no sentence) fall into none of the three sets.
train = final_df_1[final_df_1.splitset_label == '1']
test = final_df_1[final_df_1.splitset_label == '2']
dev = final_df_1[final_df_1.splitset_label == '3']

# sentiment_value was parsed out of a text file with str.split and never converted,
# so cast it to float here: the regression targets must be numeric.
y_train = train['sentiment_value'].astype('float32')
y_test = test['sentiment_value'].astype('float32')
y_dev = dev['sentiment_value'].astype('float32')
print(train.shape, test.shape, dev.shape)

(8055, 5) (2115, 5) (1041, 5)


In [232]:
# Preview the training rows (phrases that matched a sentence with split label '1').
train.head()

Unnamed: 0,phrase_id,sentiment_value,phrase,sentence_id,splitset_label
102,102,0.69444,blue crush swims away with the sleeper movie o...,3103,1
123,123,0.83333,a brilliant absurd collection of vignettes tha...,5481,1
135,135,0.44444,a clash between the artificial structure of th...,10820,1
147,147,0.77778,a decent enough nail biter that stands a good ...,2565,1
157,157,0.19444,a dreary indulgence,11817,1


In [233]:
# Collect every whitespace-separated token of the training phrases, then its distinct set.
all_tokens = [token for phrase in train['phrase'] for token in phrase.split()]
unique_tokens = set(all_tokens)

print("All tokens size: ", len(all_tokens))
print("Unique tokens size: ", len(unique_tokens))

All tokens size:  140078
Unique tokens size:  14717


In [234]:
# Hyperparameters
VOCAB_SIZE = len(unique_tokens)
EMBEDDING_DIMS = 100
MAX_SEQUENCE_LENGTH = 300
FILTERS = 128 
KERNEL_SIZE = 3 
BATCH_SIZE = 128
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

In [235]:
#zero padding
tokenizer = text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train['phrase'])

train_tokenizer = tokenizer.texts_to_matrix(train['phrase'])
test_tokenizer = tokenizer.texts_to_matrix(test['phrase'])
dev_tokenizer = tokenizer.texts_to_matrix(dev['phrase'])

train_encoded = sequence.pad_sequences(train_tokenizer, maxlen=MAX_SEQUENCE_LENGTH)
test_encoded = sequence.pad_sequences(test_tokenizer, maxlen=MAX_SEQUENCE_LENGTH)
dev_encoded = sequence.pad_sequences(dev_tokenizer, maxlen=MAX_SEQUENCE_LENGTH)

In [246]:
# Sanity check: expect (number of training phrases, MAX_SEQUENCE_LENGTH).
train_encoded.shape

(8055, 300)

In [236]:
# using pretrained glove
# NOTE: To run this cell you need to download glove.6B.100d.txt file from http://nlp.stanford.edu/data/glove.6B.zip
# Row i of the matrix holds the vector of the word with tokenizer index i; rows stay
# all-zero for index 0 and for words that have no GloVe vector.
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIMS))

# word -> vector lookup built from the GloVe text file (one "word v1 v2 ... vN" entry per line)
embeddings_index = dict()
with open('word_embeddings/glove.6B/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # tolerate blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

for word, index in tokenizer.word_index.items():
    if index > VOCAB_SIZE - 1:
        # index falls outside the matrix; skip it without relying on the dict's ordering
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [237]:
# Sanity check: expect (VOCAB_SIZE, EMBEDDING_DIMS).
embedding_matrix.shape

(14717, 100)

In [238]:
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(VOCAB_SIZE,
                            EMBEDDING_DIMS,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

# NOTE: this message is printed while the graph is being defined; no training happens in this cell.
print('Training model...')

# train a 1D convnet with global maxpooling
# Layout: frozen embeddings -> dropout -> three Conv1D layers (the first two each followed by
# max pooling, the third by global max pooling) -> dropout -> dense layer -> single output unit.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Dropout(0.5)(embedded_sequences)
x = Conv1D(FILTERS, KERNEL_SIZE, activation='relu')(x)
x = MaxPooling1D()(x)
x = Conv1D(FILTERS, KERNEL_SIZE, activation='relu')(x)
x = MaxPooling1D()(x)
x = Conv1D(FILTERS, KERNEL_SIZE, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dropout(0.25)(x)
x = Dense(FILTERS, activation='relu')(x)
# NOTE(review): a ReLU output unit emits exactly 0 (with zero gradient) whenever its input is
# negative; if the targets are confined to [0, 1], a sigmoid output may be a better fit - confirm.
final_layer_output = Dense(1, activation='relu')(x)

Training model...


In [239]:
# Use the backend that ships with tensorflow.keras, the same package the model's
# layers come from, instead of the standalone `keras` package.
from tensorflow.keras import backend as K

def root_mean_squared_error(y_true, y_pred):
    """Keras loss: root of the mean squared difference between predictions and targets.

    Args:
        y_true: Tensor of target values.
        y_pred: Tensor of model outputs.

    Returns:
        A scalar tensor; the mean is taken over every element of the tensors passed in.
    """
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

In [240]:
# Assemble the functional model from the input/output tensors and compile it with the
# custom RMSE loss and the RMSprop optimizer (default settings), then print the layer summary.
model = Model(sequence_input, final_layer_output)
model.compile(loss=root_mean_squared_error,
              optimizer='rmsprop')
model.summary()

Model: "model_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_14 (InputLayer)        [(None, 300)]             0         
_________________________________________________________________
embedding_13 (Embedding)     (None, 300, 100)          1471700   
_________________________________________________________________
dropout_12 (Dropout)         (None, 300, 100)          0         
_________________________________________________________________
conv1d_18 (Conv1D)           (None, 298, 128)          38528     
_________________________________________________________________
max_pooling1d_12 (MaxPooling (None, 149, 128)          0         
_________________________________________________________________
conv1d_19 (Conv1D)           (None, 147, 128)          49280     
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, 73, 128)           0  

In [241]:
# Upper bound on the number of epochs; the early-stopping callback `es` may end training sooner.
EPOCHS = 100

# Train on the training split and validate on the dev split after every epoch.
history = model.fit(train_encoded, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, 
                    validation_data=(dev_encoded, y_dev), callbacks=[es])

Train on 8055 samples, validate on 1041 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 00014: early stopping


In [245]:
# Evaluate on the held-out test split; with no extra metrics compiled, evaluate() returns the loss value.
# NOTE(review): the loss is computed per batch and then averaged by Keras, so this figure may
# differ slightly from an RMSE computed over the whole test set at once - confirm if it matters.
final_rmse = model.evaluate(test_encoded, y_test)
print("\nRMSE for sentiment values prediction: ", final_rmse)


RMSE for sentiment values prediction:  0.254613383244679
