In [1]:
#System Imports
import os
import io
import json

#Data Processing Imports
import pandas as pd
import numpy as np
import re
import string

#Data Tokenization Imports
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
stopwords=set(stopwords.words('english'))
maxLen = 150

#ML Imports
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D, Dropout, Input
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam

In [2]:
#Load Training Data
pos_train = os.listdir("aclImdb/train/pos/")
neg_train = os.listdir("adaptready/aclImdb/train/neg/")
train_dict = {'positive':pos_train,'negative':neg_train}

#Create a Train DataFrame
df_train = pd.DataFrame(columns=['review_id','review_score','review_text', 'review_sentiment'])

for key, val in train_dict.items():
    for i in val:
        try:
            temp = i.replace('.txt','')
            temp_1,temp_2 = temp.split('_')
            f = open('aclImdb/train/'+key[:3]+'/'+i, 'r').read()
            df_train.loc[len(df_train)] = [str(key[0])+"_"+temp_1,int(temp_2), f, key]
        except:
            pass
        

#Load Testing Data
pos_test = os.listdir("aclImdb/test/pos/")
neg_test = os.listdir("aclImdb/test/neg/")
test_dict = {'pos':pos_test,'neg':neg_test}

#Create a Test DataFrame
df_test = pd.DataFrame(columns=['review_id','review_score','review_text','review_sentiment'])

for key, val in test_dict.items():
    for i in val:
        try:
            temp = i.replace('.txt','')
            temp_1,temp_2 = temp.split('_')
            f = open('aclImdb/test/'+key[:3]+'/'+i, 'r').read()
            df_test.loc[len(df_test)] = [str(key[0])+"_"+temp_1,int(temp_2),f, key]
        except:
            pass

In [3]:
def remove_tags(string):
    result = re.sub('<.*?>','',string)
    return result

def clean_further(string):
    result = string.lower()
    result = re.sub(r"\s{1,}"," ",result)
    return result

def remove_stopwords(df):
    df['review without stopwords'] = df['review_text'].apply(lambda x : ' '.join([word for word in x.split() if word not in (stopwords)]))
    df['clean_review']= df['review without stopwords'].apply(lambda cw : remove_tags(cw))
    df['clean_review'] = df['clean_review'].str.replace('[{}]'.format(string.punctuation), ' ')
    df['clean_review'] = df['clean_review'].apply(clean_further)
    return df



In [4]:
#Clean the Dataset
df_train = remove_stopwords(df_train)

  df['clean_review'] = df['clean_review'].str.replace('[{}]'.format(string.punctuation), ' ')


In [5]:
df_train.head()

Unnamed: 0,review_id,review_score,review_text,review_sentiment,review without stopwords,clean_review
0,p_0,9,Bromwell High is a cartoon comedy. It ran at t...,positive,Bromwell High cartoon comedy. It ran time prog...,bromwell high cartoon comedy it ran time progr...
1,p_10000,8,Homelessness (or Houselessness as George Carli...,positive,Homelessness (or Houselessness George Carlin s...,homelessness or houselessness george carlin st...
2,p_10001,10,Brilliant over-acting by Lesley Ann Warren. Be...,positive,Brilliant over-acting Lesley Ann Warren. Best ...,brilliant over acting lesley ann warren best d...
3,p_10002,7,This is easily the most underrated film inn th...,positive,This easily underrated film inn Brooks cannon....,this easily underrated film inn brooks cannon ...
4,p_10003,8,This is not the typical Mel Brooks film. It wa...,positive,This typical Mel Brooks film. It much less sla...,this typical mel brooks film it much less slap...


In [6]:
#Transform Data into lists for train-test split 
reviews_list = []
for i in range(len(df_train['clean_review'])):
    reviews_list.append(df_train['clean_review'][i])
    
sentiment = df_train['review_sentiment']

In [7]:
#Split the Data into training and testing data
y = np.array(list(map(lambda x: 1 if x=="positive" else 0, sentiment)))

X_train, X_test,Y_train, Y_test = train_test_split(reviews_list, y, test_size=0.2, random_state = 45)

In [8]:
#Create a word tokenizer
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
words_to_index = tokenizer.word_index

In [9]:
#Save the tokenizer
tokenizer_json = tokenizer.to_json()
with io.open('tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))

In [11]:
#Create Word vectors embedding map
def read_glove_vector(glove_vec):
    with open(glove_vec, 'r', encoding='UTF-8') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            w_line = line.split()
            curr_word = w_line[0]
            word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)
    return word_to_vec_map

word_to_vec_map = read_glove_vector('glove.42B.300d.txt')

In [12]:
#Generate an Embedding Layer
vocab_len = len(words_to_index)
embed_vector_len = word_to_vec_map['moon'].shape[0]
emb_matrix = np.zeros((vocab_len, embed_vector_len))

for word, index in words_to_index.items():
    break
    embedding_vector = word_to_vec_map.get(word)
    if embedding_vector is not None:
        emb_matrix[index-1, :] = embedding_vector

embedding_layer = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, 
                            input_length=maxLen, weights = [emb_matrix], trainable=False)

In [13]:
#Design the model
def movie_rating(input_shape):
    X_indices = Input(input_shape)
    embeddings = embedding_layer(X_indices)
    X = LSTM(128, return_sequences=True)(embeddings)
    X = Dropout(0.6)(X)
    X = LSTM(128, return_sequences=True)(X)
    X = Dropout(0.6)(X)
    X = LSTM(128)(X)
    X = Dense(1, activation='sigmoid')(X)
    model = Model(inputs=X_indices, outputs=X)
    return model

In [83]:
model = movie_rating((maxLen,))
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 150)]             0         
                                                                 
 embedding_1 (Embedding)     multiple                  21034800  
                                                                 
 lstm_1 (LSTM)               (None, 150, 128)          219648    
                                                                 
 dropout (Dropout)           (None, 150, 128)          0         
                                                                 
 lstm_2 (LSTM)               (None, 150, 128)          131584    
                                                                 
 dropout_1 (Dropout)         (None, 150, 128)          0         
                                                                 
 lstm_3 (LSTM)               (None, 128)               131584

In [84]:
#Train the model
X_train_indices = tokenizer.texts_to_sequences(X_train)
X_train_indices = pad_sequences(X_train_indices, maxlen=maxLen, padding='post')
adam = Adam(learning_rate = 0.0001)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_indices, Y_train, batch_size=64, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x219f80fd3c0>

In [None]:
#Save the model
model.save("model.h5")

In [None]:
# model = keras.models.load_model("model.h5")

In [86]:
#Prepare the evaluate the test dataset
X_test_indices = tokenizer.texts_to_sequences(X_test)
X_test_indices = pad_sequences(X_test_indices, maxlen=maxLen, padding='post')

model.evaluate(X_test_indices, Y_test)
preds = model.predict(X_test_indices)

df_test = remove_stopwords(df_test)

X_test_indices = tokenizer.texts_to_sequences(list(df_test['clean_review']))

X_test_indices = pad_sequences(X_test_indices, maxlen=maxLen, padding='post')

In [119]:
#Evaluate the test dataset
def add_score_predictions(data, reviews_list_idx):
    maxLen = 150
    data['sentiment score'] = 0

    reviews_list_idx = pad_sequences(reviews_list_idx, maxlen=maxLen, padding='post')

    review_preds = model.predict(reviews_list_idx)

    data['sentiment score'] = review_preds

    pred_sentiment = np.array(list(map(lambda x : 'positive' if x > 0.5 else 'negative',review_preds)))

    data['predicted sentiment'] = 0

    data['predicted sentiment'] = pred_sentiment

    return data

In [121]:
test_df = add_score_predictions(data_test_without_stopwords,X_test_indices)

