In [20]:
import numpy as np
import pandas as pd
import string
import collections
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [21]:
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, Embedding
import warnings
warnings.filterwarnings("ignore")

In [22]:
class Dataset():
    """Load labelled reviews from CSV and expose padded integer sequences split into train/test.

    Construction reads both files, cleans every review (punctuation replaced by
    spaces, lower-casing, whitespace tokenisation, stop-word removal, dropping
    tokens of length <= 2), gives every distinct token an integer id starting
    at 1, pads all sequences with 0 to the longest review and performs an
    80/20 train/test split with a fixed random state.

    Attributes left on the instance after construction:
        stopwords: DataFrame read from ``stopword_path``; column 0 holds the words.
        max_sentence: token count of the longest cleaned review.
        list_words: distinct tokens in order of first appearance.
        len_words: ``len(list_words) + 1`` (the extra slot is the padding id 0).
        pad: padded id sequences, one row per review.
        X_train, X_test, y_train, y_test: split of ``pad`` and the 0/1 labels.
    """

    # Translation table that turns every ASCII punctuation character into a space.
    _PUNCTUATION_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    def __init__(self, dataset_path, stopword_path):
        """Read the data and run the whole preprocessing pipeline.

        Args:
            dataset_path: CSV with a ``review`` text column and a ``sentiment``
                column containing ``'positive'`` / ``'negative'``.
            stopword_path: headerless file with one stop word per row.
        """
        self.dataframe = pd.read_csv(dataset_path, encoding="Latin-1")
        self.stopwords = pd.read_csv(stopword_path, header = None)

        # Labels other than 'positive'/'negative' become NaN (pandas map semantics).
        self.dataframe['sentiment'] = self.dataframe['sentiment'].map({'positive': 1, 'negative': 0})

        # One pass per review instead of five separate apply() sweeps.
        self.dataframe['clean_msg'] = self.dataframe['review'].apply(self._clean_review)

        self.dataframe['lens'] = self.dataframe['clean_msg'].apply(len)
        self.max_sentence = self.dataframe['lens'].max()

        # create dictionary of words
        self.list_words = self.create_dictionary(self.dataframe['clean_msg'].to_list())
        self.len_words = len(self.list_words)

        # indexing (convert strings to integers)
        # factorize() numbers distinct tokens from 0 in order of first appearance and
        # gives -1 to the NaN that explode() emits for an empty review; shifting by
        # one maps real tokens to 1..V and empty reviews to the padding id 0.
        tokens = self.dataframe['clean_msg'].explode()
        token_ids = pd.Series(tokens.factorize()[0] + 1, index=tokens.index)
        self.dataframe['indexs'] = token_ids.groupby(level=0).agg(list)

        # pre padding with zero(0)
        self.pad = pad_sequences(self.dataframe['indexs'].to_list(), maxlen=self.max_sentence)
        # Reserve one more id so that 0 (padding) fits inside the vocabulary size.
        self.len_words += 1

        # split data test & train
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.pad, self.dataframe['sentiment'], test_size=0.20, random_state=42)

        # remove self.dataframe to avoid storing data too much
        del self.dataframe

    def _clean_review(self, text):
        """Run the full cleaning chain on one raw review and return its token list."""
        tokens = self.tokenization(self.remove_punctuation(text).lower())
        return self.remove_short_word(self.remove_stopwords(tokens))

    def _stopword_lookup(self):
        """Return the stop words as a set, rebuilt only when ``self.stopwords`` is replaced."""
        cached = getattr(self, '_stopword_cache', None)
        # Keyed on object identity so reassigning self.stopwords invalidates the cache.
        if cached is None or cached[0] is not self.stopwords:
            cached = (self.stopwords, set(self.stopwords[0].to_list()))
            self._stopword_cache = cached
        return cached[1]

    def remove_punctuation(self, text):
        """Return ``text`` with every ``string.punctuation`` character replaced by a space."""
        return text.translate(self._PUNCTUATION_TO_SPACE)

    def tokenization(self, text):
        """Split ``str(text)`` on runs of whitespace and return the list of tokens."""
        return str(text).split()

    def remove_short_word(self, text):
        """Return the tokens of ``text`` that are longer than two characters."""
        return [token for token in text if len(token) > 2]

    def remove_stopwords(self, text):
        """Return the tokens of ``text`` that do not appear in column 0 of ``self.stopwords``."""
        stop_words = self._stopword_lookup()
        return [token for token in text if token not in stop_words]

    def create_dictionary(self, sentences):
        """Return the distinct words of ``sentences`` in order of first appearance.

        Args:
            sentences: sequence of token lists.
        """
        # dict preserves insertion order, so this de-duplicates while keeping order.
        return list(dict.fromkeys(word for sentence in sentences for word in sentence))

        

In [23]:
# Read the reviews and the stop-word list, then run the preprocessing done in Dataset.__init__.
data = Dataset(
    dataset_path='../input/imdb-da/IMDB_Dataset.csv',
    stopword_path='../input/imdb-da/stopwords.txt',
)

In [24]:
# Shapes of the padded train and test matrices, shown as the cell output.
(data.X_train.shape, data.X_test.shape)

((4000, 897), (1000, 897))

In [25]:
# print(sorted(data.stopwords[0].to_list()))

In [26]:
class Many_to_one_Elman_Net():
    """Many-to-one Elman network: Embedding -> SimpleRNN -> Dense, built with Keras.

    The instance keeps references to the train/test arrays of ``dataset`` and,
    after ``train`` has run, the fitted Keras model in ``self.model``.
    """

    def __init__(self, dataset, max_vocab, max_len):
        """Store the data splits and the input dimensions.

        Args:
            dataset: object exposing ``X_train``, ``X_test``, ``y_train``, ``y_test``.
            max_vocab: size of the embedding vocabulary (``input_dim``).
            max_len: length of every input sequence (``input_length``).
        """
        self.max_vocab = max_vocab
        self.max_len = max_len

        self.y_train = dataset.y_train
        self.y_test = dataset.y_test
        self.trainX = dataset.X_train
        self.testX = dataset.X_test

    def RNN_model(self, hidden_neurons, dense_neurons, hidden_act, dense_act,
                  loss_func, optimizer_func):
        """Build and compile the network.

        Args:
            hidden_neurons: embedding width and number of SimpleRNN units.
            dense_neurons: number of output units.
            hidden_act: activation of the recurrent layer.
            dense_act: activation of the output layer.
            loss_func: loss passed to ``compile``.
            optimizer_func: optimizer passed to ``compile``.

        Returns:
            The compiled ``Sequential`` model (tracks the ``'acc'`` metric).
        """
        model = Sequential()
        # The embedding width is tied to the recurrent layer size.
        model.add(Embedding(input_dim=self.max_vocab, output_dim=hidden_neurons,
                            input_length=self.max_len))
        model.add(SimpleRNN(units=hidden_neurons, activation=hidden_act))
        model.add(Dense(units=dense_neurons, activation=dense_act))
        model.compile(loss=loss_func, optimizer=optimizer_func, metrics=['acc'])
        return model

    def train(self, epochs, batch_size, verbose,
              hidden_neurons=64, dense_neurons=1,
              hidden_act='sigmoid', dense_act='sigmoid',
              loss_func='mean_squared_error', optimizer_func='rmsprop'):
        """Build a fresh model, fit it on the training split and return it.

        The architecture and optimisation settings used to be hard-coded here;
        they are now keyword arguments whose defaults reproduce the previous
        behaviour, so e.g. ``loss_func='binary_crossentropy'`` or
        ``optimizer_func='adam'`` can be tried without editing the class.

        Args:
            epochs: number of training epochs.
            batch_size: mini-batch size.
            verbose: Keras verbosity flag forwarded to ``fit``.
            hidden_neurons, dense_neurons, hidden_act, dense_act, loss_func,
            optimizer_func: forwarded unchanged to ``RNN_model``.

        Returns:
            The fitted model, also stored in ``self.model``.
        """
        self.model = self.RNN_model(hidden_neurons=hidden_neurons, dense_neurons=dense_neurons,
                                    hidden_act=hidden_act, dense_act=dense_act,
                                    loss_func=loss_func,
                                    optimizer_func=optimizer_func)

        self.model.fit(self.trainX, self.y_train, epochs=epochs, batch_size=batch_size, verbose=verbose)
        return self.model

    def predict(self):
        """Return ``(train_predictions, test_predictions)`` from the trained model.

        ``train`` must have been called first so that ``self.model`` exists.
        """
        train_predict = self.model.predict(self.trainX)
        test_predict = self.model.predict(self.testX)
        return train_predict, test_predict

    def evaluate(self):
        """Return ``(train_accuracy, test_accuracy)`` from the trained model.

        Each ``model.evaluate`` result is indexed at 1, which is the single
        metric (``'acc'``) registered in ``RNN_model``; index 0 is the loss.
        ``train`` must have been called first so that ``self.model`` exists.
        """
        acc_train = self.model.evaluate(self.trainX, self.y_train)
        acc_test = self.model.evaluate(self.testX, self.y_test)
        return acc_train[1], acc_test[1]

In [27]:
# Build the network from the dataset's vocabulary size and padded length, then fit it.
elman = Many_to_one_Elman_Net(data, data.len_words, data.max_sentence)
elman.train(epochs=15, batch_size=100, verbose=1)
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
elman.model.summary()


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 897, 64)           2496128   
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 2,504,449
Trainable params: 2,504,449
Non-trainable params: 0
_________________________________________________________________
None


In [28]:
# Model outputs for the train and test inputs (see Many_to_one_Elman_Net.predict).
train_predict, test_predict = elman.predict()
# Accuracy on each split: evaluate() returns index 1 of each Keras evaluate() result.
acc_train, acc_test = elman.evaluate()



In [31]:
# Report both accuracies as percentages.
print('Train accuracy: ', 100 * acc_train)
print('Test accuracy:  ', 100 * acc_test)

Train accuracy:  99.52499866485596
Test accuracy:   72.29999899864197


In [29]:
# lis = elman.model.get_weights()
# print(len(lis))
# for i in lis:
#     print(i.shape)