In [1]:
import os
import pickle
import re
import string

import emoji
import keras.backend as K
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from keras.layers import Embedding, LSTM, Dense, Bidirectional
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from textblob import TextBlob


# IMDB Dataset from Kaggle

In [2]:
# Read the IMDB review CSV into a DataFrame (path is relative to the notebook's working directory).
data = pd.read_csv('IMDB_Dataset.csv')
# Display 10 randomly chosen rows as a sanity check; no random_state is passed,
# so the rows shown differ from run to run.
data.sample(n=10)

Unnamed: 0,review,sentiment
29566,I would say to the foreign people who have see...,positive
17139,I just came back from the Late-night cinema an...,positive
18181,Its a very good comedy movie.Ijust liked it.I ...,positive
38645,I have been looking for this mini-series for a...,positive
24673,"what a great little film, lots of good roles f...",positive
7490,There were a lot of dumb teenage getting sex m...,positive
36839,This seemed really similar to the CHILD'S PLAY...,negative
26560,The idea of making a miniseries about the Berl...,negative
24217,All credit to writer/director Gilles Mimouni w...,positive
27984,"I also saw this amazingly bad piece of ""anime""...",negative


In [3]:
# Derive a 'label' column from 'sentiment': 'positive' -> '1', 'negative' -> '0'.
# The replacement values are strings; any other sentiment value is left untouched.
data['label'] = data['sentiment'].replace({'positive': '1', 'negative': '0'})

In [4]:
# Display 10 random rows to spot-check the frame after the 'label' column was added.
data.sample(n=10)

Unnamed: 0,review,sentiment,label
17661,R.I.C.C.O. is the STUPIDEST film ever made. I ...,negative,0
37412,"So, it's Friday night and you want to go watch...",negative,0
352,Tell the truth I’m a bit stun to see all these...,negative,0
24232,I have the entire Weissmuller Tarzan series on...,positive,1
34784,According to the blurb on the back of the DVD ...,negative,0
43920,Commissaire Mattei(André Bourvil) is a single ...,positive,1
24849,Another classic study of the effects of wealth...,positive,1
46657,"A year after her triumphant first special, ""My...",positive,1
9620,This is perhaps the best rockumentary ever- a ...,positive,1
5227,Being an Austrian myself this has been a strai...,positive,1


In [5]:
# Count the rows per label value to check the class balance.
data['label'].value_counts()

label
1    25000
0    25000
Name: count, dtype: int64

In [6]:
# Persist the labelled frame to disk.
# NOTE(review): to_csv writes the DataFrame index by default, so the file gets an
# extra unnamed leading column; pass index=False if that is not wanted — confirm
# with whatever reads this file.
data.to_csv('IMDB_Dataset_label.csv')

# Preprocessing

In [7]:
# English stop-word list from NLTK.
# NOTE: this rebinds the name `stopwords`, which until now referred to the module
# imported with `from nltk.corpus import stopwords`; the list is looked up through
# `nltk.corpus` so the cell can still be re-run safely.
stopwords = nltk.corpus.stopwords.words('english')

# Translation table that deletes every character of string.punctuation in one pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(text):
    """Turn one raw review into a list of cleaned, stemmed tokens.

    Pipeline (in order): TextBlob word extraction, digit removal, lower-casing,
    emoji-to-text conversion, punctuation removal, NLTK tokenisation, dropping of
    non-alphabetic tokens, "not <word>" -> antonym replacement, stop-word removal
    and Porter stemming.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        The stemmed tokens that survive all filtering steps (possibly empty).
    """
    # 1. Generate the list of words in the review (TextBlob drops punctuation tokens)
    text = ' '.join(TextBlob(text).words)

    # 2. Remove digits
    text = re.sub(r'[0-9]', '', text)

    # 3. Lower-case the text
    text = text.lower()

    # 4. Convert emoji to their textual form
    text = emoji.demojize(text)

    # 5. Remove punctuation (single pass instead of one str.replace per character)
    text = text.translate(_PUNCTUATION_TABLE)

    # 6. Tokenize the text
    tokens = word_tokenize(text)

    # 7-8. Keep alphabetic tokens only; str.isalpha() is False for the empty string,
    #      so this also discards empty tokens.
    tokens = [token for token in tokens if token.isalpha()]

    # 9. Replace "not <word>" by an antonym. This runs before stop-word removal so
    #    that 'not' (itself part of NLTK's English stop-word list) is still present.
    tokens = AntonymReplacer().replace_negations(tokens)

    # 10. Remove the stop words; a set gives O(1) membership tests instead of
    #     scanning the whole list for every token.
    stopword_set = set(stopwords)
    tokens = [token for token in tokens if token not in stopword_set]

    # 11. Stem the remaining tokens
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokens]

class AntonymReplacer(object):
    """Rewrite "not <word>" pairs as a single antonym looked up in WordNet."""

    def replace(self, word, pos=None):
        """Return the antonym of ``word`` if WordNet yields exactly one, else None.

        ``pos`` is forwarded to ``wordnet.synsets`` to restrict the part of speech.
        """
        candidates = {
            opposite.name()
            for synset in wordnet.synsets(word, pos=pos)
            for lemma in synset.lemmas()
            for opposite in lemma.antonyms()
        }

        # Only an unambiguous (single) antonym is accepted.
        return candidates.pop() if len(candidates) == 1 else None

    def replace_negations(self, sent):
        """Return a new token list where "not X" is replaced by X's antonym.

        A pair is only collapsed when ``replace`` returns a truthy antonym for the
        token following 'not'; otherwise tokens are copied through unchanged.
        """
        total = len(sent)
        result = []
        position = 0

        while position < total:
            token = sent[position]
            antonym = None

            # Look ahead only when 'not' is followed by another token.
            if token == 'not' and position + 1 < total:
                antonym = self.replace(sent[position + 1])

            if antonym:
                # Consume both 'not' and the negated word.
                result.append(antonym)
                position += 2
            else:
                result.append(token)
                position += 1

        return result

In [8]:
# Apply preprocessing to the 'review' column (one token list per review)
data['processed_review'] = data['review'].apply(preprocess)

# Train Word2Vec models on the tokenised reviews
sentences = data['processed_review'].tolist()

# CBOW model (sg=0): 100-dimensional vectors, context window of 5,
# words seen fewer than 2 times are ignored
cbow_model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, sg=0)
# Skip Gram model (sg=1) with the same hyper-parameters
skipgram_model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, sg=1)

# Save the Word2Vec models.
# Create the target folder first so that saving does not fail with a
# missing-directory error on a fresh checkout.
os.makedirs('Trained Model', exist_ok=True)
cbow_model.save('Trained Model/cbow_model.bin')
skipgram_model.save('Trained Model/skipgram_model.bin')

In [9]:
# Convert texts to embeddings
def text_to_embedding(text, model):
    """Map each token of ``text`` to its vector in ``model.wv``.

    Tokens that are not contained in ``model.wv`` are skipped, so the result may
    be shorter than ``text`` (or empty).
    """
    vectors = model.wv
    embeddings = []
    for word in text:
        if word in vectors:
            embeddings.append(vectors[word])
    return embeddings

In [10]:
# Prepare data for BiLSTM
# Every review is padded with zero vectors / truncated at the end to max_len word vectors.
max_len = 100
X_cbow = pad_sequences(
    [text_to_embedding(text, cbow_model) for text in data['processed_review']],
    maxlen=max_len, dtype='float32', padding='post', truncating='post', value=0.0)
X_skipgram = pad_sequences(
    [text_to_embedding(text, skipgram_model) for text in data['processed_review']],
    maxlen=max_len, dtype='float32', padding='post', truncating='post', value=0.0)

# The labels were stored as the strings '1' / '0'; convert them to integers.
y = data['label'].astype('int')

# Split both feature sets and the labels in ONE call: train_test_split applies the
# same row partition to every array it receives, so CBOW features, Skip Gram
# features and labels stay aligned by construction instead of relying on two
# separate calls happening to use the same random_state.
(X_train_cbow, X_test_cbow,
 X_train_skipgram, X_test_skipgram,
 y_train, y_test) = train_test_split(X_cbow, X_skipgram, y, test_size=0.2, random_state=42)


In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout, Input


# Define BiLSTM model
def create_bilstm_model(input_shape, lstm_units=100, dropout_rate=0.2):
    """Build and compile a two-layer bidirectional LSTM binary classifier.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample without the batch dimension; the notebook passes
        ``(max_len, 100)``, i.e. (sequence length, embedding size).
    lstm_units : int, default 100
        Number of units of each LSTM (per direction).
    dropout_rate : float, default 0.2
        Rate of the Dropout layer placed after each bidirectional LSTM.

    Returns
    -------
    Sequential
        Model compiled with binary cross-entropy loss, the Adam optimizer and an
        accuracy metric; the output is a single sigmoid unit.
    """
    model = Sequential()
    # Declare the input explicitly instead of passing `input_shape` to the first
    # layer, which Keras warns about.
    model.add(Input(shape=input_shape))

    # First BiLSTM returns the full sequence so that it can feed the second one.
    model.add(Bidirectional(LSTM(lstm_units, return_sequences=True)))
    model.add(Dropout(dropout_rate))

    # Second BiLSTM returns only its final output.
    model.add(Bidirectional(LSTM(lstm_units)))
    model.add(Dropout(dropout_rate))

    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Model Training

In [12]:
# Reset the Keras backend session before building a new model.
K.clear_session()

# Train BiLSTM with CBOW embeddings
# (max_len, 100): sequence length x vector size; 100 presumably matches the
# Word2Vec vector_size used earlier — keep the two in sync.
bilstm_cbow_model = create_bilstm_model((max_len, 100))
# NOTE(review): the test split is passed as validation_data, so the per-epoch
# val_* metrics are computed on the same rows that are evaluated afterwards.
bilstm_cbow_model.fit(X_train_cbow, y_train, epochs=10, batch_size=128, validation_data=(X_test_cbow, y_test))
# (Nothing is saved in this cell; the models are saved in a later cell.)

# Train BiLSTM with Skip Gram embeddings
# NOTE(review): this model is trained for 30 epochs versus 10 for the CBOW model —
# confirm the difference is intentional before comparing their scores.
K.clear_session()
bilstm_skipgram_model = create_bilstm_model((max_len, 100))
bilstm_skipgram_model.fit(X_train_skipgram, y_train, epochs=30, batch_size=128, validation_data=(X_test_skipgram, y_test))





  super().__init__(**kwargs)


Epoch 1/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 319ms/step - accuracy: 0.7928 - loss: 0.4344 - val_accuracy: 0.8509 - val_loss: 0.3535
Epoch 2/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 309ms/step - accuracy: 0.8652 - loss: 0.3173 - val_accuracy: 0.8575 - val_loss: 0.3252
Epoch 3/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 328ms/step - accuracy: 0.8771 - loss: 0.2973 - val_accuracy: 0.8625 - val_loss: 0.3180
Epoch 4/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 339ms/step - accuracy: 0.8921 - loss: 0.2628 - val_accuracy: 0.8707 - val_loss: 0.3112
Epoch 5/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 327ms/step - accuracy: 0.9050 - loss: 0.2366 - val_accuracy: 0.8730 - val_loss: 0.3035
Epoch 6/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 314ms/step - accuracy: 0.9167 - loss: 0.2113 - val_accuracy: 0.8722 - val_loss: 0.3280
Epoch 

# Model Evaluation

In [13]:
# Evaluate models
def _print_metrics(y_true, y_pred):
    """Print the classification report, confusion matrix and accuracy score."""
    print(classification_report(y_true, y_pred))
    print('Confusion matrix: \n', confusion_matrix(y_true, y_pred))
    print('Accuracy score: ', accuracy_score(y_true, y_pred))


# Sigmoid outputs above 0.5 are mapped to class 1, everything else to class 0.
print('Results with CBOW embeddings:')
cbow_scores = bilstm_cbow_model.predict(X_test_cbow)
y_pred_cbow = (cbow_scores > 0.5).astype('int')
_print_metrics(y_test, y_pred_cbow)

print('Results with Skip Gram embeddings:')
skipgram_scores = bilstm_skipgram_model.predict(X_test_skipgram)
y_pred_skipgram = (skipgram_scores > 0.5).astype('int')
_print_metrics(y_test, y_pred_skipgram)



Results with CBOW embeddings:
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 41ms/step
              precision    recall  f1-score   support

           0       0.86      0.87      0.87      4961
           1       0.87      0.86      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

Confusion matrix: 
 [[4333  628]
 [ 718 4321]]
Accuracy score:  0.8654
Results with Skip Gram embeddings:
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 41ms/step
              precision    recall  f1-score   support

           0       0.86      0.86      0.86      4961
           1       0.87      0.86      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

Confusion matrix: 
 [[4287  674]
 [ 720 4319]]
Accuracy 

# Saving Model

In [14]:
# Write both trained BiLSTM models to disk in the native Keras (.keras) format
for trained_model, target_path in (
    (bilstm_cbow_model, 'Trained Model/bilstm_cbow_model.keras'),
    (bilstm_skipgram_model, 'Trained Model/bilstm_skipgram_model.keras'),
):
    trained_model.save(target_path)


In [15]:
# from keras.models import load_model
# from keras.layers import Bidirectional, LSTM

# # Load the models (paths must match the ones used when saving)
# loaded_bilstm_cbow_model = load_model('Trained Model/bilstm_cbow_model.keras')
# loaded_bilstm_skipgram_model = load_model('Trained Model/bilstm_skipgram_model.keras')

# # Manually recompile the models
# loaded_bilstm_cbow_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# loaded_bilstm_skipgram_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
