In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
import string
import emoji
import re
import nltk
from textblob import TextBlob
from nltk.corpus import wordnet
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional
import numpy as np
import pickle
import keras.backend as K


In [2]:
data = pd.read_csv('IMDB_Dataset.csv')
data.sample(n=10)

Unnamed: 0,review,sentiment
3178,BEGIN SPOILER: Fitfully funny and memorable fo...,negative
8248,"Typically Spanish production - slow-moving, bu...",positive
23301,What? - that was it? The town sheriff (John Ag...,negative
47405,What Fox's fascination with dysfunctional fami...,negative
743,A lovely little film about the introduction of...,positive
3749,Rating: 4 out of 10<br /><br />As this mini-se...,negative
17222,Pumpkinhead was in itself a decent 80s horror ...,negative
38456,"When I started watching this, I instantly noti...",negative
4533,An absolutely baffling western featuring flash...,negative
30098,"I love camp movies, believe me and the usual t...",negative


In [3]:
data['label'] = data['sentiment'].replace(['positive', 'negative'],['1', '0'])

In [4]:
data.sample(n=10)

Unnamed: 0,review,sentiment,label
25115,This movie was just plain bad. I can forgive l...,negative,0
3173,I am awed by actress Bobbie Phillips and her s...,positive,1
11902,I guess it's Jack's great empathic ability tha...,negative,0
28893,This is one of those films that's more interes...,negative,0
33057,I can't believe we watched this total piece of...,negative,0
40078,"""Shadrach"" was not my favorite type of movie. ...",negative,0
36484,"Without question, the worst ELVIS film ever ma...",negative,0
3237,"We bought the DVD set of ""Es war einmal das Le...",positive,1
38634,'Maladolescenza' has the air of a dark fairy t...,negative,0
13610,"This is yet another pseudo-intellectual ""let's...",negative,0


In [5]:
data['label'].value_counts()

label
1    25000
0    25000
Name: count, dtype: int64

In [6]:
data.to_csv('IMDB_Dataset_label.csv')

In [7]:
stopwords = nltk.corpus.stopwords.words('english')

def preprocess(text):
    
    #1. Generating the list of words in the tweet (hastags and other punctuations removed)
    text_blob = TextBlob(text)
    text = ' '.join(text_blob.words)
    
    #2. clean the number 
    text = re.sub(r'[0-9]', '', text)
    
    #3. lower the text
    text = text.lower()
    
    #4. conver the emoji to text form
    text = emoji.demojize(text)
    
    #5. remove punctuation 
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    
    #6. tokenize the text
    text = word_tokenize(text)
    
    #7. remove empty token
    text = [t for t in text if len(t) > 0]
    
    #8. remove non-alphabetical token
    text = [t for t in text if t.isalpha()]
    
    #9. replace the negation token
    replacer  = AntonymReplacer()
    text = replacer.replace_negations(text)
    
    #10. remove the stopwords
    text = [i for i in text if i not in stopwords]
    
    #11. stem the text
    porter_stemmer = PorterStemmer()
    text = [porter_stemmer.stem(w) for w in text]
    
    return text

class AntonymReplacer(object):
    def replace(self, word, pos=None):
        antonyms = set()

        for syn in wordnet.synsets(word, pos=pos):
            for lemma in syn.lemmas():
                for antonym in lemma.antonyms():
                    antonyms.add(antonym.name())

        if len(antonyms) == 1:
            return antonyms.pop()
        else:
            return None

    def replace_negations(self, sent):
        i, l = 0, len(sent)
        words = []

        while i < l:
            word = sent[i]

            if word == 'not' and i+1 < l:
                ant = self.replace(sent[i+1])

                if ant:
                    words.append(ant)
                    i += 2
                    continue

            words.append(word)
            i += 1

        return words

In [8]:
data['processed_review'] = data['review'].apply(preprocess)

# Train Word2Vec models
sentences = data['processed_review'].tolist()

# CBOW model
cbow_model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, sg=0)
# Skip Gram model
skipgram_model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, sg=1)

# Save the Word2Vec models
cbow_model.save('cbow_model.bin')
skipgram_model.save('skipgram_model.bin')

In [9]:
# Convert texts to embeddings
def text_to_embedding(text, model):
    embeddings = [model.wv[word] for word in text if word in model.wv]
    return embeddings

In [10]:
# Prepare data for BiLSTM
max_len = 100
X_cbow = pad_sequences([text_to_embedding(text, cbow_model) for text in data['processed_review']], maxlen=max_len, dtype='float32', padding='post', truncating='post', value=0.0)
X_skipgram = pad_sequences([text_to_embedding(text, skipgram_model) for text in data['processed_review']], maxlen=max_len, dtype='float32', padding='post', truncating='post', value=0.0)

y = data['label'].astype('int')

X_train_cbow, X_test_cbow, y_train, y_test = train_test_split(X_cbow, y, test_size=0.2, random_state=42)
X_train_skipgram, X_test_skipgram, _, _ = train_test_split(X_skipgram, y, test_size=0.2, random_state=42)


In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout
# Define BiLSTM model
def create_bilstm_model(input_shape):
    model = Sequential()
    model.add(Bidirectional(LSTM(100, return_sequences=True), input_shape=input_shape))
    model.add(Dropout(0.2))  # Adding dropout layer with dropout rate of 0.5

    model.add(Bidirectional(LSTM(100)))
    model.add(Dropout(0.2))  # Adding dropout layer with dropout rate of 0.5

    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [24]:
K.clear_session()

# Train BiLSTM with CBOW embeddings
bilstm_cbow_model = create_bilstm_model((max_len, 100))
bilstm_cbow_model.fit(X_train_cbow, y_train, epochs=10, batch_size=128, validation_data=(X_test_cbow, y_test))
# Save the models in the native Keras format
bilstm_cbow_model.save('bilstm_cbow_model.keras')

# # Train BiLSTM with Skip Gram embeddings
# K.clear_session()
# bilstm_skipgram_model = create_bilstm_model((max_len, 100))
# bilstm_skipgram_model.fit(X_train_skipgram, y_train, epochs=30, batch_size=128, validation_data=(X_test_skipgram, y_test))
# bilstm_skipgram_model.save('bilstm_skipgram_model.keras')

  super().__init__(**kwargs)


Epoch 1/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 263ms/step - accuracy: 0.7853 - loss: 0.4409 - val_accuracy: 0.8530 - val_loss: 0.3484
Epoch 2/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 262ms/step - accuracy: 0.8656 - loss: 0.3215 - val_accuracy: 0.8586 - val_loss: 0.3282
Epoch 3/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 269ms/step - accuracy: 0.8812 - loss: 0.2875 - val_accuracy: 0.8714 - val_loss: 0.3070
Epoch 4/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 273ms/step - accuracy: 0.8900 - loss: 0.2667 - val_accuracy: 0.8733 - val_loss: 0.3090
Epoch 5/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 282ms/step - accuracy: 0.9041 - loss: 0.2335 - val_accuracy: 0.8649 - val_loss: 0.3212
Epoch 6/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 275ms/step - accuracy: 0.9182 - loss: 0.2096 - val_accuracy: 0.8686 - val_loss: 0.3267
Epoch 7/10

In [25]:
# Evaluate models
print('Results with CBOW embeddings:')
y_pred_cbow = (bilstm_cbow_model.predict(X_test_cbow) > 0.5).astype('int')
print(classification_report(y_test, y_pred_cbow))
print('Confusion matrix: \n', confusion_matrix(y_test, y_pred_cbow))
print('Accuracy score: ', accuracy_score(y_test, y_pred_cbow))

# print('Results with Skip Gram embeddings:')
# y_pred_skipgram = (bilstm_skipgram_model.predict(X_test_skipgram) > 0.5).astype('int')
# print(classification_report(y_test, y_pred_skipgram))
# print('Confusion matrix: \n', confusion_matrix(y_test, y_pred_skipgram))
# print('Accuracy score: ', accuracy_score(y_test, y_pred_skipgram))



Results with CBOW embeddings:
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 19ms/step
              precision    recall  f1-score   support

           0       0.85      0.86      0.86      4961
           1       0.87      0.86      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

Confusion matrix: 
 [[4290  671]
 [ 728 4311]]
Accuracy score:  0.8601


In [14]:
# Save the models in the native Keras format
bilstm_cbow_model.save('bilstm_cbow_model.keras')
bilstm_skipgram_model.save('bilstm_skipgram_model.keras')


In [15]:
# from keras.models import load_model
# from keras.layers import Bidirectional, LSTM

# # Load the models
# loaded_bilstm_cbow_model = load_model('bilstm_cbow_model.keras')
# loaded_bilstm_skipgram_model = load_model('bilstm_skipgram_model.keras')

# # Manually recompile the models
# loaded_bilstm_cbow_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# loaded_bilstm_skipgram_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [16]:
import keras
import tensorflow as tf
print(keras.__version__)
print(tf.__version__)


3.3.3
2.16.1
