In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import os 
import sys

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
from keras.models import Sequential
from keras import layers, models, optimizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint
from keras.layers import LSTM,Dropout,Dense
from keras.layers import Embedding
from keras.preprocessing import sequence
from keras.layers import LSTM, Conv1D, MaxPooling1D, Dropout
from keras.callbacks import EarlyStopping

In [4]:
def plot_history(history):
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()


In [5]:
data = pd.read_csv("../data/distilbert_imdb2.csv")
data.head()

Unnamed: 0,sentiment,word count,count_word,count_unique_word,count_letters,count_punctuations,count_words_upper,count_words_title,count_stopwords,mean_word_len,word_unique_percent,punct_percent,reviews_pre,embeddings_distilbert
0,1,307,307,208,1761,78,8,36,133,4.739414,67.752443,25.407166,one reviewer mentioned watching 1 oz episode h...,[ 2.25813061e-01 3.54351997e-02 5.05604744e-...
1,1,162,162,114,998,44,2,12,66,5.166667,70.37037,27.160494,wonderful little production filming technique ...,[-4.65010226e-01 6.32754207e-01 2.78978735e-...
2,1,166,166,121,926,40,3,20,76,4.584337,72.891566,24.096386,thought wonderful way spend time hot summer we...,[ 1.80858865e-01 4.71553922e-01 6.99151278e-...
3,0,138,138,96,748,42,3,13,62,4.427536,69.565217,30.434783,basically family little boy jake think zombie ...,[ 5.36763310e-01 4.89160836e-01 4.62464690e-...
4,1,230,230,152,1317,56,1,31,96,4.730435,66.086957,24.347826,petter matteis love time money visually stunni...,[-3.51513296e-01 4.82253611e-01 6.09883726e-...


In [6]:
X_train, X_test, y_train, y_test = train_test_split(data['reviews_pre'], data['sentiment'],test_size=0.2, random_state=0)

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train,test_size=0.1, random_state=0)

In [7]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(data.reviews_pre)

X_train1 = tokenizer.texts_to_sequences(X_train)
X_valid1 = tokenizer.texts_to_sequences(X_valid)
X_test1 = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

print(X_train[2])
print(X_train1[2])

thought wonderful way spend time hot summer weekend sitting air conditioned theater watching lighthearted comedy plot simplistic dialogue witty character likable even well bread suspected serial killer may disappointed realize match point 2 risk addiction thought proof woody allen still fully control style many u grown lovethis would laughed one woodys comedy year dare say decade never impressed scarlet johanson managed tone sexy image jumped right average spirited young womanthis may crown jewel career wittier devil wear prada interesting superman great comedy go see friend
[363, 158, 71, 9, 26, 356, 105, 3, 1791, 3460, 3404, 4712, 2294, 1654, 15, 134, 2, 140, 363, 130, 731, 790, 2, 1518, 15, 1138, 29, 1936, 9, 2685, 102, 1138, 3, 675, 1, 120, 48, 177, 120, 31, 1327, 105, 29, 3, 38, 2, 19, 127, 521, 4227, 924, 1138, 4198, 3557, 151, 1523, 1263, 440, 1138, 311, 269, 1939, 2267, 193, 3307, 149, 1138, 3532, 5, 144, 1138, 1213, 41, 40, 41, 233, 445, 3606, 2000, 62, 3, 46, 101, 1341, 3472,

In [8]:
seq_lens = [len(s) for s in X_train1]
print("average length: %0.1f" % np.mean(seq_lens))
print("max length: %d" % max(seq_lens))

average length: 99.8
max length: 949


In [9]:
maxlen = 150

X_train1 = pad_sequences(X_train1, padding='post', maxlen=maxlen)
X_valid1 = pad_sequences(X_valid1, padding='post', maxlen=maxlen)
X_test1 = pad_sequences(X_test1, padding='post', maxlen=maxlen)

print(X_train1[2, :])

[2147 1562   48   28   53  460  264 2963 1025 4121 1085  187  232  275
 2492   59   35 2104    2  721   88   59 2944  929    7   11   41  161
  815  280    4  139  703  563  508  279  104   27    4   31 3176 2802
 1512  461 1268 1089   36    2   60   17  148   39  219 2087  451  799
  435   58   13  143   80 1138 2554 3460   53    9  851  815  124  241
 2155 4530 2083 1973   26 1151   51 1507    2    3  644 4121 1138  234
 4386   21  630  223  628   84  478  694 1337 3697    1  234  162   19
  134 1138    2   48    5  788  283  302  763 1082  184 2365 4587 1031
  134   16 4146   27  821  120  433   53  561  561   53    3 1654   53
 3167 1290  706    9    2  122 2628  866  134  149  763 1082 1449 2576
  288 3830 2740    2   29   19 1740  548 2406  274]


# Neural Network

In [10]:
embedding_dim = 50
callback = EarlyStopping(monitor='val_loss', patience=2)

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 50)           10554700  
                                                                 
 flatten (Flatten)           (None, 7500)              0         
                                                                 
 dense (Dense)               (None, 10)                75010     
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 10629721 (40.55 MB)
Trainable params: 10629721 (40.55 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:
history = model.fit(X_train1, y_train,epochs=10,verbose=True,validation_data=(X_valid1, y_valid),batch_size=1000,callbacks=[callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [12]:
y_pred = model.predict(X_test1)
y_pred_classes = (y_pred > 0.5).astype(int) 

accuracy = accuracy_score(y_test, y_pred_classes)
print(f"Accuracy : {accuracy}")

Accuracy : 0.8752


In [13]:
model.save('../models/NN.keras')

# LSTM

In [14]:
embedding_vecor_length = 32
callback = EarlyStopping(monitor='val_loss', patience=2)

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 150, 50)           10554700  
                                                                 
 lstm (LSTM)                 (None, 100)               60400     
                                                                 
 dense_2 (Dense)             (None, 1)                 101       
                                                                 
Total params: 10615201 (40.49 MB)
Trainable params: 10615201 (40.49 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [15]:
model.fit(X_train1, y_train, epochs=10, batch_size=256,verbose = 1,validation_data=(X_valid1,y_valid),callbacks=[callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


<keras.src.callbacks.History at 0x2ce280e80>

In [16]:
y_pred = model.predict(X_test1)
y_pred_classes = (y_pred > 0.5).astype(int) 

accuracy = accuracy_score(y_test, y_pred_classes)
print(f"Accuracy : {accuracy}")

Accuracy : 0.8746


In [17]:
model.save('../models/LSTM.keras')

# CNN

In [18]:
embedding_vecor_length = 32
callback = EarlyStopping(monitor='val_loss', patience=2)

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 150, 50)           10554700  
                                                                 
 conv1d (Conv1D)             (None, 150, 32)           4832      
                                                                 
 max_pooling1d (MaxPooling1  (None, 75, 32)            0         
 D)                                                              
                                                                 
 lstm_1 (LSTM)               (None, 100)               53200     
                                                                 
 dense_3 (Dense)             (None, 1)                 101       
                                                                 
Total params: 10612833 (40.48 MB)
Trainable params: 10612833 (40.48 MB)
Non-trainable params: 0 (0.00 Byte)
____________

In [19]:
model.fit(X_train1, y_train, epochs=10, batch_size=256,verbose = 1,validation_data=(X_valid1,y_valid),callbacks=[callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


<keras.src.callbacks.History at 0x12891f910>

In [59]:
y_pred = model.predict(X_test1)
y_pred_classes = (y_pred > 0.5).astype(int) 

accuracy = accuracy_score(y_test, y_pred_classes)
print(f"Accuracy : {accuracy}")

Accuracy : 0.8789


In [61]:
model.save('../models/CNN.keras')