In [1]:
! gdown --id 1QPn5WNRVUAtowSAgkw8ZuR3IZf_pxIdu
! gdown --id 13rg3cVqcNNP5Lhce_-EiQYkF3gx9BeGU

Downloading...
From: https://drive.google.com/uc?id=1QPn5WNRVUAtowSAgkw8ZuR3IZf_pxIdu
To: /content/IMDB_Dataset.csv
100% 66.2M/66.2M [00:01<00:00, 58.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=13rg3cVqcNNP5Lhce_-EiQYkF3gx9BeGU
To: /content/glove.6B.100d.txt
100% 347M/347M [00:04<00:00, 84.8MB/s]


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
dataset = pd.read_csv("IMDB_Dataset.csv")

In [4]:
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
import numpy as np
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
dataset = dataset[dataset['sentiment'].isin(['positive', 'negative'])]

In [7]:
dataset["sentiment"].loc[dataset["sentiment"]=="positive"]=1.0
dataset["sentiment"].loc[dataset["sentiment"]=="negative"]=0.0

In [8]:
dataset.head(30)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1.0
1,A wonderful little production. <br /><br />The...,1.0
2,I thought this was a wonderful way to spend ti...,1.0
3,Basically there's a family where a little boy ...,0.0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1.0
5,"Probably my all-time favorite movie, a story o...",1.0
6,I sure would like to see a resurrection of a u...,1.0
7,"This show was an amazing, fresh & innovative i...",0.0
8,Encouraged by the positive comments about this...,0.0
9,If you like original gut wrenching laughter yo...,1.0


In [10]:
from sklearn.model_selection import train_test_split
# Convert the text data to sequences
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(dataset["review"])

sequences = tokenizer.texts_to_sequences(dataset["review"])

# Pad sequences to have the same length
max_sequence_length = 100
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, dataset["sentiment"], test_size=0.2, random_state=42)


In [11]:
# Assuming you have downloaded the pre-trained GloVe embeddings file and placed it in the current directory
glove_file = 'glove.6B.100d.txt'

# Create a dictionary to map words to their corresponding word vectors
embeddings_index = {}
with open(glove_file, 'r', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create an embedding matrix for the words in our dataset
embedding_dim = 100
num_words = min(10000, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < num_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [12]:
model = Sequential()
model.add(Embedding(num_words, embedding_dim, weights=[embedding_matrix], input_length=max_sequence_length, trainable=False))
model.add(LSTM(64))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Model Training

In [13]:
print(y_train)

39087    0.0
30893    0.0
45278    1.0
16398    0.0
13653    0.0
        ... 
11284    1.0
44732    1.0
38158    0.0
860      1.0
15795    1.0
Name: sentiment, Length: 40000, dtype: object


In [14]:
X_train = np.asarray(X_train).astype(np.float32)

In [15]:
print(y_train)

39087    0.0
30893    0.0
45278    1.0
16398    0.0
13653    0.0
        ... 
11284    1.0
44732    1.0
38158    0.0
860      1.0
15795    1.0
Name: sentiment, Length: 40000, dtype: object


In [16]:
#y_train = np.asarray(y_train).astype(np.float32)
y_train = np.asarray(y_train).astype('float32')

In [17]:
history = model.fit(X_train, y_train, validation_split=0.1, epochs=50, batch_size=128)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [18]:
X_test = np.asarray(X_test).astype(np.float32)
y_test = np.asarray(y_test).astype('float32')

In [19]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          1000000   
                                                                 
 lstm (LSTM)                 (None, 64)                42240     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 1,042,305
Trainable params: 42,305
Non-trainable params: 1,000,000
_________________________________________________________________


In [20]:
_, accuracy = model.evaluate(X_test, y_test)
print('Test Accuracy:', accuracy)

Test Accuracy: 0.8507000207901001


In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Make predictions on the test set
y_pred = model.predict(X_test)
y_pred = y_pred.flatten()
y_pred = np.where(y_pred > 0.5, 1, 0)
print(f1_score(y_test,y_pred))

0.8508938380105862
