In [1]:
!nvidia-smi

Sun Mar 23 08:16:49 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   37C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [3]:
!wget -O dataset.csv "https://raw.githubusercontent.com/Subrat1920/IMDb-Movie-Sentiment-Analysis/refs/heads/main/Notebooks/Datasets/cleaned_sentiment_data.csv"

--2025-03-23 08:16:53--  https://raw.githubusercontent.com/Subrat1920/IMDb-Movie-Sentiment-Analysis/refs/heads/main/Notebooks/Datasets/cleaned_sentiment_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 57950796 (55M) [text/plain]
Saving to: ‘dataset.csv’


2025-03-23 08:16:54 (236 MB/s) - ‘dataset.csv’ saved [57950796/57950796]



In [4]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv("dataset.csv")
df.drop(columns="Unnamed: 0", inplace=True)
df = df.head(5000)
df.shape

(5000, 2)

In [6]:
df["Sentiments"].value_counts()

Unnamed: 0_level_0,count
Sentiments,Unnamed: 1_level_1
1,2546
0,2454


In [7]:
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [8]:
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
stop_words = set(stopwords.words('english'))

In [10]:
## Preprocessing the text
def clean_text(text):
  text = text.lower()
  text = re.sub(f"[{string.punctuation}]", "", text)
  text = re.sub(r'\d+', '', text)
  text = ' '.join([word for word in text.split() if word not in stop_words])
  return text

In [11]:
df['processed_revs'] = df["Reviews"].apply(clean_text)

In [12]:
df.head()

Unnamed: 0,Reviews,Sentiments,processed_revs
0,this film was just brilliant casting location ...,1,film brilliant casting location scenery story ...
1,big hair big boobs bad music and a giant safet...,0,big hair big boobs bad music giant safety pin ...
2,this has to be one of the worst films of the 1...,0,one worst films friends watching film target a...
3,the at storytelling the traditional sort man...,1,storytelling traditional sort many years event...
4,worst mistake of my life i picked this movie...,0,worst mistake life picked movie target figured...


In [13]:
sentences = df["processed_revs"].tolist()
max_len = max(len(x.split()) for x in sentences)
max_len

837

In [14]:
sentences[0]

'film brilliant casting location scenery story direction everyones really suited part played could imagine robert amazing actor director father came scottish island loved fact real connection film witty remarks throughout film great brilliant much bought film soon released would recommend everyone watch fly fishing amazing really cried end sad know say cry film must good definitely also two little boys played norman paul brilliant children often left list think stars play grown big profile whole film children amazing praised done dont think whole story lovely true someones life shared us'

In [15]:
vocabulary = set(sentences)
vocabulary_size = len(vocabulary)
vocabulary_size

4997

In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [17]:
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

In [18]:
sequences = tokenizer.texts_to_sequences(sentences)

In [19]:
len(sequences[0]) == len(sentences[0].split())

True

In [20]:
## padding sequences
padded_sequence = pad_sequences(sequences, maxlen=max_len, padding='post')

In [21]:
## padded sequence for first line
len(padded_sequence[0])

837

In [22]:
len(padded_sequence)

5000

In [23]:
from tensorflow.keras.layers import Embedding, Input
from tensorflow.keras.models import Model

In [24]:
## Applying Embeddings
embedding_dim = 10
max_word = 9900 ## vocabulary of the review
input_layer = Input(shape=(max_len,))
embedding_layer = Embedding(input_dim=max_word, output_dim=embedding_dim, input_length=max_len)(input_layer)
embedding_model = Model(inputs=input_layer, outputs=embedding_layer)



In [25]:
## as the dataset is very huge, process them in batch
import gc
import h5py
batch_size = 1000
with h5py.File("embeddings.h5", "w") as h5f:
    dset = h5f.create_dataset("embeddings", (len(df), max_len, embedding_dim), dtype="float32")

    for i in range(0, len(df), batch_size):
        batch_sequences = padded_sequence[i : i + batch_size]
        batch_embeddings = embedding_model.predict(batch_sequences, batch_size=256)  # Process in chunks
        dset[i : i + batch_size] = batch_embeddings  # Store in file

        # Free memory
        del batch_sequences, batch_embeddings
        gc.collect()

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


In [26]:
with h5py.File("embeddings.h5", "r") as h5f:
    sample_embedding = h5f["embeddings"][0]  ## Get first row's embedding
    print(sample_embedding.shape)

(837, 10)


In [27]:
## Open the HDF5 file without loading everything into RAM
h5f = h5py.File("embeddings.h5", "r")
## Get dataset shape (num_rows, max_len, embedding_dim)
print("Shape of dataset:", h5f["embeddings"].shape)
## Load embeddings in batches
safe_limit = 5000  ## Load only first 40,000 rows
batch_size = 500   ## Adjust batch size based on memory

## Store embeddings in chunks
all_embeddings = []

for i in range(0, safe_limit, batch_size):  ## Stopping at 40K
    batch_embeddings = h5f["embeddings"][i : i + batch_size]
    all_embeddings.append(batch_embeddings)  ## Storing the batches
    print(f"Loaded batch {i} to {i + batch_size}")

## Close HDF5 file
h5f.close()

embeddings = np.concatenate(all_embeddings, axis=0)
print("Final shape of embeddings:", embeddings.shape)

Shape of dataset: (5000, 837, 10)
Loaded batch 0 to 500
Loaded batch 500 to 1000
Loaded batch 1000 to 1500
Loaded batch 1500 to 2000
Loaded batch 2000 to 2500
Loaded batch 2500 to 3000
Loaded batch 3000 to 3500
Loaded batch 3500 to 4000
Loaded batch 4000 to 4500
Loaded batch 4500 to 5000
Final shape of embeddings: (5000, 837, 10)


In [28]:
np.save("embeddings.npy", embeddings)

In [29]:
embeddings.shape

(5000, 837, 10)

In [30]:
import shutil
from google.colab import files
shutil.make_archive("/content/embeddings", 'zip', "/content", "embeddings.npy")
files.download("/content/embeddings.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [31]:
df.head()

Unnamed: 0,Reviews,Sentiments,processed_revs
0,this film was just brilliant casting location ...,1,film brilliant casting location scenery story ...
1,big hair big boobs bad music and a giant safet...,0,big hair big boobs bad music giant safety pin ...
2,this has to be one of the worst films of the 1...,0,one worst films friends watching film target a...
3,the at storytelling the traditional sort man...,1,storytelling traditional sort many years event...
4,worst mistake of my life i picked this movie...,0,worst mistake life picked movie target figured...


In [32]:
embeddings.shape[0] == df.shape[0]

True

In [33]:
new_df = df.copy()
new_df.head()

Unnamed: 0,Reviews,Sentiments,processed_revs
0,this film was just brilliant casting location ...,1,film brilliant casting location scenery story ...
1,big hair big boobs bad music and a giant safet...,0,big hair big boobs bad music giant safety pin ...
2,this has to be one of the worst films of the 1...,0,one worst films friends watching film target a...
3,the at storytelling the traditional sort man...,1,storytelling traditional sort many years event...
4,worst mistake of my life i picked this movie...,0,worst mistake life picked movie target figured...


In [34]:
new_df.drop(columns=['Reviews'], inplace=True)
new_df.head()

Unnamed: 0,Sentiments,processed_revs
0,1,film brilliant casting location scenery story ...
1,0,big hair big boobs bad music giant safety pin ...
2,0,one worst films friends watching film target a...
3,1,storytelling traditional sort many years event...
4,0,worst mistake life picked movie target figured...


In [35]:
new_df["embeddings"] = embeddings.tolist()

In [36]:
new_df.head()

Unnamed: 0,Sentiments,processed_revs,embeddings
0,1,film brilliant casting location scenery story ...,"[[-0.026815105229616165, -0.0444408655166626, ..."
1,0,big hair big boobs bad music giant safety pin ...,"[[0.04148361459374428, 0.03169364109635353, -0..."
2,0,one worst films friends watching film target a...,"[[0.02104378119111061, 0.014727059751749039, -..."
3,1,storytelling traditional sort many years event...,"[[-0.015688061714172363, -0.002199698239564895..."
4,0,worst mistake life picked movie target figured...,"[[-0.04897646978497505, 0.02014763280749321, -..."


In [37]:
new_df["embeddings"].shape

(5000,)

In [38]:
from google.colab import files
new_df.to_csv("cleaned_set_data.csv")
files.download("cleaned_set_data.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **Sentiment Analysis Model Building**

In [39]:
df = pd.read_csv("cleaned_set_data.csv")
df.drop(columns="Unnamed: 0", inplace=True)
df.head()

Unnamed: 0,Sentiments,processed_revs,embeddings
0,1,film brilliant casting location scenery story ...,"[[-0.026815105229616165, -0.0444408655166626, ..."
1,0,big hair big boobs bad music giant safety pin ...,"[[0.04148361459374428, 0.03169364109635353, -0..."
2,0,one worst films friends watching film target a...,"[[0.02104378119111061, 0.014727059751749039, -..."
3,1,storytelling traditional sort many years event...,"[[-0.015688061714172363, -0.002199698239564895..."
4,0,worst mistake life picked movie target figured...,"[[-0.04897646978497505, 0.02014763280749321, -..."


In [44]:
x = np.array(df["embeddings"])
y = np.array(df["Sentiments"])

In [45]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [46]:
len(x_train), len(x_test), len(y_train), len(y_test)

(4000, 1000, 4000, 1000)

In [53]:
from google.colab import files
## saving them into npy format
np.save("x_train.npy", x_train)
np.save("x_test.npy", x_test)
np.save("y_train.npy", y_train)
np.save("y_test.npy", y_test)

## downloading them
files.download("x_train.npy")
print('Downloaded x_train')
files.download("x_test.npy")
print('Downloaded x_test')
files.download("y_train.npy")
print('Downloaded y_train')
files.download("y_test.npy")
print('Downloaded y_test')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded x_train


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded x_test


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded y_train


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded y_test
