In [1]:
!nvidia-smi

Fri Mar 21 14:05:31 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   70C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import torch

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [3]:
!wget -O dataset.csv "https://raw.githubusercontent.com/Subrat1920/IMDb-Movie-Sentiment-Analysis/refs/heads/main/Notebooks/Datasets/cleaned_sentiment_data.csv"

--2025-03-21 14:05:35--  https://raw.githubusercontent.com/Subrat1920/IMDb-Movie-Sentiment-Analysis/refs/heads/main/Notebooks/Datasets/cleaned_sentiment_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 57950796 (55M) [text/plain]
Saving to: ‘dataset.csv’


2025-03-21 14:05:36 (404 MB/s) - ‘dataset.csv’ saved [57950796/57950796]



In [4]:
import pandas as pd
import numpy as np

In [5]:
# Read the downloaded CSV, discard the leftover index column that was saved
# with it, and keep only the first 20,000 rows.
df = (
    pd.read_csv("dataset.csv")
    .drop(columns="Unnamed: 0")
    .head(20000)
)
df.shape

(20000, 2)

In [6]:
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [7]:
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
stop_words = set(stopwords.words('english'))

In [9]:
## Preprocessing the text
def clean_text(text):
  """Normalize a raw review string before tokenization.

  Steps, in order: lowercase the text, remove every character listed in
  ``string.punctuation``, remove runs of digits, then drop every word that
  appears in the module-level ``stop_words`` set.

  Args:
    text: The review as a ``str``.

  Returns:
    The surviving words joined by single spaces; ``""`` if nothing survives.
  """
  text = text.lower()
  # re.escape is required here: interpolated raw, string.punctuation contains
  # `\]`, which the regex engine reads as an escaped `]`, so the backslash
  # itself was never part of the character class and was left in the text.
  text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
  text = re.sub(r'\d+', '', text)
  # split()/join also collapses the gaps left behind by the removals above.
  text = ' '.join([word for word in text.split() if word not in stop_words])
  return text

In [10]:
df['processed_revs'] = df["Reviews"].apply(clean_text)

In [11]:
df.head()

Unnamed: 0,Reviews,Sentiments,processed_revs
0,this film was just brilliant casting location ...,1,film brilliant casting location scenery story ...
1,big hair big boobs bad music and a giant safet...,0,big hair big boobs bad music giant safety pin ...
2,this has to be one of the worst films of the 1...,0,one worst films friends watching film target a...
3,the at storytelling the traditional sort man...,1,storytelling traditional sort many years event...
4,worst mistake of my life i picked this movie...,0,worst mistake life picked movie target figured...


In [12]:
# Gather the cleaned reviews, then measure the longest one in whitespace-separated words.
sentences = df["processed_revs"].tolist()
max_len = max(map(len, map(str.split, sentences)))
max_len

1137

In [13]:
sentences[0]

'film brilliant casting location scenery story direction everyones really suited part played could imagine robert amazing actor director father came scottish island loved fact real connection film witty remarks throughout film great brilliant much bought film soon released would recommend everyone watch fly fishing amazing really cried end sad know say cry film must good definitely also two little boys played norman paul brilliant children often left list think stars play grown big profile whole film children amazing praised done dont think whole story lovely true someones life shared us'

In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [15]:
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

In [16]:
sequences = tokenizer.texts_to_sequences(sentences)

In [17]:
len(sequences[0]) == len(sentences[0].split())

True

In [18]:
## padding sequences
padded_sequence = pad_sequences(sequences, maxlen=max_len, padding='post')

In [19]:
## padded sequence for first line
padded_sequence[0]

array([  3, 401, 836, ...,   0,   0,   0], dtype=int32)

In [20]:
len(padded_sequence)

20000

In [21]:
from tensorflow.keras.layers import Embedding, Input
from tensorflow.keras.models import Model

In [22]:
## Applying Embeddings
embedding_dim = 50
## The embedding table needs one row per index the tokenizer can emit
## (0 is the padding value written by pad_sequences). Derive the size from the
## tokenizer rather than hard-coding it, so it cannot drift out of range of
## the `num_words` cap the tokenizer was built with.
vocab_size = len(tokenizer.word_index) + 1
max_word = min(tokenizer.num_words or vocab_size, vocab_size)
input_layer = Input(shape=(max_len,))
## `input_length` is not passed: it is deprecated in Keras 3, and the sequence
## length is already fixed by the Input layer above.
embedding_layer = Embedding(input_dim=max_word, output_dim=embedding_dim)(input_layer)
## NOTE(review): no training step is visible in this notebook, so the vectors this
## model returns are whatever the Embedding initializer produced - confirm that
## is intended before using them as features.
embedding_model = Model(inputs=input_layer, outputs=embedding_layer)



In [23]:
## The full embedding tensor is large, so build it chunk by chunk and stream
## each chunk straight into an HDF5 file.
import gc
import h5py

batch_size = 5000
n_reviews = len(df)
with h5py.File("embeddings.h5", "w") as h5f:
    dset = h5f.create_dataset(
        "embeddings",
        shape=(n_reviews, max_len, embedding_dim),
        dtype="float32",
    )

    for start in range(0, n_reviews, batch_size):
        rows = slice(start, start + batch_size)
        # predict() walks the chunk in mini-batches of 256 rows; the result is
        # written to the matching rows of the file without being kept around.
        dset[rows] = embedding_model.predict(padded_sequence[rows], batch_size=256)

        # Reclaim the chunk's memory before starting the next one
        gc.collect()

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 72ms/step
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 61ms/step
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step


In [24]:
## Sanity check: read back only the first row of the stored embeddings
with h5py.File("embeddings.h5", "r") as h5f:
    sample_embedding = h5f["embeddings"][0]

print(sample_embedding.shape)

(1137, 50)


In [25]:
## Copy the stored embeddings from the HDF5 file into one in-memory array,
## reading a slice at a time instead of the whole dataset in a single call.
batch_size = 2500   ## Rows copied per read; adjust based on memory

## The context manager closes the file even if a read fails part-way through
with h5py.File("embeddings.h5", "r") as h5f:
    h5_embeddings = h5f["embeddings"]
    ## Dataset shape is (num_rows, max_len, embedding_dim)
    print("Shape of dataset:", h5_embeddings.shape)

    ## Load at most the first 20,000 rows, and never more than the file holds
    safe_limit = min(20000, h5_embeddings.shape[0])

    ## Preallocate the result once and fill it in place. Collecting batches in
    ## a list and concatenating them afterwards keeps two full copies of the
    ## data alive at the same time.
    embeddings = np.empty(
        (safe_limit,) + h5_embeddings.shape[1:], dtype=h5_embeddings.dtype
    )

    for i in range(0, safe_limit, batch_size):
        stop = min(i + batch_size, safe_limit)  ## Clamp the final batch
        embeddings[i:stop] = h5_embeddings[i:stop]
        print(f"Loaded batch {i} to {stop}")

print("Final shape of embeddings:", embeddings.shape)

Shape of dataset: (20000, 1137, 50)
Loaded batch 0 to 2500
Loaded batch 2500 to 5000
Loaded batch 5000 to 7500
Loaded batch 7500 to 10000
Loaded batch 10000 to 12500
Loaded batch 12500 to 15000
Loaded batch 15000 to 17500
Loaded batch 17500 to 20000
Final shape of embeddings: (20000, 1137, 50)


In [26]:
np.save("embeddings.npy", embeddings)

In [27]:
embeddings.shape

(20000, 1137, 50)

In [33]:
import shutil
from google.colab import files

# Zip just embeddings.npy (looked up relative to /content), then ask Colab to
# send the resulting archive to the browser.
shutil.make_archive(
    base_name="/content/embeddings",
    format='zip',
    root_dir="/content",
    base_dir="embeddings.npy",
)
files.download("/content/embeddings.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>