In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import os


In [2]:


# The project root is taken to be the parent of the current working directory;
# the dataset is expected at <root>/data/"IMDB Dataset.csv".
ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
Data_dir = os.path.join(ROOT, "data")
file_path = os.path.join(Data_dir, "IMDB Dataset.csv")

# Read the whole CSV into memory and preview the first rows.
df = pd.read_csv(file_path)
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Class balance: number of rows per sentiment label.
df["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [4]:
# Number of missing values in each column.
df.isnull().sum(axis=0)

review       0
sentiment    0
dtype: int64

In [5]:
# Report how many fully duplicated rows exist, drop them, and confirm none remain.
print("Before dropping duplicates: ", df.duplicated().sum())

# Rebind rather than mutate with inplace=True: the DataFrame object displayed by
# earlier cells is left untouched, and the cell gives the same result on re-run.
df = df.drop_duplicates()

print("After dropping duplicates: ", df.duplicated().sum())
print("------------------------------------------------------")
print("Value counts after removing duplicated rows")
df["sentiment"].value_counts()

Before dropping duplicates:  418
After dropping duplicates:  0
------------------------------------------------------
Value counts after removing duplicated rows


sentiment
positive    24884
negative    24698
Name: count, dtype: int64

## Text Processing

**1. Prepare Labels**
  >Map negative --> 0, positive --> 1

In [6]:
# Integer class id for each sentiment string.
label_map = dict(negative=0, positive=1)

# Features: the review text as plain Python strings.
X = df["review"].astype("str").values

# Targets: sentiment strings translated through label_map and stored as int32.
y = df["sentiment"].map(label_map).astype("int32").values

**2. Split Data**

1. Split the data into training (80%) and test (20%) sets.
2. Split the training set again into training (90%) and validation (10%) sets.

In [7]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 20% of all samples as the test set, stratified on the label.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Stage 2: take 10% of the remaining samples as the validation set,
# again stratified on the label.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval,
    y_trainval,
    test_size=0.1,
    stratify=y_trainval,
    random_state=42,
)

**3. Vectorization of Training Data**

In [8]:
from keras import layers

vocab_size = 30000  # upper bound on the vocabulary kept by the vectorizer
seq_len = 256       # number of token ids emitted per review

# Text -> integer token ids, lower-casing and stripping punctuation first.
# NOTE(review): the rows shown by df.head() contain "<br />" markup; confirm
# whether it should be removed before tokenizing.
vectorizer = layers.TextVectorization(
    standardize="lower_and_strip_punctuation",
    output_mode="int",
    max_tokens=vocab_size,
    output_sequence_length=seq_len,
)

# Build the vocabulary from the training reviews only.
vectorizer.adapt(X_train)

**4. tf.data Pipelines**

In [9]:
batch_size = 64
AUTOTUNE = tf.data.AUTOTUNE
SHUFFLE_BUFFER = 8192
SHUFFLE_SEED = 42


def make_ds(texts, labels, training=False, batch_size=batch_size,
            shuffle_buffer=SHUFFLE_BUFFER, seed=None):
    """Build a batched, prefetched ``tf.data.Dataset`` of ``(text, label)`` pairs.

    Args:
        texts: Sequence of review strings, sliced along its first axis.
        labels: Sequence of labels with the same length as ``texts``.
        training: When True, shuffle the examples (reshuffled on every
            iteration over the dataset). When False, the original order is kept.
        batch_size: Number of examples per batch. Defaults to the notebook-level
            ``batch_size`` in effect when this function was defined.
        shuffle_buffer: Size of the shuffle buffer; only used when ``training``.
        seed: Optional seed for the shuffle so the training order is
            repeatable; ``None`` keeps the unseeded behaviour.

    Returns:
        A ``tf.data.Dataset`` yielding ``(texts_batch, labels_batch)`` tuples.
    """
    ds = tf.data.Dataset.from_tensor_slices((texts, labels))
    if training:
        ds = ds.shuffle(shuffle_buffer, seed=seed, reshuffle_each_iteration=True)
    return ds.batch(batch_size).prefetch(AUTOTUNE)


# Only the training split is shuffled (with a fixed seed for reproducibility);
# validation and test keep their original order.
train_ds = make_ds(X_train, y_train, training=True, seed=SHUFFLE_SEED)
val_ds = make_ds(X_val, y_val)
test_ds = make_ds(X_test, y_test)

**5. Model**

`Text --> Vectorize --> Embedding --> BiLSTM --> Dense`

In [10]:
from keras import Input, models, utils

# Seed the Python, NumPy and backend random generators before any layer is
# created, so weight initialisation and dropout masks are repeatable between runs.
utils.set_random_seed(42)

embed_dim = 128      # size of each token embedding vector
lstm_units = 64      # units per LSTM direction
dropout_rate = 0.3   # fraction of encoder outputs dropped during training

# The model takes raw strings: vectorization is part of the graph, so
# model.predict can be called directly on text.
inputs = Input(shape=(), dtype=tf.string)
token_ids = vectorizer(inputs)

# mask_zero=True marks token id 0 as padding for the downstream recurrent layer.
embedded = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(token_ids)

# Bidirectional LSTM encoder followed by dropout for regularisation.
encoded = layers.Bidirectional(layers.LSTM(lstm_units))(embedded)
encoded = layers.Dropout(dropout_rate)(encoded)

# Single sigmoid unit: the output is the predicted probability of class 1.
outputs = layers.Dense(1, activation='sigmoid')(encoded)

model = models.Model(inputs, outputs)

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['accuracy'],
)


**6. Train with early stopping**

In [11]:
from keras import callbacks

# Stop when val_loss has not improved for 2 epochs, and restore the weights
# from the best epoch seen.
early_stopping = callbacks.EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True,
)
Callbacks = [early_stopping]

# Train for at most 8 epochs, validating on val_ds after each one.
history = model.fit(
    train_ds,
    epochs=8,
    validation_data=val_ds,
    callbacks=Callbacks,
    verbose=1,
)

Epoch 1/8
[1m558/558[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m243s[0m 426ms/step - accuracy: 0.8291 - loss: 0.3775 - val_accuracy: 0.8747 - val_loss: 0.2919
Epoch 2/8
[1m558/558[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m237s[0m 424ms/step - accuracy: 0.9271 - loss: 0.1949 - val_accuracy: 0.8692 - val_loss: 0.3110
Epoch 3/8
[1m558/558[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m248s[0m 444ms/step - accuracy: 0.9602 - loss: 0.1116 - val_accuracy: 0.8505 - val_loss: 0.3782


In [12]:
from sklearn.metrics import classification_report, confusion_matrix

# Predicted probabilities for every test review, thresholded at 0.5 to get class ids.
y_pred_probs = model.predict(test_ds)
y_pred = (y_pred_probs > 0.5).astype("int32").flatten()

# test_ds is built from (X_test, y_test) without shuffling, so predictions are
# already aligned with y_test; there is no need to iterate the dataset again.
y_true = y_test
assert len(y_true) == len(y_pred), "predictions and labels are misaligned"

# target_names follows label_map: 0 = negative, 1 = positive.
print(classification_report(y_true, y_pred, target_names=["negative", "positive"]))

print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))


[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 93ms/step
              precision    recall  f1-score   support

    negative       0.84      0.92      0.88      4940
    positive       0.92      0.83      0.87      4977

    accuracy                           0.88      9917
   macro avg       0.88      0.88      0.88      9917
weighted avg       0.88      0.88      0.88      9917

Confusion Matrix:
 [[4563  377]
 [ 847 4130]]


In [13]:
import tensorflow as tf

# A few hand-written reviews to sanity-check the trained model.
sample_reviews = [
    "This movie was amazing!",
    "Terrible acting, complete waste of time.",
    "It was okay, not great but not awful either.",
]
texts = tf.constant(sample_reviews)

# One probability per input string.
pred_probs = model.predict(texts).flatten()

# texts.numpy() yields bytes objects, so decode them before printing.
for raw_text, prob in zip(texts.numpy(), pred_probs):
    if prob > 0.5:
        label = "positive"
    else:
        label = "negative"
    print(f"{raw_text.decode('utf-8')} → {label} ({prob:.2f})")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 449ms/step
This movie was amazing! → positive (0.80)
Terrible acting, complete waste of time. → negative (0.00)
It was okay, not great but not awful either. → negative (0.03)
