In [None]:
# 1. Import Libraries
import numpy as np
import matplotlib.pyplot as plt
from keras.datasets import imdb
from keras import models, layers
from keras.callbacks import EarlyStopping
from sklearn.feature_extraction.text import TfidfTransformer

# 2. Load IMDB Dataset
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

# 3. Decode integer sequences into text (needed for TF-IDF)
word_index = imdb.get_word_index()
index_word = {v: k for k, v in word_index.items()}

def decode_review(encoded_review):
    return ' '.join([index_word.get(i - 3, '?') for i in encoded_review])

# Decode reviews
decoded_train = [decode_review(review) for review in train_data]
decoded_test = [decode_review(review) for review in test_data]

decoded_train[0] = "this movie was bad"

# 4. Tokenize text to Bag-of-Words (simple word counts)
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=10000)
X_train_counts = vectorizer.fit_transform(decoded_train)
X_test_counts = vectorizer.transform(decoded_test)

# 5. Apply TF-IDF
tfidf = TfidfTransformer()

X_train_tfidf = tfidf.fit_transform(X_train_counts).toarray()
X_test_tfidf = tfidf.transform(X_test_counts).toarray()

# 6. Build DNN Model
# Build DNN Model with correct input dimension
model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(X_train_tfidf.shape[1],)))  # Not hardcoded
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))


# 7. Compile Model
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# 8. Prepare Validation Set
X_val = X_train_tfidf[:10000]
partial_X_train = X_train_tfidf[10000:]
y_val = train_labels[:10000]
partial_y_train = train_labels[10000:]

# 9. Train Model
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    partial_X_train,
    partial_y_train,
    epochs=20,
    batch_size=512,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping]
)

# 10. Evaluate Model
test_loss, test_acc = model.evaluate(X_test_tfidf, test_labels)
print(f"\nTest Accuracy: {test_acc:.4f}")

# 11. Predict Example
predictions = (model.predict(X_test_tfidf) > 0.5).astype("int32")

print("\nSample Prediction Result:")
print("Predicted Label:", "Positive" if predictions[0] == 1 else "Negative")
print("Actual Label:", "Positive" if test_labels[0] == 1 else "Negative")
print("Review Text:", decoded_train[0][:300], "...")  # Show part of review


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 123ms/step - accuracy: 0.6041 - loss: 0.6851 - val_accuracy: 0.8135 - val_loss: 0.6358
Epoch 2/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 81ms/step - accuracy: 0.8293 - loss: 0.6024 - val_accuracy: 0.8414 - val_loss: 0.5079
Epoch 3/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 121ms/step - accuracy: 0.8710 - loss: 0.4569 - val_accuracy: 0.8661 - val_loss: 0.3930
Epoch 4/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 72ms/step - accuracy: 0.8964 - loss: 0.3407 - val_accuracy: 0.8694 - val_loss: 0.3329
Epoch 5/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 75ms/step - accuracy: 0.9056 - loss: 0.2724 - val_accuracy: 0.8869 - val_loss: 0.2900
Epoch 6/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 79ms/step - accuracy: 0.9180 - loss: 0.2301 - val_accuracy: 0.8855 - val_loss: 0.2820
Epoch 7/20
[1m30/30[0m [32m━━

In [None]:
print("Review Text:", decoded_train[0][:300], "...")  # Show part of review


Review Text: this movie was bad ...


In [None]:
pip install tensorflow


In [2]:
# 1. Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.datasets import imdb
from keras import models, layers
from keras.callbacks import EarlyStopping
from keras.preprocessing.sequence import pad_sequences

# 2. Load IMDB Dataset
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

# 3. Pad Sequences (for embedding layer)
maxlen = 500  # Maximum review length (tuneable)
X_train = pad_sequences(train_data, maxlen=maxlen)
X_test  = pad_sequences(test_data,  maxlen=maxlen)


# 4. Display the shape of the padded data
print("Shape of X_train:", X_train.shape)
print("Shape of X_test: ", X_test.shape)
print("Shape of train_labels:", train_labels.shape)
print("Shape of test_labels: ", test_labels.shape)

# 5. Prepare human-readable reviews for DataFrame inspection
word_index = imdb.get_word_index()
index_word = {v: k for k, v in word_index.items()}

def decode_review(encoded):
    return " ".join(index_word.get(i-3, "?") for i in encoded)

decoded_train = [decode_review(r) for r in train_data]
decoded_test  = [decode_review(r) for r in test_data]


# 6. Build a DataFrame and inspect it
train_df = pd.DataFrame({
    "review": decoded_train,
    "label":  train_labels
})

print("\nDataFrame shape:", train_df.shape)
print("\nDataFrame head():")
print(train_df.head())


# 7. Build DNN Model with Embedding Layer
model = models.Sequential([
    # Embedding layer (increase the dimension)
    layers.Embedding(input_dim=10000, output_dim=256, input_length=maxlen),  # Increase embedding dimension to 256
    layers.GlobalAveragePooling1D(),

    # Dropout to prevent overfitting
    layers.Dropout(0.5),  # Dropout layer added

    # Fully connected layers
    layers.Dense(128, activation="relu"),  # Increased neurons
    layers.Dense(64, activation="relu"),
    layers.Dense(32, activation="relu"),

    # Output layer
    layers.Dense(1, activation="sigmoid")
])

# 8. Compile Model with Adam Optimizer
model.compile(
    optimizer="adam",  # Changed optimizer to Adam
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

# 9. Prepare Validation Set
X_val = X_train[:10000]
partial_X_train = X_train[10000:]
y_val = train_labels[:10000]
partial_y_train = train_labels[10000:]

# 10. Train Model with EarlyStopping
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=5,  # Increased patience to allow more training epochs
    restore_best_weights=True
)

history = model.fit(
    partial_X_train,
    partial_y_train,
    epochs=20,  # Increased number of epochs
    batch_size=512,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping]
)

# 11. Evaluate Model
test_loss, test_acc = model.evaluate(X_test, test_labels)
print(f"\nTest Accuracy: {test_acc:.4f}")

# 12. Predict Example
preds = (model.predict(X_test) > 0.5).astype("int32")
print("\nSample Prediction Result:")
print("Predicted:", "Positive" if preds[0] == 1 else "Negative")
print("Actual:   ", "Positive" if test_labels[0] == 1 else "Negative")
print("Review:   ", decode_review(test_data[0][:300]), "...")


Shape of X_train: (25000, 500)
Shape of X_test:  (25000, 500)
Shape of train_labels: (25000,)
Shape of test_labels:  (25000,)

DataFrame shape: (25000, 2)

DataFrame head():
                                              review  label
0  ? this film was just brilliant casting locatio...      1
1  ? big hair big boobs bad music and a giant saf...      0
2  ? this has to be one of the worst films of the...      0
3  ? the ? ? at storytelling the traditional sort...      1
4  ? worst mistake of my life br br i picked this...      0
Epoch 1/20




[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 119ms/step - accuracy: 0.5090 - loss: 0.6931 - val_accuracy: 0.5104 - val_loss: 0.6923
Epoch 2/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.5256 - loss: 0.6912 - val_accuracy: 0.5248 - val_loss: 0.6873
Epoch 3/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.5856 - loss: 0.6709 - val_accuracy: 0.7419 - val_loss: 0.5823
Epoch 4/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.7019 - loss: 0.5784 - val_accuracy: 0.8084 - val_loss: 0.4706
Epoch 5/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.7286 - loss: 0.5388 - val_accuracy: 0.8088 - val_loss: 0.4300
Epoch 6/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.7838 - loss: 0.4560 - val_accuracy: 0.6293 - val_loss: 0.5699
Epoch 7/20
[1m30/30[0m [32m━━━━━━━━━━━━━━

In [3]:
def preprocess_review(review, word_index, maxlen=500):
    # Clean and tokenize
    words = review.lower().split()
    # Map words to their index (subtracting 3 because IMDB reserve indexes 0, 1, 2 for special tokens)
    encoded = [word_index.get(word, 2) + 3 for word in words]  # 2 is "unknown" word
    # Pad the sequence
    encoded = pad_sequences([encoded], maxlen=maxlen)
    return encoded

# Example Input
custom_review = "This movie was absolutely fantastic and thrilling to watch"

# Preprocess it
input_data = preprocess_review(custom_review, word_index)

# Predict
prediction = (model.predict(input_data) > 0.5).astype("int32")

# Show Result
print("\nCustom Review Prediction:")
print(f"Review: \"{custom_review}\"")
print("Predicted Sentiment:", "Positive" if prediction[0][0] == 1 else "Negative")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 243ms/step

Custom Review Prediction:
Review: "This movie was absolutely fantastic and thrilling to watch"
Predicted Sentiment: Positive
