Binary classification using Deep Neural Networks Example: Classify movie reviews into
positive reviews and negative reviews, just based on the text content of the reviews.
Use IMDB dataset.

In [25]:
import pandas as pd
import re
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import re

# Load the dataset
df = pd.read_csv("imdb_master.csv", on_bad_lines='skip', encoding='ISO-8859-1')

# Step 1: Filter only training rows with 'pos' or 'neg' labels
df = df.query("type == 'train' and label in ['pos', 'neg']")

# Step 2: Remove HTML tags from reviews
df['review'] = df['review'].str.replace('<.*?>', '', regex=True)

# Prepare data
texts = df['review']
labels = LabelEncoder().fit_transform(df['label'])

# Tokenization and padding
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)
X = tokenizer.texts_to_sequences(texts)
X = pad_sequences(X, maxlen=200)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Model architecture
model = Sequential([
    Embedding(input_dim=10000, output_dim=64, input_length=200),
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile and train
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2)

# Evaluate
accuracy = model.evaluate(X_test, y_test)
print(f"Accuracy: {accuracy[1]:.4f}")

# Predict
predictions = (model.predict(X_test[:5]) > 0.5).astype(int)

# Display results
for i in range(5):
    print(f"Predicted: {'Positive' if predictions[i] == 1 else 'Negative'} | Actual: {'Positive' if y_test[i] == 1 else 'Negative'}")


Epoch 1/5




[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.6024 - loss: 0.6593 - val_accuracy: 0.7990 - val_loss: 0.4343
Epoch 2/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.8593 - loss: 0.3434 - val_accuracy: 0.8482 - val_loss: 0.3398
Epoch 3/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.8958 - loss: 0.2613 - val_accuracy: 0.8585 - val_loss: 0.3210
Epoch 4/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.9251 - loss: 0.2004 - val_accuracy: 0.8683 - val_loss: 0.3205
Epoch 5/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.9360 - loss: 0.1807 - val_accuracy: 0.8763 - val_loss: 0.3316
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8755 - loss: 0.3277
Accuracy: 0.8714
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
Pr