In [1]:
from datasets import load_dataset
import pandas as pd

# Download (or read from the local cache) the IMDb dataset and turn its
# "train" split into a pandas DataFrame for the following cells.
dataset = load_dataset("imdb")
df = pd.DataFrame(dataset["train"])

# Confirmation message followed by a 10-row preview of the frame.
print("Dataset Loaded Successfully!", df.head(10), sep="\n")


Dataset Loaded Successfully!
                                                text  label
0  I rented I AM CURIOUS-YELLOW from my video sto...      0
1  "I Am Curious: Yellow" is a risible and preten...      0
2  If only to avoid making this type of film in t...      0
3  This film was probably inspired by Godard's Ma...      0
4  Oh, brother...after hearing about this ridicul...      0
5  I would put this at the top of my list of film...      0
6  Whoever wrote the screenplay for this movie ob...      0
7  When I first saw a glimpse of this movie, I qu...      0
8  Who are these "They"- the actors? the filmmake...      0
9  This is said to be a personal film for Peter B...      0


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# TF-IDF featuriser whose vocabulary is capped at 5000 terms.
# NOTE(review): the vectorizer is fitted on every row of df["text"], while the
# train/test split is only made later on the resulting matrix, so rows that end
# up in the test split contribute to the vocabulary/IDF weights. Consider
# fitting on the training rows only.
vectorizer = TfidfVectorizer(max_features=5000)

# Fit + transform in one pass, then densify because later cells consume X as a
# plain ndarray.
tfidf_sparse = vectorizer.fit_transform(df["text"])
X = tfidf_sparse.toarray()

# Target vector (0 = Negative, 1 = Positive).
y = df["label"].values

# Same matrix wrapped in a DataFrame with one column per vocabulary term.
tfidf_df = pd.DataFrame(X, columns=vectorizer.get_feature_names_out())

# Preview of the transformed data (first 10 rows).
print("\nFirst 10 Rows (After TF-IDF Transformation):", tfidf_df.head(10), sep="\n")



First 10 Rows (After TF-IDF Transformation):
    00  000        10  100   11   12   13  13th   14   15  ...     young  \
0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0   0.0  0.0  0.0  ...  0.043502   
1  0.0  0.0  0.000000  0.0  0.0  0.0  0.0   0.0  0.0  0.0  ...  0.000000   
2  0.0  0.0  0.000000  0.0  0.0  0.0  0.0   0.0  0.0  0.0  ...  0.000000   
3  0.0  0.0  0.084063  0.0  0.0  0.0  0.0   0.0  0.0  0.0  ...  0.000000   
4  0.0  0.0  0.000000  0.0  0.0  0.0  0.0   0.0  0.0  0.0  ...  0.047706   
5  0.0  0.0  0.000000  0.0  0.0  0.0  0.0   0.0  0.0  0.0  ...  0.000000   
6  0.0  0.0  0.000000  0.0  0.0  0.0  0.0   0.0  0.0  0.0  ...  0.000000   
7  0.0  0.0  0.000000  0.0  0.0  0.0  0.0   0.0  0.0  0.0  ...  0.000000   
8  0.0  0.0  0.000000  0.0  0.0  0.0  0.0   0.0  0.0  0.0  ...  0.000000   
9  0.0  0.0  0.000000  0.0  0.0  0.0  0.0   0.0  0.0  0.0  ...  0.000000   

   younger      your  yourself  youth  zero  zizek  zombie  zombies  zone  
0      0.0  0.000000       0.0    0.0   0

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neural_network import BernoulliRBM
from sklearn.model_selection import train_test_split

# Sigmoid function for reconstruction
def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x)).

    Computed through the identity sigmoid(x) = 0.5 * (1 + tanh(x / 2)).
    The direct form evaluates exp(-x), which overflows (RuntimeWarning) for
    large negative x; tanh saturates cleanly at -1/+1 instead.

    Parameters
    ----------
    x : scalar or array-like
        Input value(s).

    Returns
    -------
    Same shape as ``x`` (NumPy scalar for scalar input), values in [0, 1].
    """
    return 0.5 * (1.0 + np.tanh(0.5 * np.asarray(x)))

# Split dataset into train & test (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define hyperparameter search space: candidate hidden-layer sizes
hidden_units = [64, 128, 256]
best_rbm = None
best_loss = float("inf")
rbm_losses = {}  # hidden units -> reconstruction MSE on X_train

# Try different hidden units.
# NOTE(review): candidates are ranked by reconstruction error on the same data
# they were fitted on; a held-out validation slice would be a fairer criterion.
for units in hidden_units:
    print(f"\nTraining RBM with {units} hidden units...")

    rbm = BernoulliRBM(n_components=units, learning_rate=0.01, n_iter=3, verbose=True, random_state=42)  # Reduced iterations

    # Train RBM
    rbm.fit(X_train)

    # Hidden-unit activation probabilities P(h=1 | v)
    hidden_features = rbm.transform(X_train)

    # Visible probabilities P(v=1 | h) = sigmoid(h . W + b_visible).
    # The visible bias is part of the model; leaving it out (as before) scores a
    # different reconstruction than the one the RBM actually defines.
    reconstructed_X = sigmoid(np.dot(hidden_features, rbm.components_) + rbm.intercept_visible_)

    # Compute mean squared error (MSE) as reconstruction loss
    loss = np.mean(np.square(X_train - reconstructed_X))

    rbm_losses[units] = loss

    # Keep track of the best RBM
    if loss < best_loss:
        best_loss = loss
        best_rbm = rbm

print("\nHyperparameter tuning completed!")
print(f"Best RBM has {best_rbm.n_components} hidden units with loss: {best_loss:.4f}")

# Materialise the dict views as lists before handing them to matplotlib.
plt.figure(figsize=(8, 5))
plt.plot(list(rbm_losses.keys()), list(rbm_losses.values()), marker='o', linestyle='--', color='r')
plt.xlabel("Number of Hidden Units")
plt.ylabel("Reconstruction Loss")
plt.title("RBM Hyperparameter Tuning: Hidden Units vs. Loss")
plt.grid()
plt.show()



Training RBM with 64 hidden units...
[BernoulliRBM] Iteration 1, pseudo-likelihood = -19.59, time = 34.71s
[BernoulliRBM] Iteration 2, pseudo-likelihood = -11.15, time = 35.90s
[BernoulliRBM] Iteration 3, pseudo-likelihood = -10.31, time = 35.19s

Training RBM with 128 hidden units...


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Continue training the RBM selected by the hyperparameter search and record
# the reconstruction loss after every epoch.
#
# BernoulliRBM.fit() re-initialises the weights on every call (and with a fixed
# random_state it re-initialises them identically), so calling fit() in a loop
# with n_iter=1 repeats the same one-iteration training 25 times and throws away
# the tuned model. partial_fit() updates the existing weights instead, so it is
# used here, one mini-batch at a time. best_rbm.n_iter is left untouched because
# partial_fit() does not use it.
epochs = 25
loss_curve = []

rbm_batch_size = best_rbm.batch_size
n_train_rows = X_train.shape[0]

# `sigmoid` is the helper defined in the hyperparameter-search cell, which must
# have been run already because it also provides best_rbm and X_train.
for epoch in range(1, epochs + 1):
    # One epoch = one pass over the training rows in mini-batches.
    for batch_start in range(0, n_train_rows, rbm_batch_size):
        best_rbm.partial_fit(X_train[batch_start:batch_start + rbm_batch_size])

    hidden_features = best_rbm.transform(X_train)
    # Reconstruct input: P(v=1 | h) = sigmoid(h . W + b_visible)
    reconstructed_X = sigmoid(np.dot(hidden_features, best_rbm.components_) + best_rbm.intercept_visible_)
    loss = np.mean(np.square(X_train - reconstructed_X))  # Compute MSE loss

    loss_curve.append(loss)
    print(f"Epoch {epoch}/{epochs} - Loss: {loss:.4f}")

# Plot Loss Curve
plt.figure(figsize=(8, 5))
plt.plot(range(1, epochs + 1), loss_curve, marker='o', linestyle='-', color='b', label="Reconstruction Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("RBM Training Loss Curve")
plt.legend()
plt.grid()
plt.show()

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset

# Push both splits through the trained RBM; the resulting hidden-unit
# activations are the features the classifier is trained on.
X_train_rbm, X_test_rbm = (best_rbm.transform(split) for split in (X_train, X_test))

# Encode labels (encoder fitted on the training labels, reused for test).
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

# Features become float32 tensors, labels become int64 (long) tensors.
X_train_tensor, X_test_tensor = (
    torch.tensor(features, dtype=torch.float32) for features in (X_train_rbm, X_test_rbm)
)
y_train_tensor, y_test_tensor = (
    torch.tensor(targets, dtype=torch.long) for targets in (y_train_enc, y_test_enc)
)

# Training data: shuffled mini-batches of 32.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Test data: same batch size, original order.
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

print(f"RBM feature extraction complete! New feature shape: {X_train_rbm.shape}")


In [None]:
class CNN_Model(nn.Module):
    """Fully connected classifier: input -> 128 -> 64 -> num_classes.

    Despite the name, the network contains no convolutional layers; it is a
    three-layer perceptron with ReLU activations.

    ``forward`` returns raw logits. ``nn.CrossEntropyLoss`` applies
    log-softmax internally, so feeding it already-softmaxed outputs (as this
    class used to do) normalises twice, which flattens the gradients and puts a
    floor under the loss. Arg-max predictions are unaffected by the change
    because softmax is monotonic. Use ``predict_proba`` for probabilities.

    Parameters
    ----------
    input_size : int
        Number of input features per sample.
    num_classes : int
        Number of output classes.
    """

    def __init__(self, input_size, num_classes):
        super(CNN_Model, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, num_classes)
        # Kept as an attribute for backward compatibility; only used by
        # predict_proba, never inside forward.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return unnormalised class scores (logits), one row per input row."""
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.fc3(x)

    def predict_proba(self, x):
        """Return class probabilities (softmax over the logits of ``forward``)."""
        return self.softmax(self.forward(x))

# Seed PyTorch's global random generator so that the weight initialisation below
# (and anything that later draws from the global generator, such as DataLoader
# shuffling) is repeatable, matching the random_state=42 used for the
# scikit-learn steps.
torch.manual_seed(42)

# Initialize model
input_size = X_train_rbm.shape[1]  # Number of RBM features
num_classes = len(np.unique(y_train_enc))  # Number of classes
model = CNN_Model(input_size, num_classes)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

print("CNN model initialized!")


In [None]:
num_epochs = 20  # Reduce training time
train_loss_curve = []  # mean training loss per epoch
train_acc_curve = []   # training accuracy (%) per epoch

# Training loop.
# Per-epoch temporaries get training-specific names (epoch_loss / epoch_acc)
# so they cannot be confused with, or overwrite, the test-set metrics that other
# cells store under generic names such as `accuracy`.
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    n_correct = 0
    n_seen = 0

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        batch_loss = criterion(outputs, labels)
        batch_loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so the
        # epoch figure is a true per-sample mean even if the last batch is short.
        batch_n = labels.size(0)
        running_loss += batch_loss.item() * batch_n
        _, predicted = outputs.max(1)
        n_correct += (predicted == labels).sum().item()
        n_seen += batch_n

    epoch_loss = running_loss / n_seen
    epoch_acc = 100 * n_correct / n_seen
    train_loss_curve.append(epoch_loss)
    train_acc_curve.append(epoch_acc)

    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}%")

# Plot Loss & Accuracy Curve
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(range(1, num_epochs+1), train_loss_curve, marker='o', linestyle='-', color='b', label="Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("CNN Training Loss")
plt.legend()
plt.grid()

plt.subplot(1, 2, 2)
plt.plot(range(1, num_epochs+1), train_acc_curve, marker='o', linestyle='-', color='g', label="Accuracy")
plt.xlabel("Epochs")
plt.ylabel("Accuracy (%)")
plt.title("CNN Training Accuracy")
plt.legend()
plt.grid()

plt.show()


In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
import seaborn as sns

# Switch to inference behaviour before scoring the held-out split.
model.eval()

# Flat lists of predicted / actual class ids, filled batch by batch.
y_pred, y_true = [], []

# Gradients are not needed for prediction.
with torch.no_grad():
    for batch_inputs, batch_labels in test_loader:
        batch_scores = model(batch_inputs)
        # Index of the highest score in each row is the predicted class.
        batch_pred = batch_scores.max(1)[1]
        y_pred.extend(batch_pred.numpy())
        y_true.extend(batch_labels.numpy())

print("Predictions completed!")


In [None]:
# Headline test-set metrics; precision, recall and F1 use the "weighted" average.
accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1 = (
    scorer(y_true, y_pred, average="weighted")
    for scorer in (precision_score, recall_score, f1_score)
)

# Per-class breakdown.
print("\n📊 Classification Report:", classification_report(y_true, y_pred), sep="\n")

# Summary lines, one metric per line.
print(
    f"✅ Accuracy:  {accuracy:.4f}",
    f"✅ Precision: {precision:.4f}",
    f"✅ Recall:    {recall:.4f}",
    f"✅ F1 Score:  {f1:.4f}",
    sep="\n",
)


In [None]:
# Counts of (actual, predicted) class pairs on the test set.
cm = confusion_matrix(y_true, y_pred)

# Heatmap of the matrix, with the label encoder's classes on both axes.
heatmap_options = dict(
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=le.classes_,
    yticklabels=le.classes_,
)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, **heatmap_options)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


# PCA

In [None]:
from sklearn.decomposition import PCA

# PCA keeping as many components as needed to explain 95% of the variance.
# Fitted on the training split, then applied unchanged to the test split.
pca = PCA(n_components=0.95, random_state=42)
X_train_pca, X_test_pca = pca.fit_transform(X_train), pca.transform(X_test)

print(f"PCA reduced dimensions from {X_train.shape[1]} to {X_train_pca.shape[1]}")


In [None]:
# PCA features become float32 tensors, encoded labels become int64 (long) tensors.
X_train_tensor_pca, X_test_tensor_pca = (
    torch.tensor(features, dtype=torch.float32) for features in (X_train_pca, X_test_pca)
)
y_train_tensor, y_test_tensor = (
    torch.tensor(targets, dtype=torch.long) for targets in (y_train_enc, y_test_enc)
)

# Training data: shuffled mini-batches of 32.
train_dataset_pca = TensorDataset(X_train_tensor_pca, y_train_tensor)
train_loader_pca = DataLoader(train_dataset_pca, batch_size=32, shuffle=True)

# Test data: same batch size, original order.
test_dataset_pca = TensorDataset(X_test_tensor_pca, y_test_tensor)
test_loader_pca = DataLoader(test_dataset_pca, batch_size=32, shuffle=False)


In [None]:
# Seed PyTorch's global random generator so that the weight initialisation below
# (and anything that later draws from the global generator, such as DataLoader
# shuffling) is repeatable from run to run.
torch.manual_seed(42)

# A new, untrained instance of the same CNN_Model architecture, sized for the
# PCA feature count (the RBM-based `model` is not reused or modified).
model_pca = CNN_Model(X_train_pca.shape[1], num_classes)

# Define loss function & optimizer
criterion = nn.CrossEntropyLoss()
optimizer_pca = optim.Adam(model_pca.parameters(), lr=0.001)

print("CNN model initialized for PCA features!")


In [None]:
num_epochs = 20
train_loss_curve_pca = []  # mean training loss per epoch
train_acc_curve_pca = []   # training accuracy (%) per epoch

# Training loop for the PCA-feature classifier.
# The per-epoch accuracy must NOT be stored in `accuracy`: that name holds the
# RBM+CNN test accuracy (a 0-1 fraction) which the comparison cells read later.
# Overwriting it here made the comparison report the PCA training accuracy (in
# percent) as the RBM+CNN result.
for epoch in range(num_epochs):
    model_pca.train()
    running_loss = 0.0
    n_correct = 0
    n_seen = 0

    for inputs, labels in train_loader_pca:
        optimizer_pca.zero_grad()
        outputs = model_pca(inputs)
        batch_loss = criterion(outputs, labels)
        batch_loss.backward()
        optimizer_pca.step()

        # criterion returns the batch mean; weight it by the batch size so the
        # epoch figure is a true per-sample mean even if the last batch is short.
        batch_n = labels.size(0)
        running_loss += batch_loss.item() * batch_n
        _, predicted = outputs.max(1)
        n_correct += (predicted == labels).sum().item()
        n_seen += batch_n

    epoch_loss_pca = running_loss / n_seen
    epoch_acc_pca = 100 * n_correct / n_seen
    train_loss_curve_pca.append(epoch_loss_pca)
    train_acc_curve_pca.append(epoch_acc_pca)

    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {epoch_loss_pca:.4f}, Accuracy: {epoch_acc_pca:.2f}%")

# Plot Loss & Accuracy Curve
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(range(1, num_epochs+1), train_loss_curve_pca, marker='o', linestyle='-', color='b', label="Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("PCA+CNN Training Loss")
plt.legend()
plt.grid()

plt.subplot(1, 2, 2)
plt.plot(range(1, num_epochs+1), train_acc_curve_pca, marker='o', linestyle='-', color='g', label="Accuracy")
plt.xlabel("Epochs")
plt.ylabel("Accuracy (%)")
plt.title("PCA+CNN Training Accuracy")
plt.legend()
plt.grid()

plt.show()


In [None]:
# Flat lists of predicted / actual class ids for the PCA-feature classifier.
y_pred_pca, y_true_pca = [], []

# Score the held-out split in inference mode, without tracking gradients.
model_pca.eval()
with torch.no_grad():
    for batch_inputs, batch_labels in test_loader_pca:
        batch_scores = model_pca(batch_inputs)
        # Index of the highest score in each row is the predicted class.
        batch_pred = batch_scores.max(1)[1]
        y_pred_pca.extend(batch_pred.numpy())
        y_true_pca.extend(batch_labels.numpy())

# Headline metrics; precision, recall and F1 use the "weighted" average.
accuracy_pca = accuracy_score(y_true_pca, y_pred_pca)
precision_pca, recall_pca, f1_pca = (
    scorer(y_true_pca, y_pred_pca, average="weighted")
    for scorer in (precision_score, recall_score, f1_score)
)

# Per-class breakdown.
print("\n📊 PCA+CNN Classification Report:", classification_report(y_true_pca, y_pred_pca), sep="\n")

# Summary lines, one metric per line.
print(
    f"✅ Accuracy:  {accuracy_pca:.4f}",
    f"✅ Precision: {precision_pca:.4f}",
    f"✅ Recall:    {recall_pca:.4f}",
    f"✅ F1 Score:  {f1_pca:.4f}",
    sep="\n",
)


# COMPARISON OF RBM+CNN AND PCA+CNN

In [None]:
# Side-by-side text table of the test metrics of both pipelines.
print("\n🔹 Performance Comparison: RBM+CNN vs. PCA+CNN")
print(f"{'Metric':<15}{'RBM+CNN':<10}{'PCA+CNN':<10}")
print(f"{'-'*35}")

# One row per metric: name padded to 15 columns, then the two scores.
comparison_rows = (
    ("Accuracy", accuracy, accuracy_pca),
    ("Precision", precision, precision_pca),
    ("Recall", recall, recall_pca),
    ("F1 Score", f1, f1_pca),
)
for metric_name, rbm_score, pca_score in comparison_rows:
    print(f"{metric_name:<15}{rbm_score:.4f}    {pca_score:.4f}")


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Metric names and the matching scores of each pipeline, in the same order.
metrics = ["Accuracy", "Precision", "Recall", "F1 Score"]
rbm_values = [accuracy, precision, recall, f1]
pca_values = [accuracy_pca, precision_pca, recall_pca, f1_pca]

# One slot per metric; the two bars of a slot sit next to each other.
bar_width = 0.3
index = np.arange(len(metrics))

# Grouped bar chart: RBM bars at the slot position, PCA bars one bar-width right.
plt.figure(figsize=(8, 5))
bar_series = (
    (0, rbm_values, "RBM+CNN", "blue"),
    (bar_width, pca_values, "PCA+CNN", "green"),
)
for x_offset, series_values, series_label, series_color in bar_series:
    plt.bar(index + x_offset, series_values, bar_width, label=series_label, color=series_color)

# Titles, axis labels and ticks (ticks centred between the two bars of a slot).
plt.title("RBM+CNN vs PCA+CNN Performance Comparison")
plt.xlabel("Evaluation Metrics")
plt.ylabel("Score")
plt.xticks(index + bar_width / 2, metrics)
plt.ylim(0, 1)
plt.legend()
plt.grid(axis="y", linestyle="--", alpha=0.7)

# Render the figure.
plt.show()
