In [None]:
# Sentiment Analysis on IMDb Movie Reviews
# Algorithms: K-Means, Linear Perceptron, Multi-Layer Perceptron

# Step 1: Download and Extract IMDb Dataset
import os

dataset_url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset_tar = "aclImdb_v1.tar.gz"

if not os.path.exists('aclImdb'):
    print("Downloading IMDb dataset...")
    !wget $dataset_url
    print("Extracting dataset...")
    !tar -xzf $dataset_tar
else:
    print("IMDb dataset already downloaded and extracted.")

# Step 2: Import Libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score

# Step 3: Load Data from ./aclImdb/train (Positive & Negative Reviews)
# Only the 'pos' and 'neg' sub-folders are requested; the documents are
# shuffled by load_files so the slice below contains both classes.
dataset = load_files('./aclImdb/train', categories=['pos', 'neg'], shuffle=True)
X_raw, y = dataset.data, dataset.target

# Subsample for Project Requirement (10,000 samples)
X_raw = X_raw[:10000]
y = y[:10000]

# Reviews are decoded here; undecodable bytes are dropped rather than raising.
texts = [doc.decode('utf-8', errors='ignore') for doc in X_raw]

# Step 4: Split Dataset into Training and Testing
# The split is done on row indices BEFORE fitting TF-IDF so that the held-out
# reviews cannot influence the vocabulary or the IDF weights (data leakage).
# The split parameters (test_size, random_state) are unchanged.
sample_idx = np.arange(len(texts))
train_idx, test_idx = train_test_split(sample_idx, test_size=0.2, random_state=42)
y_train, y_test = y[train_idx], y[test_idx]

# Step 5: Text Vectorization with TF-IDF (fitted on the training reviews only)
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
vectorizer.fit([texts[i] for i in train_idx])

# X keeps one row per review in the original order; the train/test matrices
# are row selections of it.
X = vectorizer.transform(texts)
X_train, X_test = X[train_idx], X[test_idx]

# Part 1: K-Means Clustering
print("\n[ K-Means Clustering ]")
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
kmeans.fit(X_train)

# Map cluster ID to sentiment label (because clusters are unsupervised):
# each cluster is assigned the most frequent training label among its members.
y_train_arr = np.asarray(y_train)
# Fallback label for a cluster that received no training samples.
overall_majority = np.bincount(y_train_arr).argmax()

cluster_to_label = {}
for cluster_id in range(kmeans.n_clusters):
    member_labels = y_train_arr[kmeans.labels_ == cluster_id]
    if member_labels.size:
        # argmax returns the smallest label on a tie, same as scipy's mode.
        cluster_to_label[cluster_id] = np.bincount(member_labels).argmax()
    else:
        cluster_to_label[cluster_id] = overall_majority

# Translate predicted cluster ids to labels with one array lookup:
# position i of the lookup table holds the label assigned to cluster i.
label_lookup = np.array([cluster_to_label[c] for c in range(kmeans.n_clusters)])
predicted_clusters = kmeans.predict(X_test)
predicted_labels = label_lookup[predicted_clusters]
print(f"K-Means Accuracy: {accuracy_score(y_test, predicted_labels):.4f}")

# Part 2: Linear Perceptron Classifier
print("\n[ Linear Perceptron ]")
# Build and train in one expression; the seed is fixed at 42 and training is
# capped at 1000 iterations.
perceptron = Perceptron(random_state=42, max_iter=1000).fit(X_train, y_train)
y_pred_perceptron = perceptron.predict(X_test)
# Per-class precision / recall / F1 on the held-out split.
perceptron_report = classification_report(y_test, y_pred_perceptron)
print(perceptron_report)

# Part 3: Multi-Layer Perceptron (MLP) Classifier
print("\n[ Multi-Layer Perceptron (MLP) ]")
# One hidden layer of 100 units trained with Adam; the seed is fixed at 42.
# NOTE(review): max_iter=20 caps training at 20 iterations, so the optimizer
# may stop before converging -- confirm this budget is intentional.
mlp = MLPClassifier(
    solver='adam',
    hidden_layer_sizes=(100,),
    max_iter=20,
    random_state=42,
).fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)
# Per-class precision / recall / F1 on the held-out split.
mlp_report = classification_report(y_test, y_pred_mlp)
print(mlp_report)



IMDb dataset already downloaded and extracted.

[ K-Means Clustering ]
K-Means Accuracy: 0.5345

[ Linear Perceptron ]
              precision    recall  f1-score   support

           0       0.81      0.85      0.83       969
           1       0.85      0.81      0.83      1031

    accuracy                           0.83      2000
   macro avg       0.83      0.83      0.83      2000
weighted avg       0.83      0.83      0.83      2000


[ Multi-Layer Perceptron (MLP) ]
              precision    recall  f1-score   support

           0       0.83      0.85      0.84       969
           1       0.86      0.84      0.85      1031

    accuracy                           0.85      2000
   macro avg       0.85      0.85      0.85      2000
weighted avg       0.85      0.85      0.85      2000



