In [15]:
# ============================================================
# 1. IMPORT LIBRARIES
# ============================================================

import re
import numpy as np
import nltk
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec

In [16]:
# ============================================================
# 2. LOAD IMDB DATASET
# (Folder structure: aclImdb/train/pos, aclImdb/train/neg)
# ============================================================

train_data = load_files("aclImdb/train", categories=['pos','neg'])
test_data = load_files("aclImdb/test", categories=['pos','neg'])

X_train, y_train = train_data.data, train_data.target
X_test, y_test = test_data.data, test_data.target

# Convert bytes to string
X_train = [doc.decode("utf-8", errors="ignore") for doc in X_train]
X_test = [doc.decode("utf-8", errors="ignore") for doc in X_test]

In [17]:
# ============================================================
# 3. TEXT PREPROCESSING
# Lowercase + punctuation removal + stopwords + lemmatization
# ============================================================

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    tokens = word_tokenize(text)
    
    tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words and word.isalpha()
    ]
    
    return " ".join(tokens)

print("Preprocessing text...")

X_train_clean = [preprocess(doc) for doc in X_train]
X_test_clean = [preprocess(doc) for doc in X_test]

Preprocessing text...


In [18]:
# ============================================================
# 4. TF-IDF WITH N-GRAMS (1,2)
# ============================================================

print("Generating TF-IDF features...")

tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1,2)
)

X_train_tfidf = tfidf.fit_transform(X_train_clean)
X_test_tfidf = tfidf.transform(X_test_clean)

Generating TF-IDF features...


In [19]:
# ============================================================
# 5. WORD2VEC EMBEDDINGS
# ============================================================

print("Training Word2Vec model...")

tokenized_train = [doc.split() for doc in X_train_clean]

w2v_model = Word2Vec(
    sentences=tokenized_train,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4
)

def document_vector(tokens):
    vectors = [
        w2v_model.wv[word]
        for word in tokens
        if word in w2v_model.wv
    ]
    return np.mean(vectors, axis=0) if vectors else np.zeros(100)

X_train_w2v = np.array([document_vector(doc.split()) for doc in X_train_clean])
X_test_w2v = np.array([document_vector(doc.split()) for doc in X_test_clean])

Training Word2Vec model...


In [20]:
# ============================================================
# 6. DEFINE CLASSIFIERS
# ============================================================

models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Linear SVM": LinearSVC(),
    "Naive Bayes": MultinomialNB()
}

In [21]:
# ============================================================
# 7. EVALUATION FUNCTION
# ============================================================

def evaluate(model, X_train, X_test, y_train, y_test, name):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    
    print("\n==============================")
    print("Model:", name)
    print("==============================")
    print("Accuracy:", accuracy_score(y_test, preds))
    print("\nClassification Report:\n")
    print(classification_report(y_test, preds))
    print("Confusion Matrix:\n")
    print(confusion_matrix(y_test, preds))

In [22]:
# ============================================================
# 8. TRAIN & EVALUATE ON TF-IDF FEATURES
# ============================================================

print("\n######## TF-IDF FEATURES ########")

for name, model in models.items():
    evaluate(model, X_train_tfidf, X_test_tfidf, y_train, y_test, name)


######## TF-IDF FEATURES ########

Model: Logistic Regression
Accuracy: 0.88024

Classification Report:

              precision    recall  f1-score   support

           0       0.88      0.88      0.88     12500
           1       0.88      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000

Confusion Matrix:

[[10964  1536]
 [ 1458 11042]]

Model: Random Forest
Accuracy: 0.84228

Classification Report:

              precision    recall  f1-score   support

           0       0.83      0.85      0.84     12500
           1       0.85      0.83      0.84     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000

Confusion Matrix:

[[10679  1821]
 [ 2122 10378]]

Model: Linear SVM
Accuracy: 0.8662

Classification Report:

              

In [23]:
# ============================================================
# 9. TRAIN & EVALUATE ON WORD2VEC FEATURES
# (Skipping Naive Bayes for Word2Vec)
# ============================================================

print("\n######## WORD2VEC FEATURES ########")

for name, model in models.items():
    if name == "Naive Bayes":
        continue
    evaluate(model, X_train_w2v, X_test_w2v, y_train, y_test, name)


######## WORD2VEC FEATURES ########

Model: Logistic Regression
Accuracy: 0.84332

Classification Report:

              precision    recall  f1-score   support

           0       0.84      0.84      0.84     12500
           1       0.84      0.84      0.84     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000

Confusion Matrix:

[[10538  1962]
 [ 1955 10545]]

Model: Random Forest
Accuracy: 0.81472

Classification Report:

              precision    recall  f1-score   support

           0       0.82      0.81      0.81     12500
           1       0.81      0.82      0.82     12500

    accuracy                           0.81     25000
   macro avg       0.81      0.81      0.81     25000
weighted avg       0.81      0.81      0.81     25000

Confusion Matrix:

[[10107  2393]
 [ 2239 10261]]

Model: Linear SVM
Accuracy: 0.84416

Classification Report:

           