In [None]:
import glob #to find files with matching pattern
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt



from sklearn.feature_extraction.text import CountVectorizer #CountVectorizer is used to convert raw text into numeric feature matrix of word counts
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

#
import zipfile
import os


zip_path = "mix20_rand700_tokens.zip"

# Unpack the archive into ./movie_reviews; the glob patterns below expect the
# reviews under movie_reviews/tokens/pos and movie_reviews/tokens/neg.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall("movie_reviews")


def _read_review(path):
    """Return the full text of one review file, closing the handle afterwards."""
    # latin-1 assigns a character to every byte value, so decoding cannot fail
    # on the special characters that occur in some reviews.
    with open(path, encoding="latin-1") as review_file:
        return review_file.read()


# glob returns paths in arbitrary (filesystem-dependent) order; sorting them
# makes the document order, and therefore the seeded train/test split, the same
# on every machine.
pos_files = sorted(glob.glob("movie_reviews/tokens/pos/*.txt"))
neg_files = sorted(glob.glob("movie_reviews/tokens/neg/*.txt"))

pos_reviews = [_read_review(f) for f in pos_files]  # positive review strings
neg_reviews = [_read_review(f) for f in neg_files]  # negative review strings

texts = pos_reviews + neg_reviews
# Label 1 for every positive review, 0 for every negative one (same order as texts).
labels = [1] * len(pos_reviews) + [0] * len(neg_reviews)

if not texts:
    # Fail with a clear message instead of an IndexError on texts[0] below.
    raise FileNotFoundError(
        "No review files found under movie_reviews/tokens/{pos,neg}/*.txt"
    )

print("Total reviews:", len(texts))
print("Example review:\n", texts[0][:300])



# Hold out 20% of the reviews for testing and train on the remaining 80%.
# Stratifying on the labels keeps the positive/negative proportion equal in
# both parts; the fixed random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

# Feature extraction: every vocabulary word becomes one column.
# binary=True stores 1 when the word is present in a review (0 otherwise)
# instead of its count; stop_words="english" drops common function words.
vectorizer = CountVectorizer(stop_words="english", binary=True)

# The vocabulary is learned from the training reviews only ...
X_train_vec = vectorizer.fit_transform(X_train)
# ... and reused unchanged for the test reviews, so no test-only word is added.
X_test_vec = vectorizer.transform(X_test)

# Sparse 0/1 matrix of shape (number of training reviews, vocabulary size).
print("Feature matrix shape:", X_train_vec.shape)
# Naive Bayes (generative: models per-class word probabilities).
nb = MultinomialNB().fit(X_train_vec, y_train)
y_pred_nb = nb.predict(X_test_vec)

# Logistic Regression (discriminative: learns one weight per word).
lr = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)
y_pred_lr = lr.predict(X_test_vec)

# Linear SVM (separating hyperplane between positive and negative reviews).
svm = LinearSVC(max_iter=5000).fit(X_train_vec, y_train)
y_pred_svm = svm.predict(X_test_vec)

# Test accuracy of each classifier, as a percentage.
results = {
    "Naive Bayes": accuracy_score(y_test, y_pred_nb) * 100,
    "Logistic Regression": accuracy_score(y_test, y_pred_lr) * 100,
    "SVM (Linear)": accuracy_score(y_test, y_pred_svm) * 100,
}

df_results = pd.DataFrame(
    [(clf_name, clf_acc) for clf_name, clf_acc in results.items()],
    columns=["Classifier", "Accuracy (%)"],
)
print("\nResults:")
print(df_results)

Total reviews: 1400
Example review:
 filled with a tantalizing air of suspense , with a friend like harry is an unusual yet well-balanced mix of dark comedy , french thriller , and surreal drama . as i was watching the film , i found myself groping for its message at each turn of the plot . its ultimate effect is comparable to claude c
Train size: 1120, Test size: 280
Feature matrix shape: (1120, 32238)

Results:
            Classifier  Accuracy (%)
0          Naive Bayes     80.000000
1  Logistic Regression     83.214286
2         SVM (Linear)     83.214286


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Frequency features: each cell holds how often the term occurs in the review
# (English stop words removed). The vocabulary comes from the training split.
vec_freq = CountVectorizer(binary=False, stop_words="english")
X_train_f = vec_freq.fit_transform(X_train)
X_test_f = vec_freq.transform(X_test)

# Presence features: same vocabulary rules, but every non-zero count is stored
# as 1, so only the presence of a word matters.
vec_bin = CountVectorizer(stop_words="english", binary=True)
X_train_b = vec_bin.fit_transform(X_train)
X_test_b = vec_bin.transform(X_test)


In [None]:
# Presence features over unigrams (single words) and bigrams (two consecutive
# words) together: ngram_range=(1, 2) keeps n-grams of length 1 and 2.
vec_bi = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    binary=True,
)
X_train_bi = vec_bi.fit_transform(X_train)
X_test_bi = vec_bi.transform(X_test)


In [None]:
# Presence features over bigrams only: ngram_range=(2, 2) discards single words.
vec_bigrams = CountVectorizer(
    stop_words="english",
    ngram_range=(2, 2),
    binary=True,
)
X_train_bigrams = vec_bigrams.fit_transform(X_train)
X_test_bigrams = vec_bigrams.transform(X_test)


In [None]:
# spaCy supplies the part-of-speech tags (token.pos_) that the helpers in this notebook filter on.
import spacy
# The import alone loads no model; spacy.load builds the "en_core_web_sm" pipeline and `nlp` is the shared pipeline object.
nlp = spacy.load("en_core_web_sm")

def keep_adjectives(texts):
    """Reduce each text to its adjectives.

    Every text is run through the module-level spaCy pipeline ``nlp`` (in
    batches of 20, with the "ner" and "parser" components disabled) and only
    the tokens tagged ``ADJ`` are kept.

    Args:
        texts: iterable of raw text strings.

    Returns:
        A list with one string per input text: the text's adjectives, in their
        original order, joined by single spaces (empty string if there are none).
    """
    parsed_docs = nlp.pipe(texts, batch_size=20, disable=["ner","parser"])
    return [
        " ".join(tok.text for tok in parsed if tok.pos_ == "ADJ")
        for parsed in parsed_docs
    ]

# Run the (slow) spaCy adjective filter exactly once per split.
train_adj_texts = keep_adjectives(X_train)
test_adj_texts = keep_adjectives(X_test)

# One vectorizer: its vocabulary is learned from the training adjectives and the
# same fitted object then encodes the test adjectives, so both matrices share
# their columns without re-filtering or re-fitting on the training set.
vec_adj = CountVectorizer(binary=True)
X_train_adj = vec_adj.fit_transform(train_adj_texts)
X_test_adj = vec_adj.transform(test_adj_texts)


In [None]:
# Load the spaCy "en_core_web_sm" pipeline that pos_preprocessor (defined below) calls as `nlp`.
# NOTE(review): an earlier cell already assigns the same pipeline to `nlp`, so on a
# top-to-bottom run this second load looks redundant — confirm before removing.
nlp = spacy.load("en_core_web_sm")

# ---- Preprocessor for Unigrams + POS ----
def pos_preprocessor(text):
    """Rewrite a text as space-separated ``word_POS`` tokens.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Tokens that
    are not purely alphabetic, or that spaCy flags as stop words, are dropped;
    each remaining token becomes its lower-cased text, an underscore, and its
    coarse part-of-speech tag (``token.pos_``).

    Args:
        text: raw document string.

    Returns:
        The tagged tokens joined by single spaces.
    """
    tagged = []
    for tok in nlp(text):
        if not tok.is_alpha or tok.is_stop:
            continue
        tagged.append(f"{tok.text.lower()}_{tok.pos_}")
    return " ".join(tagged)

# ---- Preprocessor for Unigrams + Position ----
def position_preprocessor(text):
    """Rewrite a text as space-separated ``word_region`` tokens.

    The text is split on whitespace, non-alphabetic pieces are dropped and the
    rest is lower-cased. Each word is then tagged with the coarse region of the
    document it falls in: ``first`` (first quarter), ``last`` (last quarter) or
    ``middle`` (everything in between).

    Tagging a word with its absolute token index would make nearly every
    feature unique to one document, so a classifier could not reuse it on
    unseen reviews; a region tag keeps the position signal while letting the
    same word+region feature recur across documents of any length.

    Args:
        text: raw document string.

    Returns:
        The tagged words joined by single spaces ("" when no alphabetic word
        is present).
    """
    tokens = [w.lower() for w in text.split() if w.isalpha()]
    total = len(tokens)

    def _region(index):
        # Integer arithmetic avoids float rounding at the quarter boundaries.
        if 4 * index < total:
            return "first"
        if 4 * index >= 3 * total:
            return "last"
        return "middle"

    return " ".join(f"{word}_{_region(i)}" for i, word in enumerate(tokens))


In [None]:
# Presence features restricted to the 2633 most frequent vocabulary terms
# (max_features keeps the top terms by corpus frequency).
# NOTE(review): 2633 is unexplained here — presumably chosen to match a feature
# count from the study being replicated; TODO confirm.
vec_top = CountVectorizer(
    max_features=2633,
    stop_words="english",
    binary=True,
)
X_train_top = vec_top.fit_transform(X_train)
X_test_top = vec_top.transform(X_test)


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

def evaluate_models(X_train, X_test, y_train, y_test):
    """Fit the three classifiers on one feature matrix and score them.

    Args:
        X_train: training feature matrix.
        X_test: test feature matrix with the same columns as ``X_train``.
        y_train: training labels.
        y_test: test labels.

    Returns:
        dict mapping "NB" (MultinomialNB), "ME" (LogisticRegression,
        max_iter=1000) and "SVM" (LinearSVC, max_iter=5000), in that order, to
        the test accuracy as a percentage.
    """
    classifiers = (
        ("NB", MultinomialNB()),
        ("ME", LogisticRegression(max_iter=1000)),
        ("SVM", LinearSVC(max_iter=5000)),
    )

    scores = {}
    for label, model in classifiers:
        fitted = model.fit(X_train, y_train)
        scores[label] = accuracy_score(y_test, fitted.predict(X_test)) * 100
    return scores



# Each experiment is (row label, vectorizer that builds its feature matrix).
experiments = [
    ("Unigrams (freq)", CountVectorizer(stop_words="english", binary=False)),
    ("Unigrams (presence)", CountVectorizer(stop_words="english", binary=True)),
    ("Unigrams+Bigrams (presence)", CountVectorizer(stop_words="english", binary=True, ngram_range=(1,2))),
    ("Bigrams only", CountVectorizer(stop_words="english", binary=True, ngram_range=(2,2))),
    ("Adjectives only", CountVectorizer(stop_words="english", binary=True)),

    ("Top 2633 unigrams", CountVectorizer(stop_words="english", binary=True, max_features=2633)),
    # These two tag every word inside a custom preprocessor before counting.
    ("Unigrams + POS", CountVectorizer(binary=True, preprocessor=pos_preprocessor)),
    ("Unigrams + Position", CountVectorizer(binary=True, preprocessor=position_preprocessor)),
]

# Experiments whose reviews must be filtered before vectorizing. Without this
# entry "Adjectives only" would vectorize the full reviews and simply repeat
# the "Unigrams (presence)" row.
text_filters = {"Adjectives only": keep_adjectives}

results = []

for name, vectorizer in experiments:
    text_filter = text_filters.get(name)
    train_texts = text_filter(X_train) if text_filter else X_train
    test_texts = text_filter(X_test) if text_filter else X_test

    # Vocabulary is learned on the training texts and reused for the test texts.
    Xtr = vectorizer.fit_transform(train_texts)
    Xte = vectorizer.transform(test_texts)

    res = evaluate_models(Xtr, Xte, y_train, y_test)
    vocab_size = len(vectorizer.get_feature_names_out())  # number of features

    results.append([name, vocab_size, res["NB"], res["ME"], res["SVM"]])

df = pd.DataFrame(results, columns=["Features", "#Features", "NB", "ME", "SVM"])
df.index += 1  # number the experiments from 1 in the printed table
print(df)



                      Features  #Features         NB         ME        SVM
1              Unigrams (freq)      32238  76.785714  78.214286  77.142857
2          Unigrams (presence)      32238  80.000000  83.214286  83.214286
3  Unigrams+Bigrams (presence)     325558  78.571429  82.857143  82.857143
4                 Bigrams only     293320  76.428571  73.928571  74.642857
5              Adjectives only      32238  80.000000  83.214286  83.214286
6            Top 2633 unigrams       2633  77.142857  81.428571  81.071429
7               Unigrams + POS      40048  81.071429  85.714286  85.714286
8          Unigrams + Position     410241  59.285714  61.071429  60.714286


In [None]:

# Logistic regression implemented by hand (batch gradient descent).

# Labels as NumPy arrays so they take part in vectorized arithmetic.
y_train_arr = np.array(y_train)
y_test_arr = np.array(y_test)

# Dense copies of the sparse presence matrices.
X_train_manual = X_train_vec.toarray()
X_test_manual = X_test_vec.toarray()

# Prepend a column of ones so that the first weight acts as the bias term.
X_train_manual = np.concatenate(
    [np.ones((X_train_manual.shape[0], 1)), X_train_manual], axis=1
)
X_test_manual = np.concatenate(
    [np.ones((X_test_manual.shape[0], 1)), X_test_manual], axis=1
)

# Logistic (sigmoid) function
def sigmoid(z):
    """Return 1 / (1 + exp(-z)) element-wise without overflowing.

    Evaluating ``np.exp(-z)`` directly overflows (RuntimeWarning) once ``z`` is
    a large negative number. Here the exponential is always taken of a
    non-positive value, so it stays in [0, 1], and the algebraically equal form
    ``exp(z) / (1 + exp(z))`` is used for negative ``z``.

    Args:
        z: scalar or array-like of real numbers.

    Returns:
        The sigmoid of ``z``: a NumPy array of the same shape, or a NumPy
        scalar when ``z`` is a scalar.
    """
    z = np.asarray(z)
    if not np.issubdtype(z.dtype, np.floating):
        z = z.astype(float)

    decay = np.exp(-np.abs(z))  # exp of a non-positive number: never overflows
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result into a scalar and leaves arrays as they are.
    return result[()]

# Binary cross-entropy loss
def compute_loss(X, y, theta):
    """Mean binary cross-entropy of the logistic model ``sigmoid(X @ theta)``.

    Args:
        X: feature matrix, one row per sample.
        y: array of 0/1 labels, one per row of ``X``.
        theta: weight vector, one entry per column of ``X``.

    Returns:
        The average negative log-likelihood of ``y`` under the predicted
        probabilities.
    """
    n_samples = len(y)
    probs = sigmoid(X @ theta)
    tiny = 1e-15  # keeps both log arguments strictly positive
    log_likelihood = y * np.log(probs + tiny) + (1 - y) * np.log(1 - probs + tiny)
    return -(1 / n_samples) * np.sum(log_likelihood)

# Batch gradient descent
def gradient_descent(X, y, learning_rate=0.1, num_iters=1000):
    """Fit logistic-regression weights with full-batch gradient descent.

    Weights start at zero. Every iteration takes one step against the gradient
    of the cross-entropy loss, then records the loss of the updated weights.
    Progress is printed on every 100th iteration and on the last one.

    Args:
        X: feature matrix, one row per sample.
        y: array of 0/1 labels, one per row of ``X``.
        learning_rate: step size of each update.
        num_iters: number of update steps.

    Returns:
        (theta, losses): the final weight vector and the list of losses, one
        per iteration.
    """
    n_samples, n_features = X.shape
    weights = np.zeros(n_features)
    history = []
    last_iter = num_iters - 1

    for i in range(num_iters):
        residual = sigmoid(X @ weights) - y
        step = (1/n_samples) * (X.T @ residual)
        weights = weights - learning_rate * step

        # Loss is measured after the update, so history[0] already reflects one step.
        current_loss = compute_loss(X, y, weights)
        history.append(current_loss)

        if i == last_iter or i % 100 == 0:
            print(f"Iteration {i}: Loss = {current_loss:.6f}")

    return weights, history

# Train the hand-written logistic regression on the dense training matrix.
theta, losses = gradient_descent(
    X_train_manual,
    y_train_arr,
    num_iters=1000,
    learning_rate=0.1,
)

# Smallest recorded loss and the (first) iteration at which it occurred.
min_loss = min(losses)
min_loss_iter = losses.index(min_loss)
print(f"\nMinimum Loss: {min_loss:.6f} at Iteration {min_loss_iter}")

# Training loss of the returned weights.
final_loss = compute_loss(X_train_manual, y_train_arr, theta)
print(f"Final Training Loss: {final_loss:.6f}")

# Only the first 10 weights are shown for readability; theta[0] belongs to the
# column of ones, i.e. the bias.
print("\nFirst 10 Optimized Weights (theta):")
print(theta[:10])

# Test-set evaluation: probability >= 0.5 is predicted as class 1.
y_pred_prob = sigmoid(X_test_manual @ theta)
y_pred = (y_pred_prob >= 0.5).astype(int)
accuracy = (y_pred == y_test_arr).mean() * 100
print(f"\nManual Logistic Regression Accuracy: {accuracy:.2f}%")


# Loss trend: compare the first recorded loss with the last one.
initial_loss, final_loss = losses[0], losses[-1]

print(f"Initial Loss: {initial_loss:.6f}")
print(f"Final Loss: {final_loss:.6f}")

print(
    "Loss decreased over training — Gradient Descent converged properly."
    if final_loss < initial_loss
    else "Loss did NOT decrease — model did not converge correctly."
)

# Head and tail of the loss history, to eyeball the trend.
print("\nFirst 5 losses:", losses[:5])
print("Last 5 losses:", losses[-5:])

Iteration 0: Loss = 0.681028
Iteration 100: Loss = 0.281804
Iteration 200: Loss = 0.183811
Iteration 300: Loss = 0.136463
Iteration 400: Loss = 0.108319
Iteration 500: Loss = 0.089654
Iteration 600: Loss = 0.076383
Iteration 700: Loss = 0.066473
Iteration 800: Loss = 0.058797
Iteration 900: Loss = 0.052682
Iteration 999: Loss = 0.047744

Minimum Loss: 0.047744 at Iteration 999
Final Training Loss: 0.047744

First 10 Optimized Weights (theta):
[-0.23901885 -0.03960387 -0.03866037 -0.01959266 -0.01343476 -0.00378508
 -0.01046234 -0.00813789  0.00357438 -0.00628475]

Manual Logistic Regression Accuracy: 81.07%
Initial Loss: 0.681028
Final Loss: 0.047744
Loss decreased over training — Gradient Descent converged properly.

First 5 losses: [np.float64(0.6810283820870058), np.float64(0.6697831607949025), np.float64(0.6589931830931941), np.float64(0.64860046967171), np.float64(0.6385820077456074)]
Last 5 losses: [np.float64(0.047925616514381056), np.float64(0.04788001777606783), np.float64(0.0