In [None]:
import zipfile
import os


# Archive to unpack and the directory it is unpacked into; both paths are
# relative to the current working directory.
zip_path = "mix20_rand700_tokens.zip"
extract_dir = "movie_reviews"

# Extract every member of the archive under extract_dir. The with-block
# closes the archive as soon as extraction finishes.
with zipfile.ZipFile(zip_path, mode='r') as zip_ref:
    zip_ref.extractall(path=extract_dir)

# Last expression of the cell: display the top-level entries that were extracted.
os.listdir(extract_dir)



['tokens', 'README']

In [None]:
import glob


def _read_review(path):
    """Return the full contents of the review file at ``path``.

    The file is decoded as latin-1 and is opened in a ``with`` block, so the
    handle is closed before the text is returned instead of being left for
    the garbage collector.
    """
    with open(path, encoding="latin-1") as review_file:
        return review_file.read()


# Paths of the review files, one list per label directory.
# NOTE: glob returns paths in an arbitrary, filesystem-dependent order.
pos_files = glob.glob("movie_reviews/tokens/pos/*.txt")
neg_files = glob.glob("movie_reviews/tokens/neg/*.txt")

# Read each file through the helper so no file handle is leaked.
pos_reviews = [_read_review(f) for f in pos_files]
neg_reviews = [_read_review(f) for f in neg_files]

# Reviews from pos/ come first and are labelled 1; reviews from neg/ follow
# and are labelled 0, so texts[i] and labels[i] always line up.
texts = pos_reviews + neg_reviews
labels = [1]*len(pos_reviews) + [0]*len(neg_reviews)

print("Total reviews:", len(texts))
print("Example review:\n", texts[0][:500])  # print first 500 chars

Total reviews: 1400
Example review:
 rating : * * * out of * * * * - 7 . 0 out of 10 . 0 cast : tom everett scott ( guy patterson ) , liv tyler ( faye dolan ) , johnathon schaech ( james 'jimmy' mattingly ii ) , steve zahn ( lenny ) , ethan randall ( the bass player ) , tom hanks ( mr . white ) director : tom hanks certification : pg ( usa ) year of production : 1996 academy award nominations : best original song that thing you do ! , from first-time film director tom hanks , is an enjoyable tale about a fictional band , the wonder


In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


# Bag-of-words features with binary=True (term presence rather than term
# counts) and scikit-learn's built-in English stop-word list removed.
# The vocabulary is learned from every document in `texts`.
vectorizer = CountVectorizer(
    stop_words="english",
    binary=True,
)
X = vectorizer.fit_transform(texts)

# Report (number of documents, number of vocabulary terms).
feature_matrix_shape = X.shape
print("Feature matrix shape:", feature_matrix_shape)


Feature matrix shape: (1400, 35307)


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
import numpy as np

# Score a multinomial Naive Bayes classifier on X with 10-fold
# cross-validation; one score is returned per fold.
nb = MultinomialNB()
nb_scores = cross_val_score(estimator=nb, X=X, y=labels, cv=10)

# Show the per-fold scores, then their average.
nb_mean_accuracy = np.mean(nb_scores)
print("Naive Bayes Fold Accuracies:", nb_scores)
print("Naive Bayes Mean Accuracy:", nb_mean_accuracy)


Naive Bayes Fold Accuracies: [0.81428571 0.81428571 0.77857143 0.88571429 0.80714286 0.83571429
 0.81428571 0.76428571 0.80714286 0.82857143]
Naive Bayes Mean Accuracy: 0.8150000000000001


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import numpy as np

# Score a logistic regression classifier on X with 10-fold cross-validation.
# max_iter is raised to 1000 to give the solver more iterations to converge.
lr = LogisticRegression(max_iter=1000)
lr_scores = cross_val_score(estimator=lr, X=X, y=labels, cv=10)

# Show the per-fold scores, then their average.
lr_mean_accuracy = np.mean(lr_scores)
print("Logistic Regression Fold Accuracies:", lr_scores)
print("Logistic Regression Mean Accuracy:", lr_mean_accuracy)


Logistic Regression Fold Accuracies: [0.85714286 0.80714286 0.81428571 0.89285714 0.8        0.82857143
 0.84285714 0.86428571 0.78571429 0.81428571]
Logistic Regression Mean Accuracy: 0.8307142857142857


In [None]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
import numpy as np

# Score a linear support vector classifier on X with 10-fold
# cross-validation. max_iter is raised to 5000 to give the solver more
# iterations to converge.
svm = LinearSVC(max_iter=5000)
svm_scores = cross_val_score(estimator=svm, X=X, y=labels, cv=10)

# Show the per-fold scores, then their average.
svm_mean_accuracy = np.mean(svm_scores)
print("SVM Fold Accuracies:", svm_scores)
print("SVM Mean Accuracy:", svm_mean_accuracy)



SVM Fold Accuracies: [0.85       0.77857143 0.79285714 0.87857143 0.79285714 0.79285714
 0.82142857 0.86428571 0.79285714 0.82142857]
SVM Mean Accuracy: 0.8185714285714285


In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import numpy as np

# One estimator per model, configured as in the individual evaluation cells.
nb = MultinomialNB()
lr = LogisticRegression(max_iter=1000)
svm = LinearSVC(max_iter=5000)

# Run 10-fold cross-validation for each estimator, in the order NB -> LR -> SVM.
nb_scores, lr_scores, svm_scores = (
    cross_val_score(model, X, labels, cv=10) for model in (nb, lr, svm)
)

# Mean fold accuracy per classifier, expressed as a percentage.
results = {
    "Naive Bayes": np.mean(nb_scores) * 100,
    "Logistic Regression": np.mean(lr_scores) * 100,
    "SVM (Linear)": np.mean(svm_scores) * 100,
}

# Two-column summary table: one row per classifier, in insertion order.
summary_rows = list(results.items())
summary_columns = ["Classifier", "Accuracy (%)"]
df_results = pd.DataFrame(summary_rows, columns=summary_columns)
print(df_results)


            Classifier  Accuracy (%)
0          Naive Bayes     81.500000
1  Logistic Regression     83.071429
2         SVM (Linear)     81.857143
