In [6]:
%pip install -q scikit-learn pandas

import os, re, numpy as np, pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
import zipfile
import urllib.request


\
url = "http://www.cs.cornell.edu/people/pabo/movie-review-data/mix20_rand700_tokens_cleaned.zip"
zip_path = "mix20_rand700_tokens_cleaned.zip"


if not os.path.exists(zip_path):
    print("Downloading dataset...")
    urllib.request.urlretrieve(url, zip_path)


data_folder = os.path.splitext(zip_path)[0]
if not os.path.exists(data_folder):
    print(" Unzipping dataset...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(data_folder)


if os.path.exists(os.path.join(data_folder, "tokens", "pos")):
    pos_dir = os.path.join(data_folder, "tokens", "pos")
    neg_dir = os.path.join(data_folder, "tokens", "neg")
    version = "Polarity v0.9/v1.0 (EMNLP 2002) – 700 pos + 700 neg"
elif os.path.exists(os.path.join(data_folder, "txt_sentoken", "pos")):
    pos_dir = os.path.join(data_folder, "txt_sentoken", "pos")
    neg_dir = os.path.join(data_folder, "txt_sentoken", "neg")
    version = "Polarity v2.0 (ACL 2004) – 1000 pos + 1000 neg"
else:
    raise FileNotFoundError("Could not detect dataset version.")


def load_data(pos_dir, neg_dir):
    texts, labels = [], []
    for folder, label in [(pos_dir, 1), (neg_dir, 0)]:
        for fname in os.listdir(folder):
            if not fname.endswith(".txt"):
                continue
            with open(os.path.join(folder, fname), encoding="utf-8", errors="ignore") as f:
                txt = f.read()
            # Remove rating hints like "10/10", "****"
            txt = re.sub(r"\d+/\d+|\*+", "", txt)
            texts.append(txt)
            labels.append(label)
    return texts, np.array(labels)


texts, labels = load_data(pos_dir, neg_dir)
print(f"\nLoaded {len(texts)} docs ({sum(labels)} pos / {len(labels)-sum(labels)} neg)")
print(f" Dataset version: {version}\n")


def evaluate_model(clf, X, y):
    return cross_val_score(clf, X, y, cv=3).mean() * 100


results = []
token_pattern = r"(?u)\b\w+\b"


Loaded 1400 docs (700 pos / 700 neg)
 Dataset version: Polarity v0.9/v1.0 (EMNLP 2002) – 700 pos + 700 neg



In [12]:
results = []

# (1) Unigrams (frequency)
vectorizer = CountVectorizer(binary=False, token_pattern=token_pattern, min_df=4)
X = vectorizer.fit_transform(texts)
results.append(["(1) unigrams (freq)", X.shape[1],
                evaluate_model(MultinomialNB(), X, labels),
                None,
                evaluate_model(LinearSVC(max_iter=5000), X, labels)])

# (2) Unigrams (presence)
vectorizer = CountVectorizer(binary=True, token_pattern=token_pattern, min_df=4)
X = vectorizer.fit_transform(texts)
results.append(["(2) unigrams (presence)", X.shape[1],
                evaluate_model(MultinomialNB(), X, labels),
                evaluate_model(LogisticRegression(max_iter=1000), X, labels),
                evaluate_model(LinearSVC(max_iter=5000), X, labels)])

# (3) Unigrams + Bigrams
vectorizer = CountVectorizer(binary=True, ngram_range=(1,2),
                             token_pattern=token_pattern, min_df=7)
X = vectorizer.fit_transform(texts)
results.append(["(3) unigrams+bigrams", X.shape[1],
                evaluate_model(MultinomialNB(), X, labels),
                evaluate_model(LogisticRegression(max_iter=1000), X, labels),
                evaluate_model(LinearSVC(max_iter=5000), X, labels)])

# (4) Bigrams only
vectorizer = CountVectorizer(binary=True, ngram_range=(2,2),
                             token_pattern=token_pattern, min_df=7)
X = vectorizer.fit_transform(texts)
results.append(["(4) bigrams only", X.shape[1],
                evaluate_model(MultinomialNB(), X, labels),
                evaluate_model(LogisticRegression(max_iter=1000), X, labels),
                evaluate_model(LinearSVC(max_iter=5000), X, labels)])


def adj_tokenizer(text):
    tokens = re.findall(r"\b\w+\b", text)
    return [w for w in tokens if re.match(r".*ly$|.*ous$|.*ful$|.*able$|.*ive$|.*less$|.*ic$|.*al$|.*est$|.*er$|good|bad|great|awful|excellent|poor", w.lower())]

vectorizer = CountVectorizer(tokenizer=adj_tokenizer, binary=True, min_df=4)
X = vectorizer.fit_transform(texts)
results.append(["(5) adjectives only", X.shape[1],
                evaluate_model(MultinomialNB(), X, labels),
                evaluate_model(LogisticRegression(max_iter=1000), X, labels),
                evaluate_model(LinearSVC(max_iter=5000), X, labels)])


vectorizer = CountVectorizer(binary=True, token_pattern=token_pattern, max_features=2633)
X = vectorizer.fit_transform(texts)
results.append(["(6) top 2633 unigrams", X.shape[1],
                evaluate_model(MultinomialNB(), X, labels),
                evaluate_model(LogisticRegression(max_iter=1000), X, labels),
                evaluate_model(LinearSVC(max_iter=5000), X, labels)])


df = pd.DataFrame(results, columns=["Features","#Features","NB","ME","SVM"])
df = df.drop_duplicates().reset_index(drop=True)

print("\n Final Results Table:\n")
print(df.to_string(index=False))





 Final Results Table:

               Features  #Features        NB        ME       SVM
    (1) unigrams (freq)      12960 78.713856       NaN 77.929162
(2) unigrams (presence)      12960 81.356511 83.142483 81.642634
   (3) unigrams+bigrams      24462 81.713552 82.571155 80.999776
       (4) bigrams only      15825 79.285948 77.857324 75.572016
    (5) adjectives only       1937 76.069975 73.356095 69.856142
  (6) top 2633 unigrams       2633 80.855183 80.712734 78.427273
