1. Data loading

In [26]:
# Import libraries
import pandas as pd
import numpy as np
import math
import re
from sklearn.model_selection import train_test_split

# Load the data
with open('positive-reviews.txt', 'r', encoding='utf-8') as f:
    positive_reviews = f.readlines()

with open('negative-reviews.txt', 'r', encoding='utf-8') as f:
    negative_reviews = f.readlines()

with open('positive-words.txt', 'r', encoding='utf-8') as f:
    positive_words = set(f.read().splitlines())

with open('negative-words.txt', 'r', encoding='utf-8') as f:
    negative_words = set(f.read().splitlines())

# Split data into train and test sets (80/20 split)
positive_train, positive_test = train_test_split(positive_reviews, test_size=0.2, random_state=42)
negative_train, negative_test = train_test_split(negative_reviews, test_size=0.2, random_state=42)

# Combine and label data
train_data = [(review, 1) for review in positive_train] + [(review, 0) for review in negative_train]
test_data = [(review, 1) for review in positive_test] + [(review, 0) for review in negative_test]

# Shuffle the data
np.random.shuffle(train_data)
np.random.shuffle(test_data)

negative_words

{'crap',
 'hegemony',
 'skimpy',
 'uproariously',
 'harrow',
 'victimize',
 'torment',
 'coerce',
 'futilely',
 'vulgar',
 'depravedly',
 'sneer',
 'frenzy',
 'slanderous',
 'smallish',
 'suffering',
 'betraying',
 'skittishly',
 'trapped',
 'unlawful',
 'unbelievably',
 'ingrate',
 'fugitive',
 'helplessness',
 'gnawing',
 'ironic',
 'taboo',
 'sarcasm',
 'dejection',
 'unacceptably',
 'ungrateful',
 'isolation',
 'frozen',
 'twist',
 'remorseless',
 'bicker',
 'devious',
 'damnably',
 'inconsistent',
 'latency',
 'radical',
 'dissed',
 'mulish',
 'shrouded',
 'tangle',
 'disdain',
 'delirious',
 'syndrome',
 'spookier',
 'debaser',
 'exasperated',
 'rejects',
 'villify',
 'nemesis',
 'nonsense',
 'dungeons',
 'inconstant',
 'sags',
 'betrays',
 'irk',
 'sidetrack',
 'antagonistic',
 'pale',
 'sidetracked',
 'confuses',
 'dejected',
 'famine',
 'lies',
 'draconian',
 'forgetfulness',
 'disparage',
 'mendacious',
 'collapse',
 'unwieldy',
 'dissenter',
 'bewildering',
 'senselessly',
 

2. Preprocessing

In [27]:
# Define preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize
    return text.split()

# Apply preprocessing to training and test data
train_data = [(preprocess_text(review), label) for review, label in train_data]
test_data = [(preprocess_text(review), label) for review, label in test_data]


3. Feature Engineering

In [28]:
# Define feature extraction function
def extract_features(review, positive_words, negative_words):
    features = {}
    # Count positive words
    features['positive_word_count'] = sum(1 for word in review if word in positive_words)
    # Count negative words
    features['negative_word_count'] = sum(1 for word in review if word in negative_words)
    # Binary indicator for 'no'
    features['contains_no'] = 1 if 'no' in review else 0
    # Count pronouns
    pronouns = {'i', 'me', 'my', 'you', 'your'}
    features['pronoun_count'] = sum(1 for word in review if word in pronouns)
    # Binary indicator for '!'
    features['contains_exclamation'] = 1 if '!' in ''.join(review) else 0
    # Log of review length
    features['log_length'] = math.log(len(review)) if len(review) > 0 else 0
    return features

# Extract features from training and test data
X_train = [extract_features(review, positive_words, negative_words) for review, _ in train_data]
y_train = [label for _, label in train_data]

X_test = [extract_features(review, positive_words, negative_words) for review, _ in test_data]
y_test = [label for _, label in test_data]

# Convert feature dictionaries to DataFrames
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)


In [29]:
X_train.shape
X_train.tail()

Unnamed: 0,positive_word_count,negative_word_count,contains_no,pronoun_count,contains_exclamation,log_length
31995,0,0,0,0,0,0.693147
31996,0,1,0,0,0,1.098612
31997,1,0,0,0,0,2.079442
31998,0,1,0,0,0,0.0
31999,0,2,0,0,0,2.70805


4. Evaluation and Comparision

In [30]:
# Import models and evaluation metric
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Train Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
lr_acc = accuracy_score(y_test, lr_pred)

# Train Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_acc = accuracy_score(y_test, rf_pred)

# Train SVM
svm = SVC()
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
svm_acc = accuracy_score(y_test, svm_pred)


In [31]:
# Ensure CELL INDEX: 5 is executed before this cell
# Print accuracy results
print("Logistic Regression Accuracy:", lr_acc)
print("Random Forest Accuracy:", rf_acc)
print("SVM Accuracy:", svm_acc)

Logistic Regression Accuracy: 0.817625
Random Forest Accuracy: 0.818625
SVM Accuracy: 0.818375
