## Importing the necessary libraries

In [None]:
import math
import re
import string

import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Load words

In [None]:
def load_lexicon(path, sentiment):
    """Read a one-word-per-line lexicon file into a labelled DataFrame.

    Args:
        path: Location of the text file (decoded as latin-1).
        sentiment: Integer label stored in the ``sentiment`` column of
            every row (0 = negative, 1 = positive in this notebook).

    Returns:
        DataFrame with the columns ``word`` and ``sentiment``.
    """
    # The context manager closes the file handle deterministically.
    with open(path, encoding="latin-1") as handle:
        lines = [line.strip() for line in handle.read().splitlines()]

    # Blank lines are not words. Lines starting with ';' are assumed to be
    # header comments (common in published opinion lexicons -- TODO confirm
    # against the data files); no real word starts with ';', so dropping
    # them is harmless either way.
    entries = [line for line in lines if line and not line.startswith(";")]

    frame = pd.DataFrame(entries, columns=["word"])
    frame["sentiment"] = sentiment
    return frame


neg_words = load_lexicon("../data/negative-words.txt", 0)
pos_words = load_lexicon("../data/positive-words.txt", 1)

# Sets give O(1) membership tests during feature extraction.
neg_set = set(neg_words["word"])
pos_set = set(pos_words["word"])

# Combine both lexicons, then shuffle the rows. A fixed seed keeps the
# row order identical between notebook runs.
word_df = pd.concat([pos_words, neg_words], ignore_index=True)
word_df = word_df.sample(frac=1, random_state=42).reset_index(drop=True)

word_df.head(20)

## Preparing our dataset

In [None]:
from sklearn.model_selection import train_test_split

def load_reviews(path, sentiment):
    """Read a one-review-per-line text file into a labelled DataFrame.

    Args:
        path: Location of the text file (decoded as latin-1).
        sentiment: Integer label stored in the ``sentiment`` column of
            every row (0 = negative, 1 = positive in this notebook).

    Returns:
        DataFrame with the columns ``review`` and ``sentiment``.
    """
    # The context manager closes the file handle deterministically.
    with open(path, encoding="latin-1") as handle:
        frame = pd.DataFrame(handle.read().splitlines(), columns=["review"])
    frame["sentiment"] = sentiment
    return frame


neg_reviews = load_reviews("../data/negative-reviews.txt", 0)
pos_reviews = load_reviews("../data/positive-reviews.txt", 1)

# Split each class into training and testing sets (80-20) on its own.
# Passing both frames to a single train_test_split call requires them to
# have exactly the same number of rows and raises ValueError otherwise;
# separate calls work for any class sizes and give the identical split
# when the sizes do match (same random_state, same n).
neg_train, neg_test = train_test_split(neg_reviews, test_size=0.2, random_state=42)
pos_train, pos_test = train_test_split(pos_reviews, test_size=0.2, random_state=42)

# Combine and shuffle train set (seeded so reruns are reproducible)
train = pd.concat([neg_train, pos_train], ignore_index=True)
train = train.sample(frac=1, random_state=42).reset_index(drop=True)

# Combine and shuffle test set (seeded so reruns are reproducible)
test = pd.concat([neg_test, pos_test], ignore_index=True)
test = test.sample(frac=1, random_state=42).reset_index(drop=True)

print("Shuffled train set: \n", train.head())
print('\n')
print("Shuffled test set: \n", test.head())

In [None]:
# Contraction -> expansion table, built once instead of on every call.
_CONTRACTIONS = {
    "doesn't": "does not", "don't": "do not", "isn't": "is not",
    "wasn't": "was not", "didn't": "did not", "won't": "will not",
    "can't": "can not", "couldn't": "could not", "shouldn't": "should not",
    "wouldn't": "would not", "haven't": "have not", "hasn't": "has not",
    "hadn't": "had not", "aren't": "are not", "weren't": "were not",
}

# One alternation over all contractions. re.ASCII restricts the
# case-insensitive matching to ASCII letters, so every match lower-cases
# to a key of _CONTRACTIONS.
_CONTRACTION_RE = re.compile(
    "|".join(re.escape(contraction) for contraction in _CONTRACTIONS),
    re.IGNORECASE | re.ASCII,
)


def preprocess_text(text, *, lowercase=True):
    """Expand common negative contractions ("don't" -> "do not").

    Args:
        text: Review text.
        lowercase: When True (the default) the whole text is lower-cased
            before expansion. When False the original casing is kept and
            contractions are matched case-insensitively ("DON'T" still
            becomes "do not").

    Returns:
        The text with every listed contraction replaced by its expansion.
    """
    if lowercase:
        text = text.lower()
    return _CONTRACTION_RE.sub(
        lambda match: _CONTRACTIONS[match.group(0).lower()], text
    )


# Apply before feature extraction.
# Casing is preserved on purpose: extract_features counts all-caps words
# with str.isupper(), which is always False on lower-cased text. The
# word-level features lower-case the text themselves, and TfidfVectorizer
# lower-cases by default, so nothing else depends on the stored casing.
train["review"] = train["review"].apply(
    lambda text: preprocess_text(text, lowercase=False)
)
test["review"] = test["review"].apply(
    lambda text: preprocess_text(text, lowercase=False)
)

## Feature extraction

In [None]:
pronouns = {"i", "me", "my", "you", "your"}

# Word lists used by extract_features, built once instead of per call.
_INTENSIFIERS = frozenset(
    {"very", "extremely", "really", "absolutely", "totally", "completely"}
)
_NEGATIONS = frozenset(
    {"not", "never", "neither", "nobody", "nothing", "nowhere", "no"}
)


def _count_in(tokens, bare_tokens, vocabulary):
    """Count tokens found in *vocabulary*, with or without edge punctuation."""
    return sum(
        1
        for token, bare in zip(tokens, bare_tokens)
        # `bare` is empty for punctuation-only tokens; skip those so an
        # empty-string entry in a vocabulary can never match them.
        if token in vocabulary or (bare and bare in vocabulary)
    )


def extract_features(review, pos_lexicon=None, neg_lexicon=None):
    """Compute hand-crafted numeric features for one review.

    Args:
        review: Review text. Original casing is needed for ``caps_count``
            to be non-zero.
        pos_lexicon: Optional collection of positive words; defaults to
            the module-level ``pos_set``.
        neg_lexicon: Optional collection of negative words; defaults to
            the module-level ``neg_set``.

    Returns:
        A list of 12 numbers, in this order: has_exclaim, pronoun_count,
        pos_count, neg_count, has_no, log_length, question_count,
        sentiment_ratio, intensifier_count, negation_count, caps_count,
        avg_word_len.
    """
    positive = pos_set if pos_lexicon is None else pos_lexicon
    negative = neg_set if neg_lexicon is None else neg_lexicon

    words = review.lower().split()
    # Whitespace tokens keep adjacent punctuation ("great!", "no,"), which
    # would never match a word list; also try each token with leading and
    # trailing punctuation removed.
    bare_words = [w.strip(string.punctuation) for w in words]
    word_count = len(words)

    has_exclaim = int("!" in review)
    pronoun_count = _count_in(words, bare_words, pronouns)
    pos_count = _count_in(words, bare_words, positive)
    neg_count = _count_in(words, bare_words, negative)
    has_no = int(_count_in(words, bare_words, {"no"}) > 0)
    log_length = math.log(word_count + 1)
    question_count = review.count("?")

    # The +1 keeps the ratio defined for empty reviews.
    sentiment_ratio = (pos_count - neg_count) / (word_count + 1)
    intensifier_count = _count_in(words, bare_words, _INTENSIFIERS)
    negation_count = _count_in(words, bare_words, _NEGATIONS)

    # All-caps tokens of more than one character, judged on original casing.
    caps_count = sum(1 for w in review.split() if w.isupper() and len(w) > 1)

    # True mean token length; an explicit guard replaces the old
    # `word_count + 1` denominator, which understated the average.
    avg_word_len = sum(len(w) for w in words) / word_count if word_count else 0.0

    return [
        has_exclaim,
        pronoun_count,
        pos_count,
        neg_count,
        has_no,
        log_length,
        question_count,
        sentiment_ratio,
        intensifier_count,
        negation_count,
        caps_count,
        avg_word_len,
    ]
    
# Build one row of hand-crafted features per review.
train_extra = np.array(list(map(extract_features, train["review"])))
test_extra = np.array(list(map(extract_features, test["review"])))

# Standardize the extra features: the scaler's statistics come from the
# training rows only and are then applied to both splits.
scaler = StandardScaler()
scaler.fit(train_extra)
train_extra, test_extra = (
    scaler.transform(train_extra),
    scaler.transform(test_extra),
)

In [None]:
# scikit-learn's built-in English stop list contains negation words such
# as "not" and "no". Removing them would discard the negations that
# preprocessing deliberately expands ("don't" -> "do not"), so they are
# kept as TF-IDF terms and n-gram parts. stop_words takes a list.
_KEPT_NEGATIONS = {
    "no", "nor", "not", "never", "neither", "nobody", "none", "nothing",
    "nowhere", "cannot",
}
tfidf_stop_words = sorted(ENGLISH_STOP_WORDS - _KEPT_NEGATIONS)

vectorizer = TfidfVectorizer(
    max_features=10000,           # cap on vocabulary size
    ngram_range=(1, 3),           # unigrams, bigrams and trigrams
    stop_words=tfidf_stop_words,
    min_df=2,                     # ignore terms seen in fewer than 2 reviews
    max_df=0.8,                   # ignore terms seen in more than 80% of reviews
    sublinear_tf=True,            # use 1 + log(tf) instead of raw tf
)

# Fit TF-IDF on training reviews only, then transform both splits
X_train_tfidf = vectorizer.fit_transform(train["review"])
X_test_tfidf = vectorizer.transform(test["review"])

# hstack combines sparse TF-IDF with dense extra features. CSR is requested
# explicitly because hstack otherwise returns COO, which the estimator has
# to convert before it can use the rows.
X_train = hstack([X_train_tfidf, train_extra], format="csr")
X_test = hstack([X_test_tfidf, test_extra], format="csr")

# Labels (assigned once; the original cell repeated these two lines)
y_train = train["sentiment"].tolist()
y_test = test["sentiment"].tolist()

## Train Model

In [None]:
# Fit a logistic-regression classifier on the combined feature matrix.
# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the held-out split: share of test reviews whose predicted label
# equals the true label.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)