## Importing the necessary libraries

In [119]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack

## Load words

In [120]:
neg_words = pd.DataFrame(
    open("../data/negative-words.txt", encoding="latin-1").read().splitlines(),
    columns=["review"]
)

pos_words = pd.DataFrame(
    open("../data/positive-words.txt", encoding="latin-1").read().splitlines(),
    columns=["review"]
)

neg_words["sentiment"] = 0
pos_words["sentiment"] = 1

#combine datasets
word_df = pd.concat([pos_words, neg_words], ignore_index=True)

#shuffle rows
word_df = word_df.sample(frac=1).reset_index(drop=True)

word_df.head(20)

Unnamed: 0,review,sentiment
0,wonders,1
1,staunchness,1
2,droop,0
3,shamelessness,0
4,stigmatize,0
5,bleed,0
6,affluence,1
7,infiltrators,0
8,selfinterested,0
9,demonizing,0


## Preparing our dataset

In [121]:
from sklearn.model_selection import train_test_split

neg_reviews = pd.DataFrame(
    open("../data/negative-reviews.txt", encoding="latin-1").read().splitlines(),
    columns=["review"]
)

pos_reviews = pd.DataFrame(
    open("../data/positive-reviews.txt", encoding="latin-1").read().splitlines(),
    columns=["review"]
)

neg_reviews["sentiment"] = 0
pos_reviews["sentiment"] = 1

# Split dataset into training and testing sets (90-10)
neg_train, neg_test, pos_train, pos_test = train_test_split(neg_reviews, pos_reviews, test_size=0.1, random_state=42)

# Combine and shuffle train set
train =  pd.concat([neg_train, pos_train], ignore_index=True)
train = train.sample(frac=1).reset_index(drop=True) 

# Combine and shuffle test set
test = pd.concat([neg_test, pos_test],)
test = test.sample(frac=1).reset_index(drop=True)

print("Shuffled train set: \n", train.head())
print('\n')
print("Shuffled test set: \n", test.head())

Shuffled train set: 
                                               review  sentiment
0                     Ease of Use, Features, Quality          1
1  8mb memory stick, slow shutter/focusing, weak ...          0
2      Fast, Easy To Use, Reliable, Energy-efficient          1
3  holster, speaker not loud enough, slow process...          0
4          Small, Lite, simple to use and Very Sheik          1


Shuffled test set: 
                                               review  sentiment
0  poor battery life,no download ringtones, games...          0
1                             Lightweight, reclining          1
2                                        battery use          0
3  Battery life, durability, good photo quality f...          1
4      Wish it had an automatic retracting lens cap.          0


## Feature extraction

In [122]:
pronouns = {"i", "me", "my", "you", "your"}

def extract_features(review):
    words = review.lower().split()
    word_count = len(words)
    return[
        int("!" in review),
        sum(1 for w in words if w in pronouns),
        int("no" in words),
        math.log(word_count + 1),
        word_count,
        review.count("?")
    ]
    
# Apply to all reviews
train_extra = np.array([extract_features(r) for r in train["review"]])
test_extra  = np.array([extract_features(r) for r in test["review"]])

# Standardize extra features
scaler = StandardScaler()
train_extra = scaler.fit_transform(train_extra)
test_extra  = scaler.transform(test_extra)

In [123]:
vectorizer = TfidfVectorizer(
    max_features=8000,    
    ngram_range=(1,3),   
    stop_words='english'  
)

# Fit TF-IDF on training reviews and transform
X_train_tfidf = vectorizer.fit_transform(train["review"])
X_test_tfidf  = vectorizer.transform(test["review"])

# Labels
y_train = train["sentiment"].tolist()
y_test  = test["sentiment"].tolist()

# hstack combines sparse TF-IDF with dense extra features
X_train = hstack([X_train_tfidf, train_extra])
X_test  = hstack([X_test_tfidf, test_extra])

y_train = train["sentiment"].tolist()
y_test  = test["sentiment"].tolist()

## Train Model

In [124]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.921
