In [1]:
import pandas as pd
import numpy as np

reviews = pd.read_csv("data/reviews.csv")

In [2]:
reviews.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [3]:
reviews = reviews.drop(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time', 'Summary'], axis=1)

In [4]:
reviews.columns

Index(['Score', 'Text'], dtype='object')

In [5]:
reviews.to_csv("data/cleaned_reviews.csv", index=1)

In [6]:
cleaned_reviews = pd.read_csv("data/cleaned_reviews.csv")

In [7]:
cleaned_reviews.head()

Unnamed: 0.1,Unnamed: 0,Score,Text
0,0,5,I have bought several of the Vitality canned d...
1,1,1,Product arrived labeled as Jumbo Salted Peanut...
2,2,4,This is a confection that has been around a fe...
3,3,2,If you are looking for the secret ingredient i...
4,4,5,Great taffy at a great price. There was a wid...


In [8]:
cleaned_reviews = cleaned_reviews.drop("Unnamed: 0", axis=1)

In [9]:
cleaned_reviews.head()

Unnamed: 0,Score,Text
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...


In [10]:
cleaned_reviews = cleaned_reviews.rename(
    columns={
        "Text": "review_text",
        "Score": "rating"
    }
)


In [11]:
# Score 4 or 5 → Positive, Score 1 or 2 → Negative


# Filter out columns where score == 3
cleaned_reviews = cleaned_reviews[cleaned_reviews["rating"] != 3]

# Sentiment labeling function
def label_sentiment(score):
    if score > 3:
        return 1   # Positive
    else:
        return 0   # Negative


cleaned_reviews["sentiment"] = cleaned_reviews["rating"].apply(label_sentiment) # Appling label_sentiment function to the entire column
cleaned_reviews["sentiment"].value_counts()


sentiment
1    443777
0     82037
Name: count, dtype: int64

In [12]:
import re

def clean_text(text):
    text = text.lower() # Converting to lower case
    text = re.sub(r"[^a-z\s]", "", text) # d
    return text

cleaned_reviews["review_text"] = cleaned_reviews["review_text"].apply(clean_text)


In [13]:
cleaned_reviews.head()

Unnamed: 0,rating,review_text,sentiment
0,5,i have bought several of the vitality canned d...,1
1,1,product arrived labeled as jumbo salted peanut...,0
2,4,this is a confection that has been around a fe...,1
3,2,if you are looking for the secret ingredient i...,0
4,5,great taffy at a great price there was a wide...,1


In [14]:
# Importing TF-IDF for text extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# Keep only the top 5000 most important words, stop_words Automatically removes common words like: the, is, was, and, to
tfidf = TfidfVectorizer(max_features = 5000, stop_words="english")

X = tfidf.fit_transform(cleaned_reviews["review_text"]) # Text transformed into numbers using TF-IDF formulas
y = cleaned_reviews["sentiment"]

In [15]:
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

np.random.seed(42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = LinearSVC()
model.fit(X_train, y_train)

In [16]:
from sklearn.metrics import accuracy_score

model.score(X_test, y_test)

0.9306410049161777

In [17]:
y_pred = model.predict(X_test)
y_pred

array([1, 1, 1, ..., 1, 0, 1])

In [18]:
# Example new reviews
new_reviews = [
    "This product is amazing and worth the price",
    "Very poor quality, totally disappointed"
]

# Convert text to TF IDF using the SAME tfidf
X_new = tfidf.transform(new_reviews)

# Predict sentiment
predictions = model.predict(X_new)

# Convert to readable labels
for review, pred in zip(new_reviews, predictions):
    label = "Positive" if pred == 1 else "Negative"
    print(f"{label} : {review}")


Positive : This product is amazing and worth the price
Negative : Very poor quality, totally disappointed


In [19]:
# Evaluate
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9306410049161777
              precision    recall  f1-score   support

           0       0.83      0.69      0.76     16379
           1       0.95      0.97      0.96     88784

    accuracy                           0.93    105163
   macro avg       0.89      0.83      0.86    105163
weighted avg       0.93      0.93      0.93    105163



In [20]:
from sklearn.model_selection import cross_val_score

np.random.seed(42)  

# Cross-Validation accuracy
cv_acc = cross_val_score(model, X, y, cv=5,scoring=None) 
cv_acc

array([0.92830178, 0.92952845, 0.92839687, 0.93007046, 0.92771153])

In [21]:
# Cross-Validated accuracy
print(f"The cross-validated accuracy is: {np.mean(cv_acc)*100:.2f}%")

The cross-validated accuracy is: 92.88%


In [24]:
import pickle
# Saving model

pickle.dump(model, open("sentiment.pkl", "wb"))
pickle.dump(tfidf, open("sentiment_tfidf.pkl", "wb"))
