# REVIEW SENTIMENT CLASSIFIER: POSITIVE OR NEGATIVE

### Problem Statement
 - With the growing volume of customer reviews, businesses face challenges in quickly extracting actionable insights from feedback. Manual sentiment analysis is time-consuming and prone to errors. An automated solution that classifies reviews as positive or negative can enable faster decision-making, improve customer experience, and drive business growth.



##  IMPORT LIBRARIES

In [5]:
import random
import json
import pickle
import os

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


## Ensure models directory exists

In [7]:
# Create the output directory for pickled artifacts. exist_ok=True makes the
# cell idempotent and avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs('./models', exist_ok=True)

##   DATA CLASS DEFINITIONS  


In [9]:
class Sentiment:
    """String labels that a review can be tagged with."""

    NEGATIVE: str = "NEGATIVE"
    NEUTRAL: str = "NEUTRAL"
    POSITIVE: str = "POSITIVE"

class Review:
    """A single review paired with the sentiment derived from its rating.

    Attributes set in ``__init__``:
        text: the review body passed in by the caller.
        score: the numeric rating passed in by the caller.
        sentiment: one of the ``Sentiment`` constants, computed from ``score``.
    """

    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map ``self.score`` onto a ``Sentiment`` label.

        Scores of 2 or lower are NEGATIVE, scores of 4 or higher are POSITIVE,
        and anything strictly in between is NEUTRAL.
        """
        if self.score <= 2:
            return Sentiment.NEGATIVE
        # Range check instead of ``== 3`` so fractional ratings such as 2.5 or
        # 3.5 do not fall through to the POSITIVE branch.
        if self.score < 4:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE  # score of 4 or above

class ReviewContainer:
    """Holds a list of reviews and exposes their texts and sentiment labels."""

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the ``text`` of every held review, in order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the ``sentiment`` of every held review, in order."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self):
        """Rebalance ``self.reviews`` in place to equal NEGATIVE/POSITIVE counts.

        Reviews with any other sentiment (e.g. NEUTRAL) are dropped. Both
        classes are truncated to the size of the smaller one, keeping the
        earliest reviews of each class, and the result is shuffled with the
        module-level ``random`` generator.
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]
        # Trim to the minority class whichever side it is; trimming only the
        # positives leaves the set unbalanced when negatives outnumber them.
        per_class = min(len(negative), len(positive))
        self.reviews = negative[:per_class] + positive[:per_class]
        random.shuffle(self.reviews)

## LOAD THE DATA    
 

In [11]:
print("\n--- Loading Data ---")
file_name = 'Books.json'
reviews = []
# The file is read as JSON Lines: one review object per line. Decode it
# explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank / trailing empty lines
        review = json.loads(line)
        # Records without a 'reviewText' key are kept with empty text rather
        # than aborting the whole load with a KeyError.
        reviews.append(Review(review.get('reviewText', ''), review['overall']))
print("Total reviews loaded:", len(reviews))



--- Loading Data ---
Total reviews loaded: 10000


## PREPARE THE DATA
    

In [13]:
print("\n--- Preparing Data ---")
training, test = train_test_split(reviews, test_size=0.2, random_state=42)
train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

# evenly_distribute() shuffles with the global `random` generator. Seed it
# (same value as the random_state used for the split) so the balanced sets,
# and any index-based look-ups into them, are reproducible across runs.
random.seed(42)

# Balance the training set, then extract parallel text / label lists.
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

# The held-out set is balanced the same way.
test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print("Count of Positive reviews (train):", train_y.count(Sentiment.POSITIVE))
print("Count of Negative reviews (train):", train_y.count(Sentiment.NEGATIVE))


--- Preparing Data ---
Count of Positive reviews (train): 513
Count of Negative reviews (train): 513


## BAG OF WORDS VECTORIZATION (TF-IDF)


In [15]:
print("\n--- Vectorizing Data ---")
# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the test texts.
vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

# Show one training review next to its dense vector row.
print(f"Example raw text:\n {train_x[0]}")
print(f"Vectorized representation:\n {train_x_vectors[0].toarray()}")



--- Vectorizing Data ---
Example raw text:
 I love Debbie! I have read most of her books, she has another winner here. I particularly like books centered around Alaska. My sister lives there, brings her close. Another hit for Debbie.
Vectorized representation:
 [[0. 0. 0. ... 0. 0. 0.]]


## TRAINING MODELS   

### Linear SVM

In [18]:
print("\n[Linear SVM]")
# Linear-kernel support vector classifier fitted on the training vectors.
clf_svm = svm.SVC(kernel='linear', random_state=42)
clf_svm.fit(train_x_vectors, train_y)
# Spot-check a single held-out review. Index 3 is not the first test sample,
# so the message names the index instead of saying "first".
print("SVM Prediction for test sample at index 3:", clf_svm.predict(test_x_vectors[3]))



[Linear SVM]
SVM Prediction for first test sample: ['NEGATIVE']


In [19]:
# Display the raw text of test sample 3, the same index the per-model
# spot-check predictions are made on.
test_x[3]

'I purchased the &#34;book&#34; in the Kindle edition as a gamble, given the price of 1.99.  I lost the bet.  Please describe the item as sheet music, for that is what it is.'

### Decision Tree

In [21]:
print("\n[Decision Tree]")
# Decision tree fitted on the training vectors; random_state pins tie-breaking.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_x_vectors, train_y)
# Spot-check a single held-out review. Index 3 is not the first test sample,
# so the message names the index instead of saying "first".
print("Decision Tree Prediction for test sample at index 3:", clf_dec.predict(test_x_vectors[3]))



[Decision Tree]
Decision Tree Prediction for first test sample: ['NEGATIVE']


### Multinomial Naive Bayes

In [23]:
print("\n[Multinomial Naive Bayes]")
# Multinomial Naive Bayes fitted on the training vectors.
clf_nb = MultinomialNB()
clf_nb.fit(train_x_vectors, train_y)
# Spot-check a single held-out review. Index 3 is not the first test sample,
# so the message names the index instead of saying "first".
print("MultinomialNB Prediction for test sample at index 3:", clf_nb.predict(test_x_vectors[3]))




[Multinomial Naive Bayes]
MultinomialNB Prediction for first test sample: ['NEGATIVE']


### Logistic Regression

In [25]:
print("\n[Logistic Regression]")
# Logistic regression with a raised iteration cap (max_iter=1000).
clf_log = LogisticRegression(max_iter=1000, random_state=42)
clf_log.fit(train_x_vectors, train_y)
# Spot-check a single held-out review. Index 3 is not the first test sample,
# so the message names the index instead of saying "first".
print("Logistic Regression Prediction for test sample at index 3:", clf_log.predict(test_x_vectors[3]))



[Logistic Regression]
Logistic Regression Prediction for first test sample: ['NEGATIVE']


### Random Forest

In [27]:
print("\n[Random Forest]")
# Random forest of 100 trees fitted on the training vectors.
clf_rf = RandomForestClassifier(n_estimators=100, random_state=42)
clf_rf.fit(train_x_vectors, train_y)
# Spot-check a single held-out review. Index 3 is not the first test sample,
# so the message names the index instead of saying "first".
print("Random Forest Prediction for test sample at index 3:", clf_rf.predict(test_x_vectors[3]))



[Random Forest]
Random Forest Prediction for first test sample: ['NEGATIVE']


### K-Nearest Neighbors

In [29]:
print("\n[K-Nearest Neighbors]")
# k-nearest-neighbours classifier voting over the 5 closest training vectors.
clf_knn = KNeighborsClassifier(n_neighbors=5)
clf_knn.fit(train_x_vectors, train_y)
# Spot-check a single held-out review. Index 3 is not the first test sample,
# so the message names the index instead of saying "first".
print("KNN Prediction for test sample at index 3:", clf_knn.predict(test_x_vectors[3]))


[K-Nearest Neighbors]
KNN Prediction for first test sample: ['NEGATIVE']


### SGDClassifier

In [31]:
print("\n[SGDClassifier]")
# Linear model trained with stochastic gradient descent (library defaults).
clf_sgd = SGDClassifier(random_state=42)
clf_sgd.fit(train_x_vectors, train_y)
# Spot-check a single held-out review. Index 3 is not the first test sample,
# so the message names the index instead of saying "first".
print("SGDClassifier Prediction for test sample at index 3:", clf_sgd.predict(test_x_vectors[3]))



[SGDClassifier]
SGDClassifier Prediction for first test sample: ['NEGATIVE']


### Multi-Layer Perceptron (MLP)

In [33]:
print("\n[Multi-Layer Perceptron]")
# Feed-forward network with one hidden layer of 100 units, capped at 150 iterations.
clf_mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=150, random_state=42)
clf_mlp.fit(train_x_vectors, train_y)
# Spot-check a single held-out review. Index 3 is not the first test sample,
# so the message names the index instead of saying "first".
print("MLP Prediction for test sample at index 3:", clf_mlp.predict(test_x_vectors[3]))


[Multi-Layer Perceptron]
MLP Prediction for first test sample: ['NEGATIVE']


## EVALUATION METRICS  

In [35]:
print("\n--- Evaluating Models ---")

# Every fitted classifier, keyed by the label used in the printed report.
models = dict([
    ("SVM", clf_svm),
    ("Decision Tree", clf_dec),
    ("MultinomialNB", clf_nb),
    ("Logistic Regression", clf_log),
    ("Random Forest", clf_rf),
    ("KNN", clf_knn),
    ("SGDClassifier", clf_sgd),
    ("MLP", clf_mlp),
])

# Test-set accuracy per model label, filled in by the loop below.
accuracy_scores = {}

for name in models:
    model = models[name]
    y_pred = model.predict(test_x_vectors)
    acc = accuracy_scores[name] = accuracy_score(test_y, y_pred)
    # Header, accuracy and section title each on their own line.
    print(f"\n[{name} Evaluation]", f"Accuracy = {acc:.4f}", "Confusion Matrix:", sep="\n")
    print(confusion_matrix(test_y, y_pred))
    print("Classification Report:", classification_report(test_y, y_pred), sep="\n")



--- Evaluating Models ---

[SVM Evaluation]
Accuracy = 0.8244
Confusion Matrix:
[[108  23]
 [ 23 108]]
Classification Report:
              precision    recall  f1-score   support

    NEGATIVE       0.82      0.82      0.82       131
    POSITIVE       0.82      0.82      0.82       131

    accuracy                           0.82       262
   macro avg       0.82      0.82      0.82       262
weighted avg       0.82      0.82      0.82       262


[Decision Tree Evaluation]
Accuracy = 0.6718
Confusion Matrix:
[[84 47]
 [39 92]]
Classification Report:
              precision    recall  f1-score   support

    NEGATIVE       0.68      0.64      0.66       131
    POSITIVE       0.66      0.70      0.68       131

    accuracy                           0.67       262
   macro avg       0.67      0.67      0.67       262
weighted avg       0.67      0.67      0.67       262


[MultinomialNB Evaluation]
Accuracy = 0.8473
Confusion Matrix:
[[116  15]
 [ 25 106]]
Classification Report:
   

## Determine and display the best model based on accuracy

In [37]:
# Pick the label with the highest recorded accuracy; on a tie the entry
# inserted first wins, since max() keeps the first maximum it encounters.
best_model_name = max(accuracy_scores.items(), key=lambda entry: entry[1])[0]
best_model = models[best_model_name]
print(f"\nBest Model from initial training: {best_model_name} with Accuracy = {accuracy_scores[best_model_name]:.4f}")



Best Model from initial training: MultinomialNB with Accuracy = 0.8473


## SAVE THE BEST MODEL   

In [39]:
# Persist the selected classifier so it can be reloaded without retraining.
with open('./models/best_sentiment_classifier.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)
print("\nBest overall model saved as './models/best_sentiment_classifier.pkl'")



Best overall model saved as './models/best_sentiment_classifier.pkl'


### Save the TF-IDF vectorizer for later use

In [41]:
# Persist the fitted vectorizer alongside the model, so new text can be
# transformed with the same fitted state before prediction.
with open('./models/vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
print("TF-IDF vectorizer saved as './models/vectorizer.pkl'")


TF-IDF vectorizer saved as './models/vectorizer.pkl'


## MANUAL TEST AND MODEL RELOAD 

In [43]:
print("\n--- Manual Test After Reloading the Best Model ---")

# Read back the pickle written by this notebook. Despite the "nb" in the
# variable name, this is whichever classifier was saved at this path.
# NOTE: only unpickle files you trust; pickle.load can execute arbitrary code.
with open('./models/best_sentiment_classifier.pkl', 'rb') as model_file:
    loaded_nb_model = pickle.load(model_file)



--- Manual Test After Reloading the Best Model ---


## Perform manual testing using the reloaded model

In [45]:
# Hand-written reviews used as a sanity check of the reloaded model.
manual_test_set = [
    'awful, I want a refund',
    'fantastic read, couldn’t put it down',
    'not as expected, very upset',
]
# Transform with the already-fitted vectorizer (no re-fitting), then predict.
manual_test_vectors = vectorizer.transform(manual_test_set)
manual_predictions = loaded_nb_model.predict(manual_test_vectors)
print(f"Manual Test Predictions : {manual_predictions}")

Manual Test Predictions : ['NEGATIVE' 'POSITIVE' 'NEGATIVE']
