In [None]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
import random
import pickle

In [None]:
# Use path to file here...I like absolute path to minimize errors
# NOTE(review): this absolute path is specific to one machine; anyone else running
# the notebook has to edit it to point at their own copy of Books_small_10000.json.
file = "/home/nyangweso/Desktop/Ds_1/Machine-Learning-Projects/Comment_Classifier/data/Books_small_10000.json"

We use classes to store the details and help make the code look cleaner

In [None]:
# A C like enum class
class Sentiment:
    """String labels used as the sentiment classes of a review."""

    NEG = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POS = "POSITIVE"

In [None]:
class Review:
    """A single review: its text, its numeric rating and the derived sentiment.

    Attributes:
        comment: The review text.
        rating: The numeric rating attached to the review.
        sentiment: One of the ``Sentiment`` labels, derived from ``rating``.
    """

    def __init__(self, comment, rating) -> None:
        self.comment = comment
        self.rating = rating
        # Derived once at construction so later code can read it as a plain attribute.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map the rating to a label: <= 2 is negative, == 3 is neutral, anything else positive."""
        if self.rating <= 2:
            return Sentiment.NEG
        elif self.rating == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POS

    def __repr__(self) -> str:
        """Return a short, readable description of the review.

        The previous body was ``pass``, which returns None and makes ``repr()``
        (and therefore displaying a review in a notebook cell) raise TypeError.
        Long comments are truncated so a list of reviews stays readable.
        """
        text = str(self.comment)
        preview = text if len(text) <= 60 else text[:57] + "..."
        return (
            f"Review(rating={self.rating!r}, "
            f"sentiment={self.sentiment!r}, comment={preview!r})"
        )


In [None]:
class ReviewContainer:
    """Wraps a list of reviews and exposes their texts and labels as parallel lists."""

    def __init__(self, reviews) -> None:
        self.reviews = reviews

    def get_text(self):
        """Return the comment of every review, in container order."""
        return [a.comment for a in self.reviews]

    def get_sentiment(self):
        """Return the sentiment label of every review, in container order."""
        return [a.sentiment for a in self.reviews]

    def evenly_distribute(self, seed=None):
        """Rebalance the container to equal numbers of negative and positive reviews.

        Neutral reviews are dropped. Both classes are trimmed to the size of the
        smaller one; previously only the positives were trimmed, so a split with
        more negatives than positives stayed unbalanced. The kept reviews are
        shuffled and the two kept counts are printed.

        Args:
            seed: Optional seed for a private shuffler, for a repeatable order.
                With the default ``None`` the module-level ``random.shuffle`` is
                used, exactly as before.
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEG]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POS]

        # Trim both sides to the smaller class so the result is balanced either way.
        keep = min(len(negative), len(positive))
        negative_shrunk = negative[:keep]
        positive_shrunk = positive[:keep]

        # A new list is built, so the list originally passed in is not reordered.
        self.reviews = negative_shrunk + positive_shrunk
        if seed is None:
            random.shuffle(self.reviews)
        else:
            random.Random(seed).shuffle(self.reviews)

        print(len(negative_shrunk), len(positive_shrunk))


In [None]:
# The dataset is read as JSON Lines: one review object per line.
reviews = []
# Decode explicitly as UTF-8 so the result does not depend on the platform default.
with open(file, encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a stray newline at the end of the file);
        # json.loads would raise on them.
        if not line.strip():
            continue
        data = json.loads(line)
        reviews.append(Review(data["reviewText"], data["overall"]))


In [None]:
# Inspect one review by index: its text, its rating and the derived sentiment
n = 68
picked = reviews[n]
print(f"{picked.comment} \nRating = {picked.rating} \nRating is {picked.sentiment}")


#### Preparing Data

In [None]:
# Hold out 33% of the reviews for testing; random_state is fixed so the split is repeatable.
training, test = train_test_split(reviews, test_size=0.33, random_state=42)

In [None]:
# Wrap each split in a ReviewContainer to get its balancing and text/label helpers.
train_cont = ReviewContainer(training)
test_cont = ReviewContainer(test)

In [None]:
# Sanity check: fraction of all reviews that ended up in the test split.
len(test) / len(reviews)

In [None]:
# Balance each split in place; each call prints the negative and positive counts it kept.
train_cont.evenly_distribute()
test_cont.evenly_distribute()


# Training data: review texts (x) and their sentiment labels (y)
train_x = train_cont.get_text()
train_y = train_cont.get_sentiment()


# Test data: review texts (x) and their sentiment labels (y)
test_x = test_cont.get_text()
test_y = test_cont.get_sentiment()

In [None]:
# Print how many positive and negative labels the training set holds.
print(train_y.count(Sentiment.POS), train_y.count(Sentiment.NEG))


Bag-of-Words Vectorization (TF-IDF weighted)

In [None]:
# Fit the TF-IDF vectorizer on the training text only, and vectorize that text.
vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

# We only need to transform our test data...no fitting like in train data
test_x_vectors = vectorizer.transform(test_x)

We then create a model to work with from 'train_x_vectors' and 'train_y'.<br>
But first, let's look at a few models in order to find the best option.

### <u>Model Selection and Classification</u>
#### Classification

1. Linear SVM

In [None]:
# Let's leave this sentiment here for checking:
# it is the true label of test sample 3, which the models below are asked to predict.
test_y[3]

In [None]:
# First create an SVC classifier object with a linear kernel
cls_svm = svm.SVC(kernel="linear")

# Then train it on the vectorized training text and its labels using the .fit method
cls_svm.fit(train_x_vectors, train_y)


In [None]:
# Finally we can use the .predict method on our test data to see if it can predict
# We use the vectorized test data 'test_x_vectors' to see if our classifier can predict
# (row 3 only, i.e. the sample whose true label is test_y[3])
cls_svm.predict(test_x_vectors[3])

2. Decision Tree

In [None]:
# Decision tree with default settings, trained on the same vectors and labels.
cls_dec = DecisionTreeClassifier()
cls_dec.fit(train_x_vectors, train_y)

In [None]:
# Predict test sample 3 with the decision tree (compare with test_y[3]).
cls_dec.predict(test_x_vectors[3])

3. Naive Bayes (Gaussian Naive Bayes)

In [None]:
# cls_gnb = GaussianNB()
# cls_gnb.fit(train_x_vectors.toarray(), train_y)

In [None]:
# cls_gnb.predict(test_x_vectors[3].toarray())

4. Logistic Regression

In [None]:
# Logistic regression, allowed up to 1000 solver iterations, trained on the same data.
cls_log = LogisticRegression(max_iter=1000)
cls_log.fit(train_x_vectors, train_y)


In [None]:
# Predict test sample 3 with logistic regression (compare with test_y[3]).
cls_log.predict(test_x_vectors[3])

#### Evaluation

Now that we've created sample models, let's see how well each model performs

In [None]:
# Mean accuracy of every fitted model on the test vectors
for model_name, fitted_model in (
    ("Support Vector Machine", cls_svm),
    ("Decision Tree", cls_dec),
    # ("Gaussian Naive Bayes", cls_gnb),  # disabled along with the GaussianNB cells
    ("Logistic Regression", cls_log),
):
    print(f"{model_name} = {fitted_model.score(test_x_vectors, test_y)}")

Judging by mean accuracy, SVM and logistic regression look like good models overall. \
Let's also look at the F1 score.

In [None]:
# A quick function
def f1_score_calculator(y_true, y_pred):
    """Format the per-class F1 scores as percentages in one line of text.

    Scores are computed separately for the positive, neutral and negative
    labels (in that order) and rendered with two decimals each.

    Args:
        y_true: The true labels.
        y_pred: The predicted labels, aligned with ``y_true``.

    Returns:
        A string listing each label followed by its F1 score times 100.
    """
    class_order = [Sentiment.POS, Sentiment.NEUTRAL, Sentiment.NEG]
    # average=None yields one score per label, in the order given by `labels`.
    pos_f1, neutral_f1, neg_f1 = f1_score(
        y_true, y_pred, labels=class_order, average=None
    )
    return f"   {Sentiment.POS} = {pos_f1 * 100: .2f},     {Sentiment.NEUTRAL} = {neutral_f1 * 100: .2f},      {Sentiment.NEG} = {neg_f1 * 100: .2f}"


In [None]:
# Per-class F1 scores of every fitted model on the test vectors
for model_name, fitted_model in (
    ("Support Vector Machine", cls_svm),
    ("Decision Tree", cls_dec),
    # ("Gaussian Naive Bayes", cls_gnb),  # disabled along with the GaussianNB cells
    ("Logistic Regression", cls_log),
):
    model_predictions = fitted_model.predict(test_x_vectors)
    print(f"{model_name} f1 score >> {f1_score_calculator(test_y, model_predictions)}")

In [None]:
# Hand-written comments (not from the dataset) used to try out the fitted models.
new_test_data = ["I loved this book! The characters were so well-developed and the plot kept me on the edge of my seat.",
                 "This book was a disappointment. The writing was poor and the story was predictable.",
                 "I couldn't put this book down! It was a real page-turner.",
                 "The ending of this book left me feeling unsatisfied. It felt rushed and incomplete.",
                 "The world-building in this book was incredible. I felt like I was really there.",
                 "I found this book to be boring and uneventful. I struggled to finish it.",
                 "The dialogue in this book was so natural and realistic. It really brought the characters to life.",
                 "This book had too many plot holes and inconsistencies for me to enjoy it.",
                 "The pacing of this book was perfect. It kept me engaged from start to finish.",
                 "I didn't connect with any of the characters in this book. They all felt flat and one-dimensional."]

# Reuse the already-fitted vectorizer (transform only) so the new comments get the same features.
new_test = vectorizer.transform(new_test_data)


In [None]:
# Logistic regression on the hand-written comments.
# Predict the whole batch once (not once per loop iteration), then pair each
# comment with its predicted label.
log_predictions = cls_log.predict(new_test)
for comment, predicted in zip(new_test_data, log_predictions):
    print(f'{comment} >>    {predicted}')

In [None]:
# Decision tree on the hand-written comments.
# Predict the whole batch once (not once per loop iteration), then pair each
# comment with its predicted label.
dec_predictions = cls_dec.predict(new_test)
for comment, predicted in zip(new_test_data, dec_predictions):
    print(f'{comment} >>    {predicted}')

In [None]:
# Linear SVM on the hand-written comments.
# Predict the whole batch once (not once per loop iteration), then pair each
# comment with its predicted label.
svm_predictions = cls_svm.predict(new_test)
for comment, predicted in zip(new_test_data, svm_predictions):
    print(f'{comment} >>    {predicted}')

Tuning our model using <b>Grid Search</b>

In [None]:
# Search grid: 2 kernels x 5 values of C = 10 candidate settings.
parameters = {'kernel' : ('linear', 'rbf'), 'C' : (1,4,8,16,32)}

# Every candidate is evaluated with 5-fold cross-validation (cv=5) on the training data.
svc = svm.SVC()
clf = GridSearchCV(svc, param_grid=parameters, cv=5)
clf.fit(train_x_vectors, train_y)

In [None]:
# Test-set score of the grid-searched model, for comparison with the untuned SVM above.
print(f"Support Vector Machine = {clf.score(test_x_vectors, test_y)}")

We can see that for the same dataset, our model improved by 1%

#### Saving Our Model
We save our model with the pickle library so that we don't have to retrain it the next time we use it.

In [None]:
# Serialize the fitted grid-search object (clf) to disk.
# NOTE(review): this rebinds `file`, which until now held the dataset path, and the
# directory here is "Category_Classifier" while the dataset lives under
# "Comment_Classifier" — confirm this is the intended location.
file  = "/home/nyangweso/Desktop/Ds_1/Machine-Learning-Projects/Category_Classifier/models/sentiment_classifier.pkl"
with open(file, 'wb') as f:
    pickle.dump(clf, f)

Load Classifier

In [None]:
# Read the pickled classifier back from the path currently held in `file`.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open(file, 'rb') as f:
    loaded_clf = pickle.load(f) 

In [None]:
# Smoke test: the reloaded classifier predicts a label for the first test vector.
loaded_clf.predict(test_x_vectors[0])