In [None]:
import json
import os

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Location of the reviews file (read further down as one JSON object per line).
# The original absolute path stays as the default; set the SENTIMENT_DATA_FILE
# environment variable to point at your own copy instead of editing this cell.
file = os.environ.get(
    "SENTIMENT_DATA_FILE",
    "/home/nyangweso/Desktop/Ds_1/Machine-Learning-Projects/data/sentiment/Books_small.json",
)

We use small classes to hold each review's details, which keeps the rest of the code cleaner.

In [None]:
class Sentiment:
    """Namespace of string constants for the three sentiment labels (a C-style enum)."""

    NEG: str = "NEGATIVE"
    NEUTRAL: str = "NEUTRAL"
    POS: str = "POSITIVE"

In [None]:
class Review:
    """A single review: its text, its rating and the sentiment label derived from the rating."""

    def __init__(self, comment, rating) -> None:
        """Store the review text and rating, then derive the sentiment label.

        Args:
            comment: The review text.
            rating: Numeric rating; get_sentiment compares it against 2 and 3.
        """
        self.comment = comment
        self.rating = rating
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map self.rating to one of the Sentiment constants.

        Returns:
            Sentiment.NEG for ratings <= 2, Sentiment.NEUTRAL for a rating of
            exactly 3, and Sentiment.POS for every other value.
        """
        # NOTE(review): a fractional rating strictly between 2 and 3 falls
        # through to POS; confirm ratings are always whole numbers.
        if self.rating <= 2:
            return Sentiment.NEG
        elif self.rating == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POS

    def __repr__(self) -> str:
        """Return a constructor-style summary of the review.

        __repr__ must return a string; returning None makes repr() (and
        therefore displaying a list of reviews) raise TypeError.
        """
        return (
            f"{type(self).__name__}(comment={self.comment!r}, "
            f"rating={self.rating!r}, sentiment={self.sentiment!r})"
        )


In [None]:
# Read the file one line at a time: every non-blank line is parsed as a JSON
# object whose "reviewText" and "overall" fields become a Review.
reviews = []
# Explicit encoding so the result does not depend on the platform default.
with open(file, encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. extra newlines at the end of the file),
            # which json.loads would otherwise reject.
            continue
        data = json.loads(line)
        reviews.append(Review(data["reviewText"], data["overall"]))


In [None]:
# Inspect one review: print its comment, its rating and the sentiment derived from it.
# n is the position in `reviews` and must be a valid index into that list.
n = 68
print(
    f"{reviews[n].comment} \nRating = {reviews[n].rating} \nRating is {reviews[n].sentiment}"
)


#### Preparing Data

In [None]:
# Hold out 33% of the reviews for testing; the fixed seed keeps the split repeatable.
training, test = train_test_split(
    reviews,
    random_state=42,
    test_size=0.33,
)

In [None]:
# Sanity check: fraction of all reviews that ended up in the test split.
len(test) / len(reviews)

In [None]:
# Training split: the review text is the model input, the sentiment label is the target
train_x = [review.comment for review in training]
train_y = [review.sentiment for review in training]

# Test split, separated into input and target the same way
test_x = [review.comment for review in test]
test_y = [review.sentiment for review in test]

Bag-of-Words Vectorization

In [None]:
# Fit the vectorizer on the training comments and convert them to vectors in one call.
vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

# The test comments are only transformed with the already-fitted vectorizer;
# it is never re-fitted on the test data.
test_x_vectors = vectorizer.transform(test_x)

We then train a model on `train_x_vectors` and `train_y`.<br>
But first, let's look at a few models in order to find the best option.

### <u>Model Selection and Classification</u>
#### Classification

1. Linear SVM

In [None]:
# Show the true sentiment of test review 3 so each model's prediction below can be compared with it
test_y[3]

In [None]:
# Create a linear-kernel SVC and fit it on the vectorized training comments
# and their sentiment labels in a single chained call.
cls_svm = svm.SVC(kernel="linear").fit(train_x_vectors, train_y)

# Display the fitted classifier as the cell output
cls_svm


In [None]:
# Finally, call .predict on the vectorized test data ('test_x_vectors') to see
# what the classifier predicts for test review 3 (compare with test_y[3]).
cls_svm.predict(test_x_vectors[3])

2. Decision Tree

In [None]:
# Seed the tree so that re-running this cell gives the same fitted model:
# scikit-learn randomly permutes the candidate features at each split, so an
# unseeded tree can differ from run to run. 42 matches the seed used for the split.
cls_dec = DecisionTreeClassifier(random_state=42)
cls_dec.fit(train_x_vectors, train_y)

In [None]:
# Decision tree's prediction for test review 3 (compare with test_y[3])
cls_dec.predict(test_x_vectors[3])

3. Naive Bayes (Gaussian Naive Bayes)

In [None]:
# Fit Gaussian Naive Bayes on the training vectors, converted to a dense array
# with .toarray() before being passed to fit.
cls_gnb = GaussianNB().fit(train_x_vectors.toarray(), train_y)

# Display the fitted classifier as the cell output
cls_gnb

In [None]:
# Gaussian NB's prediction for test review 3; the row is converted with
# .toarray() just as the training vectors were when the model was fitted.
cls_gnb.predict(test_x_vectors[3].toarray())

4. Logistic Regression

In [None]:
# Fit logistic regression (allowing up to 1000 solver iterations) on the
# vectorized training comments and their sentiment labels.
cls_log = LogisticRegression(max_iter=1000).fit(train_x_vectors, train_y)

# Display the fitted classifier as the cell output
cls_log


In [None]:
# Logistic regression's prediction for test review 3 (compare with test_y[3])
cls_log.predict(test_x_vectors[3])

#### Evaluation

Now that we've created sample models, let's see how well each model performs.