In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize

# Download the punkt tokenizer if not already downloaded
nltk.download('punkt')

reviews = pd.read_csv("amazon_reviews.csv", encoding='utf-8')
selected_reviews = reviews[['reviewText', 'overall']]

selected_reviews.loc[:, 'reviewText'] = selected_reviews['reviewText'].fillna('')

# Tokenize the text using nltk
selected_reviews.loc[:, 'reviewText'] = selected_reviews['reviewText'].apply(word_tokenize)

# Join the tokens back into a string for TfidfVectorizer
selected_reviews.loc[:, 'reviewText'] = selected_reviews['reviewText'].apply(lambda x: ' '.join(x))

vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(selected_reviews['reviewText'])
y = selected_reviews['overall']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

model = MultinomialNB()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print(classification_report(y_test, y_pred, zero_division=0))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\natth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00        50
         2.0       0.00      0.00      0.00        20
         3.0       0.00      0.00      0.00        25
         4.0       1.00      0.01      0.02       106
         5.0       0.80      1.00      0.89       782

    accuracy                           0.80       983
   macro avg       0.36      0.20      0.18       983
weighted avg       0.74      0.80      0.71       983



In [15]:
# Test the model
test_review = "hate this product"
test_review_vector = vectorizer.transform([test_review])
prediction = model.predict(test_review_vector)
print(f"The predicted rating for the test review is: {prediction[0]}")


The predicted rating for the test review is: 5.0
