In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
df = pd.read_csv(r'C:\Users\Me\Desktop\LabFinals\Reviews.csv')
df.columns.tolist()

['Id',
 'ProductId',
 'UserId',
 'ProfileName',
 'HelpfulnessNumerator',
 'HelpfulnessDenominator',
 'Score',
 'Time',
 'Summary',
 'Text']

In [3]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\S+@\S+','',text)
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'\d+','',text)
    text = re.sub(r'[^\w\\s]','',text)
    text = re.sub(r'\s+','',text).strip()
    return text

In [6]:
df['clean_text'] = df['Text'].apply(preprocess_text)

In [7]:
X = df['clean_text']
y = df['Score']

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [9]:
vectorizer = TfidfVectorizer(stop_words='english',max_df=0.9,min_df=2)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [10]:
model = MultinomialNB()
model.fit(X_train_vec,y_train)
y_pred = model.predict(X_test_vec)

In [11]:
print("accuracy score : ",accuracy_score(y_test,y_pred))
print('\nclassification report: \n',classification_report(y_test,y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

accuracy score :  0.7354232085213429

classification report: 
               precision    recall  f1-score   support

           1       1.00      0.30      0.46     10326
           2       1.00      0.16      0.28      5855
           3       1.00      0.22      0.36      8485
           4       1.00      0.30      0.46     16123
           5       0.71      1.00      0.83     72902

    accuracy                           0.74    113691
   macro avg       0.94      0.40      0.48    113691
weighted avg       0.81      0.74      0.68    113691


Confusion Matrix:
 [[ 3098     0     0     0  7228]
 [    0   964     1     1  4889]
 [    0     0  1867     2  6616]
 [    0     0     0  4781 11342]
 [    0     1     0     0 72901]]


In [12]:
def predict_score(text):
    text = preprocess_text(text)
    vec = vectorizer.transform([text])
    pred = model.predict(vec)[0]
    return pred

In [14]:
pred = predict_score('I really liked the product. it was a 10/10.')
print(pred)
if pred>3:
    print('Best') 
elif pred>2 :
    print("Average")
else:
    print("Below Avg")

5
Best
