##### Use Case: Rating Prediction Create a model that will predict the rating based on the feedback of the customer. 
* Feature: Text Label: Stars 
* Dataset: yelp.csv

In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk

df = pd.read_csv("yelp.csv")
df = df[["text", "stars"]]
df.columns = ["feedback", "rating"]

stop_words = set(stopwords.words("english"))
ps = PorterStemmer()

def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r"[^a-z]", " ", text)
    tokens = word_tokenize(text)
    tokens = [ps.stem(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

df["clean_feedback"] = df["feedback"].apply(preprocess_text)

X = TfidfVectorizer(max_features=5000).fit_transform(df["clean_feedback"])
y = df["rating"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.4685
              precision    recall  f1-score   support

           1       0.82      0.06      0.12       142
           2       1.00      0.01      0.01       166
           3       0.44      0.01      0.03       289
           4       0.42      0.76      0.54       739
           5       0.56      0.54      0.55       664

    accuracy                           0.47      2000
   macro avg       0.65      0.28      0.25      2000
weighted avg       0.55      0.47      0.40      2000

