# Restaurant Review Classifiers

## Overview
This notebook demonstrates a multi-output classification model for restaurant reviews, predicting food, service, and atmosphere ratings.

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [34]:
class RestaurantReviewClassifier:
    def __init__(self, model='logistic_regression'):
        self.tfidf_vectorizer = TfidfVectorizer(
            stop_words='english',
            max_features=5000,
            ngram_range=(1, 2)
        )
        self.label_encoders = {
            'food': LabelEncoder(),
            'service': LabelEncoder(),
            'atmosphere': LabelEncoder()
        }

        # Initialize classifier based on input model
        if model == 'logistic_regression':
            base_classifier = LogisticRegression(max_iter=1000)
        elif model == 'random_forest':
            base_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
        elif model == 'svm':
            base_classifier = SVC(probability=True, kernel='linear')
        else:
            raise ValueError("Model not supported. Choose from 'logistic_regression', 'random_forest', 'svm'.")

        self.classifier = MultiOutputClassifier(base_classifier)

    def preprocess_data(self, df):
        # Drop rows with NaN in text column
        df = df.dropna(subset=['text'])

        # Fill NaN in categorical columns with 'None'
        columns = ['food', 'service', 'atmosphere']
        for col in columns:
            if col not in df.columns:
                df[col] = 'None'
            df.loc[:, col] = df[col].fillna('None')

        return df

    def train(self, df):
        # Preprocess data
        df = self.preprocess_data(df)

        # Vectorize text
        X = self.tfidf_vectorizer.fit_transform(df['text'].astype(str))

        # Encode labels dynamically
        y_dict = {}
        for col in ['food', 'service', 'atmosphere']:
            unique_labels = df[col].unique()
            self.label_encoders[col].fit(unique_labels)
            y_dict[col] = self.label_encoders[col].transform(df[col])

        y = pd.DataFrame(y_dict)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Train classifier
        self.classifier.fit(X_train, y_train)

        # Predict
        y_pred = self.classifier.predict(X_test)

        # Evaluate
        print("\nModel Performance Metrics:")
        for i, col in enumerate(['food', 'service', 'atmosphere']):
            print(f"\n{col.capitalize()} Classification:")
            classes = self.label_encoders[col].classes_

            accuracy = accuracy_score(y_test.iloc[:, i], y_pred[:, i])
            print(f"Accuracy: {accuracy:.2%}")

            print(classification_report(
                y_test.iloc[:, i],
                y_pred[:, i],
                labels=range(len(classes)),
                target_names=classes
            ))

    def predict(self, texts):
        # Convert texts to strings and handle potential NaN
        texts = [str(text) if pd.notna(text) else '' for text in texts]

        # Vectorize input
        X = self.tfidf_vectorizer.transform(texts)

        # Predict
        predictions = self.classifier.predict(X)

        # Decode predictions
        results = []
        for pred in predictions:
            result = {
                col: self.label_encoders[col].inverse_transform([p])[0]
                for col, p in zip(['food', 'service', 'atmosphere'], pred)
            }
            results.append(result)

        return results


In [35]:
# Logistic Regression
classifier_lr = RestaurantReviewClassifier(model='logistic_regression')

# Random Forest
classifier_rf = RestaurantReviewClassifier(model='random_forest')

# Support Vector Machine
classifier_svm = RestaurantReviewClassifier(model='svm')


### Load Data

In [36]:
df = pd.read_csv('../raw_data/random_choice_data.csv')  # Load your dataset


### Train using Logistic Regression

In [37]:
# Train using Logistic Regression
classifier_lr.train(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, col] = df[col].fillna('None')



Model Performance Metrics:

Food Classification:
Accuracy: 93.24%
              precision    recall  f1-score   support

    Negative       0.96      0.92      0.94       470
        None       0.96      0.95      0.95       460
    Positive       0.85      0.93      0.89       269

    accuracy                           0.93      1199
   macro avg       0.92      0.93      0.93      1199
weighted avg       0.93      0.93      0.93      1199


Service Classification:
Accuracy: 90.24%
              precision    recall  f1-score   support

    Negative       0.96      0.91      0.94       497
        None       0.87      0.91      0.89       381
    Positive       0.85      0.89      0.87       321

    accuracy                           0.90      1199
   macro avg       0.90      0.90      0.90      1199
weighted avg       0.90      0.90      0.90      1199


Atmosphere Classification:
Accuracy: 90.66%
              precision    recall  f1-score   support

    Negative       0.98      

### Train using Random Forest

In [38]:
classifier_rf.train(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, col] = df[col].fillna('None')



Model Performance Metrics:

Food Classification:
Accuracy: 92.99%
              precision    recall  f1-score   support

    Negative       0.99      0.89      0.93       470
        None       0.99      0.94      0.96       460
    Positive       0.78      0.99      0.87       269

    accuracy                           0.93      1199
   macro avg       0.92      0.94      0.92      1199
weighted avg       0.94      0.93      0.93      1199


Service Classification:
Accuracy: 88.57%
              precision    recall  f1-score   support

    Negative       0.97      0.87      0.92       497
        None       0.86      0.91      0.89       381
    Positive       0.81      0.88      0.84       321

    accuracy                           0.89      1199
   macro avg       0.88      0.89      0.88      1199
weighted avg       0.89      0.89      0.89      1199


Atmosphere Classification:
Accuracy: 87.66%
              precision    recall  f1-score   support

    Negative       0.97      

### Train using SVM

In [39]:
classifier_svm.train(df)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, col] = df[col].fillna('None')



Model Performance Metrics:

Food Classification:
Accuracy: 94.08%
              precision    recall  f1-score   support

    Negative       0.96      0.93      0.95       470
        None       0.97      0.95      0.96       460
    Positive       0.86      0.94      0.90       269

    accuracy                           0.94      1199
   macro avg       0.93      0.94      0.94      1199
weighted avg       0.94      0.94      0.94      1199


Service Classification:
Accuracy: 90.74%
              precision    recall  f1-score   support

    Negative       0.97      0.92      0.95       497
        None       0.86      0.91      0.88       381
    Positive       0.87      0.88      0.87       321

    accuracy                           0.91      1199
   macro avg       0.90      0.90      0.90      1199
weighted avg       0.91      0.91      0.91      1199


Atmosphere Classification:
Accuracy: 90.99%
              precision    recall  f1-score   support

    Negative       0.98      

### Predict using Logistic Regression



In [40]:
# load test_data_reviews
test_data = pd.read_csv('../test_data_reviews/2_4_review.csv')    

# Predict using Logistic Regression
predictions_lr = classifier_lr.predict(test_data['text'][0:5])
#print review text and predicted ratings
for i in range(5):
    print(test_data['wiI7pd'][i])
    print("Logistic Regression Predictions:", predictions_lr[i])
    print("\n")

Żenada ulotki zostawione w pensjonacie ale można sobie do nich dzwonić. Po kilkunastu wykonanych połączeniach i braku odzewu, udaliśmy się tam osobiście bo mieliśmy 750m. Po dotarciu na miejsce dodam że to była niedziela lokal Restro &…
Logistic Regression Predictions: {'food': 'None', 'service': 'None', 'atmosphere': 'Positive'}


Jedzenie okropne kebab się rozlatuje. Zamawiasz kebaba i dostajesz i tak tego z menu wszystko wygląda okropnie. Obsługa niemiła i wredna. Czas oczekiwania również długi.
Logistic Regression Predictions: {'food': 'Negative', 'service': 'Negative', 'atmosphere': 'None'}


Dostałem kebab, który został zamówiony na wskazany adres.  Kebab to jedna wielka porażka tłuszczowa.  Nie polecam nikomu, pizza przykleiła się do tektury z opakowania.
Logistic Regression Predictions: {'food': 'Positive', 'service': 'None', 'atmosphere': 'None'}


Jeśli jesteś miłośnikiem dobrych zartów, to ta pizzeria to jest właśnie to. Boczek przykleja się do pudełka niczym cygan do zasiłk

### Predict using Random Forest

In [41]:
# Predict using Random Forest
predictions_rf = classifier_rf.predict(test_data['text'][0:5])
#print review text and predicted ratings
for i in range(5):
    print(test_data['wiI7pd'][i])
    print("Random Forest Predictions:", predictions_rf[i])
    print("\n")

Żenada ulotki zostawione w pensjonacie ale można sobie do nich dzwonić. Po kilkunastu wykonanych połączeniach i braku odzewu, udaliśmy się tam osobiście bo mieliśmy 750m. Po dotarciu na miejsce dodam że to była niedziela lokal Restro &…
Random Forest Predictions: {'food': 'None', 'service': 'None', 'atmosphere': 'Positive'}


Jedzenie okropne kebab się rozlatuje. Zamawiasz kebaba i dostajesz i tak tego z menu wszystko wygląda okropnie. Obsługa niemiła i wredna. Czas oczekiwania również długi.
Random Forest Predictions: {'food': 'None', 'service': 'Negative', 'atmosphere': 'None'}


Dostałem kebab, który został zamówiony na wskazany adres.  Kebab to jedna wielka porażka tłuszczowa.  Nie polecam nikomu, pizza przykleiła się do tektury z opakowania.
Random Forest Predictions: {'food': 'Positive', 'service': 'None', 'atmosphere': 'None'}


Jeśli jesteś miłośnikiem dobrych zartów, to ta pizzeria to jest właśnie to. Boczek przykleja się do pudełka niczym cygan do zasiłku. Szynka konserwowa z

### Predict using SVM

In [42]:
# Predict using SVM
predictions_svm = classifier_svm.predict(test_data['text'][0:5])
#print review text and predicted ratings
for i in range(5):
    print(test_data['wiI7pd'][i])
    print("SVM Predictions:", predictions_svm[i])
    print("\n")



Żenada ulotki zostawione w pensjonacie ale można sobie do nich dzwonić. Po kilkunastu wykonanych połączeniach i braku odzewu, udaliśmy się tam osobiście bo mieliśmy 750m. Po dotarciu na miejsce dodam że to była niedziela lokal Restro &…
SVM Predictions: {'food': 'None', 'service': 'None', 'atmosphere': 'Positive'}


Jedzenie okropne kebab się rozlatuje. Zamawiasz kebaba i dostajesz i tak tego z menu wszystko wygląda okropnie. Obsługa niemiła i wredna. Czas oczekiwania również długi.
SVM Predictions: {'food': 'Negative', 'service': 'Negative', 'atmosphere': 'None'}


Dostałem kebab, który został zamówiony na wskazany adres.  Kebab to jedna wielka porażka tłuszczowa.  Nie polecam nikomu, pizza przykleiła się do tektury z opakowania.
SVM Predictions: {'food': 'Positive', 'service': 'None', 'atmosphere': 'None'}


Jeśli jesteś miłośnikiem dobrych zartów, to ta pizzeria to jest właśnie to. Boczek przykleja się do pudełka niczym cygan do zasiłku. Szynka konserwowa złapana na promocji pięknie

### Rating Function

In [61]:
def Final_Rating(reviews_path,classifier):
    test_data = pd.read_csv(reviews_path)
    # print("\n")
    # print(len(test_data))
    predictions = classifier.predict(test_data['text'])

    total_amount_of_predicions = len(predictions)
    Ratings = \
        {
            "food":{
                'None':0 ,'Positive':0,'Negative':0
            },
            "service":{
                'None':0 ,'Positive':0,'Negative':0
            },
            "atmosphere":{
                'None':0 ,'Positive':0,'Negative':0
            }
        }
    # print(Ratings)

    for i in predictions:
        for aspect in ["food","service","atmosphere"]:
            Ratings[aspect][i[aspect]] += 1

    for aspect in Ratings:
        for rate in Ratings[aspect]:
            Ratings[aspect][rate] =  round(Ratings[aspect][rate]/total_amount_of_predicions*100,2)

    food_final_score = (Ratings["food"]['None'] * 3 + Ratings["food"]['Positive'] * 5 + Ratings["food"]['Negative'] * 1) / 100
    service_final_score = (Ratings["service"]['None'] * 3 + Ratings["service"]['Positive'] * 5 + Ratings["service"]['Negative'] * 1) / 100
    atmosphere_final_score = (Ratings["atmosphere"]['None'] * 3 + Ratings["atmosphere"]['Positive'] * 5 + Ratings["atmosphere"]['Negative'] * 1) / 100

    return food_final_score,service_final_score,atmosphere_final_score,round((food_final_score+service_final_score+atmosphere_final_score)/3,1)


    # print("Food ", round(food_final_score,2))
    # print("Service ",round(service_final_score,2))
    # print("Atmosphere ",round(atmosphere_final_score,2))
    # print("Averge", round((food_final_score+service_final_score+atmosphere_final_score)/3,1))

#
# print("Rating 2,4  :")
# Final_Rating('../test_data_reviews/2_4_review.csv',classifier_lr)
# Final_Rating('../test_data_reviews/2_4_review.csv',classifier_rf)
# Final_Rating('../test_data_reviews/2_4_review.csv',classifier_svm)
#
# print("\n\nRating 2,9  :")
# Final_Rating('../test_data_reviews/2_9_review.csv',classifier_lr)
# Final_Rating('../test_data_reviews/2_9_review.csv',classifier_rf)
# Final_Rating('../test_data_reviews/2_9_review.csv',classifier_svm)
#
# print("\n\nRating 3,4  :")
# Final_Rating('../test_data_reviews/3_4_review.csv',classifier_lr)
# Final_Rating('../test_data_reviews/3_4_review.csv',classifier_rf)
# Final_Rating('../test_data_reviews/3_4_review.csv',classifier_svm)
#
# print("\n\nRating 4,7  :")
# Final_Rating('../test_data_reviews/4_7_review.csv',classifier_lr)
# Final_Rating('../test_data_reviews/4_7_review.csv',classifier_rf)
# Final_Rating('../test_data_reviews/4_7_review.csv',classifier_svm)

results = []

files = ['../test_data_reviews/2_4_review.csv',
         '../test_data_reviews/2_9_review.csv',
         '../test_data_reviews/3_4_review.csv',
         '../test_data_reviews/4_7_review.csv']

classifiers = {'Logistic Regression': classifier_lr,
               'Random Forest': classifier_rf,
               'SVM': classifier_svm}
for file in files:
    file_name = file.split('/')[-1]
    for name, clf in classifiers.items():
        rating = Final_Rating(file, clf)
        results.append({'File': file_name, 'Classifier': name, 'Food_Rating': rating[0], 'Service_Rating': rating[1],
                        'Atmosphere_Rating': rating[2], 'Averge_Rating': rating[3]})

df = pd.DataFrame(results)

print(df)

              File           Classifier  Food_Rating  Service_Rating  \
0   2_4_review.csv  Logistic Regression       2.8226          2.8231   
1   2_4_review.csv        Random Forest       4.3038          3.0763   
2   2_4_review.csv                  SVM       2.9240          2.7848   
3   2_9_review.csv  Logistic Regression       3.1997          3.0646   
4   2_9_review.csv        Random Forest       4.3160          3.3096   
5   2_9_review.csv                  SVM       3.2000          2.9354   
6   3_4_review.csv  Logistic Regression       3.6293          3.3828   
7   3_4_review.csv        Random Forest       4.2962          3.5183   
8   3_4_review.csv                  SVM       3.6790          3.3086   
9   4_7_review.csv  Logistic Regression       4.5320          4.2537   
10  4_7_review.csv        Random Forest       4.8162          4.1807   
11  4_7_review.csv                  SVM       4.6717          4.1364   

    Atmosphere_Rating  Averge_Rating  
0              3.0252   