# Restaurant Review Classifiers

## Overview
This notebook demonstrates a multi-output classification model for restaurant reviews, predicting food, service, and atmosphere ratings.

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [20]:
class RestaurantReviewClassifier:
    def __init__(self, model='logistic_regression'):
        self.tfidf_vectorizer = TfidfVectorizer(
            stop_words='english',
            max_features=5000,
            ngram_range=(1, 2)
        )
        self.label_encoders = {
            'food': LabelEncoder(),
            'service': LabelEncoder(),
            'atmosphere': LabelEncoder()
        }

        # Initialize classifier based on input model
        if model == 'logistic_regression':
            base_classifier = LogisticRegression(max_iter=1000)
        elif model == 'random_forest':
            base_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
        elif model == 'svm':
            base_classifier = SVC(probability=True, kernel='linear')
        else:
            raise ValueError("Model not supported. Choose from 'logistic_regression', 'random_forest', 'svm'.")

        self.classifier = MultiOutputClassifier(base_classifier)

    def preprocess_data(self, df):
        # Drop rows with NaN in text column
        df = df.dropna(subset=['text'])

        # Fill NaN in categorical columns with 'None'
        columns = ['food', 'service', 'atmosphere']
        for col in columns:
            if col not in df.columns:
                df[col] = 'None'
            df.loc[:, col] = df[col].fillna('None')

        return df

    def train(self, df):
        # Preprocess data
        df = self.preprocess_data(df)

        # Vectorize text
        X = self.tfidf_vectorizer.fit_transform(df['text'].astype(str))

        # Encode labels dynamically
        y_dict = {}
        for col in ['food', 'service', 'atmosphere']:
            unique_labels = df[col].unique()
            self.label_encoders[col].fit(unique_labels)
            y_dict[col] = self.label_encoders[col].transform(df[col])

        y = pd.DataFrame(y_dict)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Train classifier
        self.classifier.fit(X_train, y_train)

        # Predict
        y_pred = self.classifier.predict(X_test)

        # Evaluate
        print("\nModel Performance Metrics:")
        for i, col in enumerate(['food', 'service', 'atmosphere']):
            print(f"\n{col.capitalize()} Classification:")
            classes = self.label_encoders[col].classes_

            accuracy = accuracy_score(y_test.iloc[:, i], y_pred[:, i])
            print(f"Accuracy: {accuracy:.2%}")

            print(classification_report(
                y_test.iloc[:, i],
                y_pred[:, i],
                labels=range(len(classes)),
                target_names=classes
            ))

    def predict(self, texts):
        # Convert texts to strings and handle potential NaN
        texts = [str(text) if pd.notna(text) else '' for text in texts]

        # Vectorize input
        X = self.tfidf_vectorizer.transform(texts)

        # Predict
        predictions = self.classifier.predict(X)

        # Decode predictions
        results = []
        for pred in predictions:
            result = {
                col: self.label_encoders[col].inverse_transform([p])[0]
                for col, p in zip(['food', 'service', 'atmosphere'], pred)
            }
            results.append(result)

        return results


In [21]:
# Logistic Regression
classifier_lr = RestaurantReviewClassifier(model='logistic_regression')

# Random Forest
classifier_rf = RestaurantReviewClassifier(model='random_forest')

# Support Vector Machine
classifier_svm = RestaurantReviewClassifier(model='svm')


In [25]:
df = pd.read_csv('../raw_data/clean_data.csv')  # Load your dataset

if 'to_be_removed' in df.columns:
    df = df[df['to_be_removed'].isna()]

df_low_score = df[df['rating'] < 5]

# Sample 500 reviews with a score of 5
df_score_5_sample = df[df['rating'] == 5].sample(n=500, random_state=42)

df_filtered = pd.concat([df_low_score, df_score_5_sample])
df_filtered.reset_index(drop=True, inplace=True)

df = df_filtered

# Train using Logistic Regression
classifier_lr.train(df)

# Train using Random Forest
classifier_rf.train(df)

# Train using SVM
classifier_svm.train(df)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, col] = df[col].fillna('None')



Model Performance Metrics:

Food Classification:
Accuracy: 77.53%
              precision    recall  f1-score   support

    Negative       0.75      0.45      0.56        60
        None       0.00      0.00      0.00        23
    Positive       0.78      0.98      0.87       184

    accuracy                           0.78       267
   macro avg       0.51      0.48      0.48       267
weighted avg       0.71      0.78      0.72       267


Service Classification:
Accuracy: 71.91%
              precision    recall  f1-score   support

    Negative       0.94      0.48      0.63        63
        None       0.66      0.76      0.70        98
    Positive       0.72      0.83      0.77       106

    accuracy                           0.72       267
   macro avg       0.77      0.69      0.70       267
weighted avg       0.75      0.72      0.71       267


Atmosphere Classification:
Accuracy: 73.78%
              precision    recall  f1-score   support

    Negative       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, col] = df[col].fillna('None')



Model Performance Metrics:

Food Classification:
Accuracy: 75.28%
              precision    recall  f1-score   support

    Negative       0.71      0.37      0.48        60
        None       0.00      0.00      0.00        23
    Positive       0.76      0.97      0.85       184

    accuracy                           0.75       267
   macro avg       0.49      0.45      0.45       267
weighted avg       0.68      0.75      0.70       267


Service Classification:
Accuracy: 74.16%
              precision    recall  f1-score   support

    Negative       0.82      0.43      0.56        63
        None       0.74      0.82      0.78        98
    Positive       0.72      0.86      0.78       106

    accuracy                           0.74       267
   macro avg       0.76      0.70      0.71       267
weighted avg       0.75      0.74      0.73       267


Atmosphere Classification:
Accuracy: 74.16%
              precision    recall  f1-score   support

    Negative       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, col] = df[col].fillna('None')



Model Performance Metrics:

Food Classification:
Accuracy: 77.15%
              precision    recall  f1-score   support

    Negative       0.62      0.55      0.58        60
        None       0.00      0.00      0.00        23
    Positive       0.81      0.94      0.87       184

    accuracy                           0.77       267
   macro avg       0.48      0.50      0.49       267
weighted avg       0.70      0.77      0.73       267


Service Classification:
Accuracy: 74.53%
              precision    recall  f1-score   support

    Negative       0.83      0.62      0.71        63
        None       0.69      0.79      0.73        98
    Positive       0.77      0.78      0.78       106

    accuracy                           0.75       267
   macro avg       0.76      0.73      0.74       267
weighted avg       0.75      0.75      0.74       267


Atmosphere Classification:
Accuracy: 74.91%
              precision    recall  f1-score   support

    Negative       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
texts = ["The food was amazing!", "The service was slow.", "Loved the atmosphere!"]

# Predict using Logistic Regression
predictions_lr = classifier_lr.predict(texts)
print("Logistic Regression Predictions:", predictions_lr)

# Predict using Random Forest
predictions_rf = classifier_rf.predict(texts)
print("Random Forest Predictions:", predictions_rf)

# Predict using SVM
predictions_svm = classifier_svm.predict(texts)
print("SVM Predictions:", predictions_svm)



Logistic Regression Predictions: [{'food': 'Positive', 'service': 'None', 'atmosphere': 'None'}, {'food': 'Positive', 'service': 'Positive', 'atmosphere': 'None'}, {'food': 'Positive', 'service': 'Positive', 'atmosphere': 'Positive'}]
Random Forest Predictions: [{'food': 'Positive', 'service': 'None', 'atmosphere': 'None'}, {'food': 'Positive', 'service': 'Positive', 'atmosphere': 'None'}, {'food': 'Positive', 'service': 'None', 'atmosphere': 'Positive'}]
SVM Predictions: [{'food': 'Positive', 'service': 'None', 'atmosphere': 'None'}, {'food': 'Positive', 'service': 'Negative', 'atmosphere': 'None'}, {'food': 'Positive', 'service': 'Positive', 'atmosphere': 'Positive'}]


In [26]:
# load test_data_reviews
test_data = pd.read_csv('../test_data_reviews/2_4_review.csv')

# Predict using Logistic Regression
predictions_lr = classifier_lr.predict(test_data['text'][0:5])
#print review text and predicted ratings
for i in range(5):
    print(test_data['text'][i])
    print("Logistic Regression Predictions:", predictions_lr[i])
    print("\n")

embarrassment leaflets left guest house call dozen connections lack response went personally reaching place add sunday restro
Logistic Regression Predictions: {'food': 'Positive', 'service': 'None', 'atmosphere': 'None'}


eating terrible kebabs fall apart order kebab get menu everything looks terrible unpleasant mean service waiting time also long
Logistic Regression Predictions: {'food': 'Negative', 'service': 'Negative', 'atmosphere': 'None'}


got kebab ordered indicated address kebab one big fat failure not recommend anyone pizza stuck cardboard packaging
Logistic Regression Predictions: {'food': 'Positive', 'service': 'None', 'atmosphere': 'None'}


lover good jokes pizzeria bacon sticks box like gypsy benefit canned ham caught promotion beautifully decorates vomit price zlotys think lightly
Logistic Regression Predictions: {'food': 'Positive', 'service': 'None', 'atmosphere': 'None'}


not understand negative opinions addition season pizza made moment order serve minutes e much 