Importing required packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from imblearn.over_sampling import SMOTE

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Reading in the data

In [3]:
# Reading the reviews
reviews_labelled= pd.read_csv("/content/drive/MyDrive/ise540/labelled_with_zeros.csv")

In [4]:
reviews_labelled.head()

Unnamed: 0.1,Unnamed: 0,Review,Review_Rating,Service,Food Quality,Ambiance/Atmosphere,Wait Time,Price,Menu Variety
0,0,The New Mexican a style cuisine here is damn g...,5.0,4.0,5.0,0.0,0.0,0.0,4.0
1,1,The Eastern European food is great but do not ...,5.0,0.0,5.0,0.0,5.0,0.0,5.0
2,2,Great service and great scene. Two levels with...,5.0,5.0,0.0,5.0,5.0,0.0,0.0
3,3,Had dinner here with my co-workers and this pl...,5.0,5.0,5.0,5.0,0.0,0.0,5.0
4,4,We have dined at Scoogi's many times and have ...,4.0,3.0,5.0,0.0,0.0,0.0,5.0


Checking the dataset for any null values

In [5]:
reviews_labelled = reviews_labelled.drop(columns='Unnamed: 0')

In [6]:
reviews_labelled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Review               5000 non-null   object 
 1   Review_Rating        5000 non-null   float64
 2   Service              5000 non-null   float64
 3   Food Quality         5000 non-null   float64
 4   Ambiance/Atmosphere  5000 non-null   float64
 5   Wait Time            5000 non-null   float64
 6   Price                5000 non-null   float64
 7   Menu Variety         5000 non-null   float64
dtypes: float64(7), object(1)
memory usage: 312.6+ KB


In [7]:
reviews_labelled.isna().sum()

Review                 0
Review_Rating          0
Service                0
Food Quality           0
Ambiance/Atmosphere    0
Wait Time              0
Price                  0
Menu Variety           0
dtype: int64

Cleaning the Review text and preparing it for vectorization

In [8]:
# cleaning the reviews text

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [9]:
def preprocess_text(text, stp_wrds):
  # Remove punctuation and convert to lowercase
  text = text.translate(str.maketrans('', '', string.punctuation)).lower()
  # Tokenize the text
  tokens = word_tokenize(text)
  if stp_wrds:
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
  return (' '.join(tokens))

In [10]:
reviews_labelled['Review_cleaned_without_stpwrds'] = reviews_labelled['Review'].apply(preprocess_text, args=(True,))
reviews_labelled['Review_cleaned_with_stpwrds'] = reviews_labelled['Review'].apply(preprocess_text, args=(False,))

reviews_labelled.head()

Unnamed: 0,Review,Review_Rating,Service,Food Quality,Ambiance/Atmosphere,Wait Time,Price,Menu Variety,Review_cleaned_without_stpwrds,Review_cleaned_with_stpwrds
0,The New Mexican a style cuisine here is damn g...,5.0,4.0,5.0,0.0,0.0,0.0,4.0,new mexican style cuisine damn good love break...,the new mexican a style cuisine here is damn g...
1,The Eastern European food is great but do not ...,5.0,0.0,5.0,0.0,5.0,0.0,5.0,eastern european food great sleep burger low k...,the eastern european food is great but do not ...
2,Great service and great scene. Two levels with...,5.0,5.0,0.0,5.0,5.0,0.0,0.0,great service great scene two level live music...,great service and great scene two levels with ...
3,Had dinner here with my co-workers and this pl...,5.0,5.0,5.0,5.0,0.0,0.0,5.0,dinner coworkers place didnt disappoint great ...,had dinner here with my coworkers and this pla...
4,We have dined at Scoogi's many times and have ...,4.0,3.0,5.0,0.0,0.0,0.0,5.0,dined scoogis many time always left happy full...,we have dined at scoogis many times and have a...


In [11]:
X_df = reviews_labelled['Review_cleaned_without_stpwrds']
X_df.head()


0    new mexican style cuisine damn good love break...
1    eastern european food great sleep burger low k...
2    great service great scene two level live music...
3    dinner coworkers place didnt disappoint great ...
4    dined scoogis many time always left happy full...
Name: Review_cleaned_without_stpwrds, dtype: object

In [12]:
Y_df = reviews_labelled[['Service', 'Food Quality', 'Ambiance/Atmosphere', 'Wait Time', 'Price', 'Menu Variety']]
Y_df.head()

Unnamed: 0,Service,Food Quality,Ambiance/Atmosphere,Wait Time,Price,Menu Variety
0,4.0,5.0,0.0,0.0,0.0,4.0
1,0.0,5.0,0.0,5.0,0.0,5.0
2,5.0,0.0,5.0,5.0,0.0,0.0
3,5.0,5.0,5.0,0.0,0.0,5.0
4,3.0,5.0,0.0,0.0,0.0,5.0


Vectorizing cleaned Reviews using TF-IDF

In [13]:
# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = vectorizer.fit_transform(X_df)


tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
tfidf_df = tfidf_df.reindex(Y_df.index)

df_combined = pd.concat([tfidf_df, Y_df], axis=1)
df_combined.head()

Unnamed: 0,10,12,15,20,25,30,35,40,45,50,...,young,youre,yum,yummy,Service,Food Quality,Ambiance/Atmosphere,Wait Time,Price,Menu Variety
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.232567,0.0,4.0,5.0,0.0,0.0,0.0,4.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.213567,0.0,0.0,0.0,5.0,0.0,5.0,0.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5.0,0.0,5.0,5.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.097312,0.0,0.0,5.0,5.0,5.0,0.0,0.0,5.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,5.0,0.0,0.0,0.0,5.0


In [14]:
# Features (X) are the TF-IDF values
X = df_combined.drop(['Service', 'Food Quality', 'Ambiance/Atmosphere', 'Wait Time', 'Price', 'Menu Variety'], axis=1)

# Labels (y) are the class labels
y = df_combined[['Service', 'Food Quality', 'Ambiance/Atmosphere', 'Wait Time', 'Price', 'Menu Variety']]

In [15]:
X

Unnamed: 0,10,12,15,20,25,30,35,40,45,50,...,year,yelp,yes,yesterday,yet,youll,young,youre,yum,yummy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.232567,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.213567,0.000000,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.097312,0.000000,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.131824,0.0,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0
4996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0
4997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0
4998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0


In [16]:
y

Unnamed: 0,Service,Food Quality,Ambiance/Atmosphere,Wait Time,Price,Menu Variety
0,4.0,5.0,0.0,0.0,0.0,4.0
1,0.0,5.0,0.0,5.0,0.0,5.0
2,5.0,0.0,5.0,5.0,0.0,0.0
3,5.0,5.0,5.0,0.0,0.0,5.0
4,3.0,5.0,0.0,0.0,0.0,5.0
...,...,...,...,...,...,...
4995,0.0,5.0,0.0,0.0,0.0,5.0
4996,5.0,5.0,5.0,0.0,5.0,5.0
4997,5.0,0.0,0.0,0.0,0.0,0.0
4998,4.0,3.0,4.0,0.0,0.0,0.0


Multiclass classification Model

In [17]:
class Ensemble_Classifier:
  def __init__(self):
    self.service_classifier = None
    self.food_qlt_classifier = None
    self.atmos_classifier = None
    self.wt_classifier = None
    self.price_classifier = None
    self.menu_classifier = None

  def fit(self, X_train, y_train, classifier="decision_tree"):
    self.service_classifier = OneAttributeClassifier(X_train, y_train[:,0], classifier)
    self.food_qlt_classifier = OneAttributeClassifier(X_train, y_train[:,1], classifier)
    self.atmos_classifier = OneAttributeClassifier(X_train, y_train[:,2], classifier)
    self.wt_classifier = OneAttributeClassifier(X_train, y_train[:,3], classifier)
    self.price_classifier = OneAttributeClassifier(X_train, y_train[:,4], classifier)
    self.menu_classifier = OneAttributeClassifier(X_train, y_train[:,5], classifier)

  def predict(self, X_test):
    service_preds = self.service_classifier.predict(X_test)
    food_qlt_preds = self.food_qlt_classifier.predict(X_test)
    atmos_preds = self.atmos_classifier.predict(X_test)
    wt_preds = self.wt_classifier.predict(X_test)
    price_preds = self.price_classifier.predict(X_test)
    menu_preds = self.menu_classifier.predict(X_test)

    multiclass_preds = np.column_stack((service_preds, food_qlt_preds, atmos_preds, wt_preds, price_preds, menu_preds))
    return multiclass_preds

In [18]:
def OneAttributeClassifier(X, y, classifier="decision_tree"):

  # transform the dataset to balance the class samples
  oversample = SMOTE()
  balanced_x, balanced_y = oversample.fit_resample(X, y)

  if classifier == "decision_tree":
    model = DecisionTreeClassifier(max_depth = 10).fit(balanced_x, balanced_y)
  elif classifier == "naive_bayes":
    model = MultinomialNB().fit(balanced_x, balanced_y)
  elif classifier == "svm":
    model = SVC(kernel = 'rbf').fit(balanced_x, balanced_y)
  elif classifier == "knn":
    model = KNeighborsClassifier(n_neighbors = 7).fit(X_train, y_train)

  return model


In [19]:
def acc(ground_truth, predictions):
  non_zero_positions = (ground_truth != 0) & (predictions != 0)
  correct_predictions = (ground_truth[non_zero_positions] == predictions[non_zero_positions])
  accuracy = correct_predictions.sum() / non_zero_positions.sum() if non_zero_positions.sum() > 0 else 0
  return accuracy


def multiclass_accuracy(y_true, y_pred):
  labels = ['Service', 'Food Quality', 'Atmosphere/Ambiance', 'Wait Time', 'Price', 'Menu Variety']

  sum_acc = 0

  for i in range(len(labels)):
    temp_acc = acc(y_true[:,i], y_pred[:,i])
    sum_acc += temp_acc
    print(f"{labels[i]} Accuracy: {temp_acc*100:.3f}%")

  mean_acc = sum_acc / 6

  print(f"Average accuracy: {mean_acc*100:.3f}%")


Training the model

In [20]:
X = X.to_numpy()
y = y.to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

Using Decision Tree classifers

In [21]:
multiclass_classifier = Ensemble_Classifier()
multiclass_classifier.fit(X_train, y_train)
preds = multiclass_classifier.predict(X_test)

In [22]:
multiclass_accuracy(y_test, preds)

Service Accuracy: 44.017%
Food Quality Accuracy: 41.023%
Atmosphere/Ambiance Accuracy: 49.412%
Wait Time Accuracy: 36.458%
Price Accuracy: 36.098%
Menu Variety Accuracy: 43.333%
Average accuracy: 41.723%


Using Naive Bayes classifiers

In [23]:
multiclass_classifier = Ensemble_Classifier()
multiclass_classifier.fit(X_train, y_train, classifier="naive_bayes")
preds = multiclass_classifier.predict(X_test)

In [24]:
multiclass_accuracy(y_test, preds)

Service Accuracy: 55.660%
Food Quality Accuracy: 55.610%
Atmosphere/Ambiance Accuracy: 53.713%
Wait Time Accuracy: 53.769%
Price Accuracy: 48.438%
Menu Variety Accuracy: 57.895%
Average accuracy: 54.181%


Using SVM classifiers

In [28]:
multiclass_classifier = Ensemble_Classifier()
multiclass_classifier.fit(X_train, y_train, classifier="svm")
preds = multiclass_classifier.predict(X_test)

In [29]:
multiclass_accuracy(y_test, preds)

Service Accuracy: 68.036%
Food Quality Accuracy: 59.721%
Atmosphere/Ambiance Accuracy: 67.308%
Wait Time Accuracy: 75.758%
Price Accuracy: 55.000%
Menu Variety Accuracy: 79.167%
Average accuracy: 67.498%


In [30]:
def acc(ground_truth, predictions):
  non_zero_positions = (ground_truth != 0) & (predictions != 0)
  correct_predictions = (ground_truth[non_zero_positions] == predictions[non_zero_positions])
  accuracy = correct_predictions.sum() / non_zero_positions.sum() if non_zero_positions.sum() > 0 else 0
  return accuracy


def multiclass_accuracy1(y_true, y_pred):
  labels = ['Service', 'Food Quality', 'Atmosphere/Ambiance', 'Wait Time', 'Price', 'Menu Variety']

  sum_acc = 0

  for i in range(len(labels)):
    temp_acc = acc(y_true[:,i], y_pred[:,i])
    sum_acc += temp_acc
    print(f"{labels[i]} Accuracy: {temp_acc*100:.3f}%")

  mean_acc = sum_acc / 6

  print(f"Average accuracy: {mean_acc*100:.3f}%")
  return mean_acc

In [32]:
# Running SVM for 5 times
# Number of iterations
num_iterations = 5

# List to store accuracy scores
accuracy_scores = []

# Run the SVM model multiple times
for _ in range(num_iterations):
    # Fit the model to the training data
    multiclass_classifier.fit(X_train, y_train, classifier="svm")

    # Make predictions on the test set
    preds = multiclass_classifier.predict(X_test)

    # Calculate accuracy and append to the list
    accuracy = multiclass_accuracy1(y_test, preds)
    accuracy_scores.append(accuracy)
    print("-------------------------------")

# Print the accuracy scores
for i, accuracy in enumerate(accuracy_scores, 1):
    print(f"Iteration {i}: Accuracy = {accuracy}")


Service Accuracy: 67.621%
Food Quality Accuracy: 59.053%
Atmosphere/Ambiance Accuracy: 67.290%
Wait Time Accuracy: 75.758%
Price Accuracy: 57.143%
Menu Variety Accuracy: 77.778%
Average accuracy: 67.440%
-------------------------------
Service Accuracy: 67.318%
Food Quality Accuracy: 59.852%
Atmosphere/Ambiance Accuracy: 71.569%
Wait Time Accuracy: 75.000%
Price Accuracy: 46.667%
Menu Variety Accuracy: 80.000%
Average accuracy: 66.734%
-------------------------------
Service Accuracy: 68.393%
Food Quality Accuracy: 59.276%
Atmosphere/Ambiance Accuracy: 67.308%
Wait Time Accuracy: 75.000%
Price Accuracy: 55.000%
Menu Variety Accuracy: 78.723%
Average accuracy: 67.283%
-------------------------------
Service Accuracy: 67.679%
Food Quality Accuracy: 58.698%
Atmosphere/Ambiance Accuracy: 68.224%
Wait Time Accuracy: 73.529%
Price Accuracy: 57.143%
Menu Variety Accuracy: 80.000%
Average accuracy: 67.545%
-------------------------------
Service Accuracy: 68.941%
Food Quality Accuracy: 60.316%

In [37]:
avg_acc = 0
# Print the accuracy scores
for i, accuracy in enumerate(accuracy_scores, 1):
    print(f"Iteration {i}: Accuracy = {accuracy*100:.3f}%")
    avg_acc += (accuracy*100)
print("Average accuracy of the 5 iterations = ",avg_acc/5)

Iteration 1: Accuracy = 67.440%
Iteration 2: Accuracy = 66.734%
Iteration 3: Accuracy = 67.283%
Iteration 4: Accuracy = 67.545%
Iteration 5: Accuracy = 67.368%
Average accuracy of the 5 iterations =  67.27420895187319


Using KNN classifiers

In [25]:
multiclass_classifier = Ensemble_Classifier()
multiclass_classifier.fit(X_train, y_train, classifier="knn")
preds = multiclass_classifier.predict(X_test)

In [26]:
multiclass_accuracy(y_test, preds)

Service Accuracy: 60.168%
Food Quality Accuracy: 53.689%
Atmosphere/Ambiance Accuracy: 51.852%
Wait Time Accuracy: 59.459%
Price Accuracy: 50.000%
Menu Variety Accuracy: 62.745%
Average accuracy: 56.319%


Vectorizing cleaned Reviews using word2vec

In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

In [None]:
X_df = reviews_labelled['Review_cleaned_with_stpwrds']
X_df.head()

0    the new mexican a style cuisine here is damn g...
1    the eastern european food is great but do not ...
2    great service and great scene two levels with ...
3    had dinner here with my coworkers and this pla...
4    we have dined at scoogis many times and have a...
Name: Review_cleaned_with_stpwrds, dtype: object

In [None]:
tokenized_reviews = [word_tokenize(review.lower()) for review in X_df]

In [None]:
model = Word2Vec(sentences=tokenized_reviews, vector_size=600, window=5, min_count=1, workers=4)

In [None]:
def average_word_vectors(words, model, num_features):
    feature_vector = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in words:
        if word in model.wv:
            nwords += 1
            feature_vector = np.add(feature_vector, model.wv[word])

    if nwords > 0:
        feature_vector = np.divide(feature_vector, nwords)

    return feature_vector

In [None]:
num_features = 600
review_vectors = [average_word_vectors(words, model, num_features) for words in tokenized_reviews]

In [None]:
review_vectors = np.array(review_vectors)
review_vectors.shape

(5000, 600)

In [None]:
review_vectors

array([[ 0.3541788 , -0.02073769,  0.23443381, ..., -0.22455208,
         0.08243532, -0.19788782],
       [ 0.27921224, -0.00779332,  0.1844329 , ..., -0.25068343,
         0.08288784, -0.21675383],
       [ 0.31287816,  0.07218107,  0.03257269, ..., -0.25145587,
         0.04620647, -0.1201149 ],
       ...,
       [ 0.33951864, -0.07765932,  0.04883456, ..., -0.17442721,
         0.03595851, -0.13202581],
       [ 0.3233184 , -0.00886238,  0.09482341, ..., -0.19799496,
         0.02325204, -0.09679104],
       [ 0.30739933, -0.02704054,  0.18469694, ..., -0.17747292,
        -0.00331634, -0.09047659]], dtype=float32)

In [None]:
labels = Y_df.to_numpy()
labels.shape

(5000, 6)

In [None]:
labels

array([[4., 5., 0., 0., 0., 4.],
       [0., 5., 0., 5., 0., 5.],
       [5., 0., 5., 5., 0., 0.],
       ...,
       [5., 0., 0., 0., 0., 0.],
       [4., 3., 4., 0., 0., 0.],
       [0., 3., 4., 0., 4., 0.]])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(review_vectors, labels, random_state = 0)

Using Decision Tree Classifiers

In [None]:
multiclass_classifier = Ensemble_Classifier()
multiclass_classifier.fit(X_train, y_train, classifier="decision_tree")
preds = multiclass_classifier.predict(X_test)

In [None]:
multiclass_accuracy(y_test, preds)

Service Accuracy: 36.682%
Food Quality Accuracy: 31.827%
Atmosphere/Ambiance Accuracy: 40.155%
Wait Time Accuracy: 35.417%
Price Accuracy: 30.992%
Menu Variety Accuracy: 38.908%
Average accuracy: 35.663%


Using SVM Classifiers

In [None]:
multiclass_classifier = Ensemble_Classifier()
multiclass_classifier.fit(X_train, y_train, classifier="svm")
preds = multiclass_classifier.predict(X_test)

In [None]:
multiclass_accuracy(y_test, preds)

Service Accuracy: 48.373%
Food Quality Accuracy: 44.070%
Atmosphere/Ambiance Accuracy: 47.031%
Wait Time Accuracy: 45.339%
Price Accuracy: 38.947%
Menu Variety Accuracy: 42.815%
Average accuracy: 44.429%


Using KNN Classifers

In [None]:
multiclass_classifier = Ensemble_Classifier()
multiclass_classifier.fit(X_train, y_train, classifier="knn")
preds = multiclass_classifier.predict(X_test)

In [None]:
multiclass_accuracy(y_test, preds)

Service Accuracy: 50.596%
Food Quality Accuracy: 47.533%
Atmosphere/Ambiance Accuracy: 54.676%
Wait Time Accuracy: 70.968%
Price Accuracy: 25.000%
Menu Variety Accuracy: 63.380%
Average accuracy: 52.026%
