# ***Tanguy Dabadie NLP Project - Basic Model***

In [1]:
import numpy as np
import pandas as pd

import plotly.express as px
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split


Let's start from the DataFrame we built in the Exploratory Data Analysis notebook :

In [2]:
# Load the raw reviews CSV
file = pd.read_csv('Restaurant reviews.csv')

# Keep only the review text and its rating, then discard rows whose rating is
# missing, the literal 'Like', or a half-star value
unused_columns = ['Restaurant', 'Reviewer', 'Metadata', 'Time', 'Pictures', '7514']
excluded_ratings = ['Like', '1.5', '2.5', '3.5', '4.5']
df = (
    pd.DataFrame(file)
    .drop(columns=unused_columns)
    .assign(Rating=lambda frame: frame['Rating'].replace('nan', np.nan))
    .dropna(subset=['Rating'])
    .loc[lambda frame: ~frame['Rating'].isin(excluded_ratings)]
)

print(f" The rating unique values are : {df['Rating'].unique()}")
df

 The rating unique values are : ['5' '4' '1' '3' '2']


Unnamed: 0,Review,Rating
0,"The ambience was good, food was quite good . h...",5
1,Ambience is too good for a pleasant evening. S...,5
2,A must try.. great food great ambience. Thnx f...,5
3,Soumen das and Arun was a great guy. Only beca...,5
4,Food is good.we ordered Kodi drumsticks and ba...,5
...,...,...
9991,I was never a fan of Chinese food until I visi...,5
9992,I visited this restaurant with friends and was...,5
9993,"Im going to cut to the chase, The food is exce...",5
9995,Madhumathi Mahajan Well to start with nice cou...,3


## ***1. Preprocessing the data***

Let's import the preprocessing function we created in the preprocessing_function.py file:

In [3]:
from preprocessing_function import preprocess_text, matrix_report

In [4]:
def _preprocess_or_blank(review):
    """Return the preprocessed review text, or "" when the entry is a float (e.g. NaN)."""
    if isinstance(review, float):
        return ""
    return preprocess_text(str(review))


# Only the first 200 reviews are preprocessed for the exploration below
processed = df['Review'].iloc[:200].apply(_preprocess_or_blank)
processed

0      ambience good food quite good saturday lunch c...
1      ambience good pleasant evening service prompt ...
2      must try great food great ambience thnx servic...
3      soumen da arun great guy behavior sincerety go...
4      food ordered kodi drumstick basket mutton biry...
                             ...                        
195    chiken musallam really good dat keema also nic...
196    experienced best haleem murg musalam taste goo...
197    nyc restaurant come taste haleem even veg sooo...
198    tasty good health haleem atmosphere also good ...
199    tasty specially biryani satisfied service good...
Name: Review, Length: 200, dtype: object

As indicated in the file, our preprocessing function tokenizes the text, removes punctuation, converts it to lowercase, removes stopwords and lemmatizes the remaining words.

## Most and least common words

Let's look at the most and least common words in our text:

In [5]:
from collections import Counter

# Count word frequencies over the preprocessed reviews.

# 1: join all the preprocessed reviews into a single string
joined_text = " ".join(processed)

# 2: tokenize the text on whitespace with split()
split = joined_text.split()

# 3: count how many times each token occurs
word_counter = Counter(split)

# 4: most_common() without an argument returns every (word, count) pair,
#    ordered from the most frequent to the least frequent
most_common = word_counter.most_common()

most_common[:10]

[('good', 287),
 ('food', 181),
 ('service', 141),
 ('place', 105),
 ('biryani', 70),
 ('ambience', 69),
 ('nice', 68),
 ('visit', 59),
 ('time', 55),
 ('staff', 53)]

In [6]:
# Tail of the frequency-sorted list: the ten last (word, count) pairs.
# Counter.most_common() keeps first-seen order among equal counts, so when many
# words share the lowest count these are simply the last of them to appear.
most_common[-10:]

[('murg', 1),
 ('musalam', 1),
 ('respectable', 1),
 ('nyc', 1),
 ('sooo', 1),
 ('imporvement', 1),
 ('health', 1),
 ('atmosphere', 1),
 ('sense', 1),
 ('humour', 1)]

## Feature extraction

In [7]:
# Encode the preprocessed reviews with a TF-IDF vectorizer (default settings)
tfidf = TfidfVectorizer()
# fit_transform learns the vocabulary and IDF weights from `processed`, then encodes each review as one row
processed_vectors = tfidf.fit_transform(processed)
# Display the result's repr (a sparse matrix: one row per review, one column per vocabulary term)
processed_vectors

<200x1083 sparse matrix of type '<class 'numpy.float64'>'
	with 3717 stored elements in Compressed Sparse Row format>

In [8]:
# Densify the sparse TF-IDF matrix into a DataFrame: one row per review,
# columns labelled by integer term index (not by the words themselves)
tfidf_df = pd.DataFrame(processed_vectors.toarray())
tfidf_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1073,1074,1075,1076,1077,1078,1079,1080,1081,1082
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Use imshow to plot the TF-IDF vector representations as a heatmap.
# The title is built from the actual shape of the matrix so it cannot drift
# out of sync with the data (it previously hard-coded a stale column count).
px.imshow(
    tfidf_df,
    color_continuous_scale='RdBu_r',
    title=f"Representation of [{tfidf_df.shape[0]} x {tfidf_df.shape[1]}] TF-IDF encodings",
)

## ***2. Train a baseline model***

The goal here is simply to obtain a baseline model which we'll use as a reference for future experiments. Let's train a first machine learning model without any particular parameter tuning or feature engineering.

In [10]:
# Ratings are kept as strings in the DataFrame, so the labels are the strings '1' to '5'
class_labels = [str(rating) for rating in range(1, 6)]

In [11]:
# Features are the raw review texts, targets are the ratings
X, y = df['Review'], df['Rating']

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible
test_size, random_state = 0.2, 42

# Build the training and testing sets used by the machine learning model
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
)

## Model definition

In [12]:
# Step 1: bag-of-words counts, using the project's preprocess_text as the vectorizer's preprocessor
count_step = ("CountVectorizer", CountVectorizer(preprocessor=preprocess_text))
# Step 2: multinomial Naive Bayes classifier with default parameters
nb_step = ("MultinomialNB", MultinomialNB())

text_clf = Pipeline([count_step, nb_step])

I encountered an issue: some reviews in the X_train data are NaN. I therefore removed the corresponding rows from X_train and y_train:

In [13]:
# Missing reviews (NaN) cannot be vectorized, so drop them together with their labels.
# Boolean mask that is True where the training review is present
mask = X_train.notnull()

# Apply the mask to X_train and y_train so features and labels stay aligned
X_train = X_train[mask]
y_train = y_train[mask]

# Apply the same filter to the test split: a NaN review there would break
# predict() in the same way, depending on how the random split falls
test_mask = X_test.notnull()
X_test = X_test[test_mask]
y_test = y_test[test_mask]

## Fitting and prediction

In [14]:
# Pipeline.fit returns the fitted pipeline itself, so training and inference can be chained
y_pred = text_clf.fit(X_train, y_train).predict(X_test)

## Report

In [15]:
# Compare the held-out labels with the predictions using the project helper;
# the output below shows a per-class precision/recall/F1 report.
# NOTE(review): the helper's name suggests it also draws a confusion matrix — confirm in preprocessing_function.py
matrix_report(rep_y_true= y_test, rep_y_pred= y_pred, class_labels= class_labels)

              precision    recall  f1-score   support

           1       0.72      0.82      0.76       355
           2       0.12      0.01      0.01       146
           3       0.47      0.19      0.28       231
           4       0.48      0.61      0.54       481
           5       0.73      0.82      0.78       751

    accuracy                           0.64      1964
   macro avg       0.50      0.49      0.47      1964
weighted avg       0.59      0.64      0.60      1964



### What we can see from that first model :

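First, we obtain an overall accuracy of 0.64, which is not very good, but there is a likely explanation: classes 2 and 3 have few samples to help the model learn (146 and 231 in the test set, versus 751 for class 5), and they are the classes the model recognises worst (recall of 0.01 and 0.19).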