<a href="https://colab.research.google.com/github/SamJSui/KnoxReviews_Recommendation_System/blob/main/EDA/KnoxMeals_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Introduction

In [None]:
# Mount Google Drive at /content/drive so later cells can read files from it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Prologue
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Train Test Split
from sklearn.model_selection import train_test_split

# NLP
import nltk
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords 
from nltk.tokenize import WordPunctTokenizer
import string

from scipy import sparse

pd.options.display.max_columns = 999

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data Preparation

Read the reviews table into a DataFrame and preview its first rows.

In [None]:
# Reviews table on the mounted Google Drive.
REVIEWS_CSV = '/content/drive/MyDrive/KnoxReviews_Recommandation_Engine/data/reviews.csv'

df_reviews = pd.read_csv(REVIEWS_CSV)

# Show the first rows as a quick check on the load.
df_reviews.head()

Unnamed: 0,Restaurant,Time of Review,Review Author,Author ID,Review Rating,Review Text,Review Likes
0,Parkside Grill,09/18/2022 01:56:48,Baird Montgomery,100599669051573872330,4,Great steak and pasta meals. Quick service and...,0
1,Parkside Grill,09/18/2022 01:03:21,Brooke Robinette,100120446110809657279,4,Excellent food and service just a slightly hig...,0
2,Parkside Grill,09/17/2022 01:26:03,Riley,117968823744066661691,5,"Amazing food, great service, people were so fr...",0
3,Parkside Grill,09/16/2022 13:49:59,Eric Johnson,102325409891965001150,5,Dropped into this establishment for a quick lu...,0
4,Parkside Grill,09/09/2022 22:55:49,Karl Myers,109209093038981174322,5,"Great relaxing atmosphere, service great food...",0


In [None]:
# (rows, columns) of the loaded reviews table.
df_reviews.shape

(33937, 7)

Removing any null values from the dataset

In [None]:
# `dropna()` returns a new frame; the result has to be assigned back, otherwise
# the rows with missing values stay in `df_reviews` and reach the later cells.
df_reviews = df_reviews.dropna()

# Sanity check: every column should now report False.
df_reviews.isnull().any()

Restaurant        False
Time of Review    False
Review Author     False
Author ID         False
Review Rating     False
Review Text       False
Review Likes      False
dtype: bool

## Data Cleaning

In [None]:
# NLTK's English stopwords with punctuation characters stripped out, so the
# entries line up with review text whose punctuation has been removed.
stop = [
    ''.join(char for char in word if char not in string.punctuation)
    for word in stopwords.words('english')
]

In [None]:
# Normalise the review column to plain strings. Missing reviews become empty
# strings; a bare astype(str) would turn NaN into the literal word "nan".
review_text = df_reviews['Review Text'].fillna('').astype(str)

# Strip the marker attached to machine-translated reviews. regex=False makes the
# match literal on every pandas version: read as a regex, the parentheses in
# "(Translated by Google)" are a capture group, so only the inner words would
# be removed and an empty "()" would be left behind.
review_text = (
    review_text
    .str.replace("(Translated by Google)", "", regex=False)
    .str.replace("Translated by Google", "", regex=False)
)

df_reviews['Review Text'] = review_text

  


In [None]:
def text_process(mess):
    """
    Normalise one review string into a space-separated string of kept words.

    Steps, in order:
    1. Drop every non-ASCII character and every ``string.punctuation`` character.
    2. Lower-case the text.
    3. Remove the "translated by google" marker (and the shorter
       "translated google" form).
    4. Remove every word found in the module-level ``stop`` collection.

    Parameters
    ----------
    mess : str
        Raw review text.

    Returns
    -------
    str
        The remaining words joined by single spaces; '' if nothing is left.
    """
    # Characters are deleted (not replaced by spaces), matching how the
    # entries of `stop` had their punctuation stripped.
    punctuation_table = str.maketrans('', '', string.punctuation)
    ascii_only = ''.join(char for char in mess if char.isascii())
    cleaned = ascii_only.translate(punctuation_table).lower()

    # The marker must be removed *before* stopword filtering: at this point the
    # text still contains "by", so searching only for "translated google"
    # would never match the full phrase.
    for marker in ("translated by google", "translated google"):
        cleaned = cleaned.replace(marker, "")

    # split() with no argument also collapses repeated/leading whitespace.
    return " ".join(word for word in cleaned.split() if word not in stop)

# Run the cleaner over every review and store the result back in the column.
cleaned_reviews = df_reviews['Review Text'].map(text_process)
df_reviews['Review Text'] = cleaned_reviews

In [None]:
# Two column subsets of the reviews: one keyed by reviewer, one by restaurant.
shared_columns = ['Review Text', 'Review Rating']
df_user = df_reviews[['Author ID'] + shared_columns]
df_business = df_reviews[['Restaurant'] + shared_columns]

In [None]:
# Preview the first rows of the per-author column subset.
df_user.head()

Unnamed: 0,Author ID,Review Text,Review Rating
0,100599669051573872330,great steak pasta meals quick service friendly...,4
1,100120446110809657279,excellent food service slightly higher price e...,4
2,117968823744066661691,amazing food great service people friendly caring,5
3,102325409891965001150,dropped establishment quick lunch recent trip ...,5
4,109209093038981174322,great relaxing atmosphere service great food w...,5


In [None]:
def _join_reviews_by(key):
    """Concatenate all 'Review Text' values that share the same `key`, space-separated."""
    return df_reviews.groupby(key).agg({'Review Text': ' '.join})


df_user_group = _join_reviews_by('Author ID')
df_business_group = _join_reviews_by('Restaurant')

In [None]:
# Display the concatenated review text, one row per 'Author ID'.
df_user_group

Unnamed: 0_level_0,Review Text
Author ID,Unnamed: 1_level_1
100001133782992136042,insanely good definitely recommended
100002772261813018643,visited saturday march 5th mom decided hangout...
100002789207838418095,top 1 list favorite places kiev everything per...
100003285792332079925,great pizza go along high school football combo
100003676116767318619,great location friendly staff
...,...
118439810893469881990,easily top choice eat knoxville food always am...
118441710194218903020,frequent customer melting pot restaurant amazi...
118442319772692428834,food good always service good usual
118445889703692298546,loved


In [None]:
# Display the reviews table after the text-cleaning step.
df_reviews

Unnamed: 0,Restaurant,Time of Review,Review Author,Author ID,Review Rating,Review Text,Review Likes
0,Parkside Grill,09/18/2022 01:56:48,Baird Montgomery,100599669051573872330,4,great steak pasta meals quick service friendly...,0
1,Parkside Grill,09/18/2022 01:03:21,Brooke Robinette,100120446110809657279,4,excellent food service slightly higher price e...,0
2,Parkside Grill,09/17/2022 01:26:03,Riley,117968823744066661691,5,amazing food great service people friendly caring,0
3,Parkside Grill,09/16/2022 13:49:59,Eric Johnson,102325409891965001150,5,dropped establishment quick lunch recent trip ...,0
4,Parkside Grill,09/09/2022 22:55:49,Karl Myers,109209093038981174322,5,great relaxing atmosphere service great food w...,0
...,...,...,...,...,...,...,...
33932,Turquaze (Туркуаз),06/12/2021 10:42:46,Настя Пелешенко,110888098791362774076,5,wonderful restaurant heart kiev beautiful tast...,6
33933,Turquaze (Туркуаз),06/12/2021 10:37:41,Ирина,114302829717458513962,5,beautiful dignified establishment heart kiev e...,5
33934,Turquaze (Туркуаз),06/06/2021 09:37:29,Анастасия Шидловская,106685344361560770190,5,looking place eat deliciously location conveni...,7
33935,Turquaze (Туркуаз),05/30/2021 12:13:15,موضي الدايل,102432905134212520939,5,wonderful eat delicious treat classy original,3
