# EDA Restaurant Reviews


Using Python 3.13

This notebook is for an initial look at the data and for experimenting with it.
We can add more datasets later, too.

In [6]:
import src.data_cleaning as clean
import src.data_transformation as transform


Space for File Operations


# Examples of the data cleaning module


In [7]:
# Raw review files to load: one TSV, one CSV and one plain-text file.
files = [
    "../data/Restaurant_Reviews_Test.tsv",
    "../data/Restaurant_Reviews.csv",
    "../data/Restaurant_Reviews2.txt",
]

# The cells below look frames up in the result by file name without extension.
# NOTE(review): the meaning of line_length is defined in src.data_cleaning — confirm there.
dfs = clean.load_text_to_df(
    files,
    columns=["Review", "Like"],
    line_length=2,
)


In [8]:
# Frame loaded from Restaurant_Reviews_Test.tsv. Its preview shows a repeated
# review and a row with no label, which the cleaning cells below work on.
df_tsv = dfs["Restaurant_Reviews_Test"]
df_tsv.head()

Unnamed: 0,Review,Like
0,Wow... Loved this place.,1.0
1,Crust is not good .,0.0
2,Wow... Loved this place.,1.0
3,"""""""""",
4,Not tasty and the texture was just nasty.,0.0


In [9]:
# Frame loaded from Restaurant_Reviews.csv; preview the first rows.
df_csv = dfs["Restaurant_Reviews"]
df_csv.head()

Unnamed: 0,Review,Like
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [10]:
# Frame loaded from Restaurant_Reviews2.txt; preview the first rows.
# NOTE(review): in the preview, row 0 holds the header text ("Review", "Liked")
# as data — presumably the file's header line was read as a record; confirm
# against load_text_to_df before using this frame.
df_txt = dfs["Restaurant_Reviews2"]
df_txt.head()

Unnamed: 0,Review,Like
0,Review,Liked
1,Wow... Loved this place.,1
2,Crust is not good.,0
3,Not tasty and the texture was just nasty.,0
4,Stopped by during the late May bank holiday of...,1


In [11]:
# Step 1 of cleaning: drop near-duplicate reviews from the "Review" column.
# NOTE(review): threshold=90 is presumably a similarity score cut-off — confirm
# its scale in src.data_cleaning.
df_tsv_remove_dup = clean.remove_duplicates_fuzzy(
    df_tsv,
    "Review",
    threshold=90,
)
df_tsv_remove_dup.head()

Unnamed: 0,Review,Like
0,Wow... Loved this place.,1.0
1,Crust is not good .,0.0
2,"""""""""",
3,Not tasty and the texture was just nasty.,0.0
4,Stopped by during the late May bank holiday of...,1.0


In [12]:
# Step 2 of cleaning: normalize the review text
# (lowercase, strip punctuation, trim whitespace).
df_tsv_norm = clean.normalize_text(
    df_tsv_remove_dup,
    "Review",
)
df_tsv_norm.head()

Unnamed: 0,Review,Like
0,wow loved this place,1.0
1,crust is not good,0.0
2,,
3,not tasty and the texture was just nasty,0.0
4,stopped by during the late may bank holiday of...,1.0


In [13]:
# Handle Missing Values
# Step 3 of cleaning: continue from the normalized frame so the steps are
# chained explicitly (dedupe -> normalize -> missing values). Passing the
# original df_tsv here would only give a deduplicated, normalized result if
# the earlier steps had modified it in place, which a reader cannot see.
df_tsv_na = clean.handle_missing_values(df_tsv_norm, "Review")
df_tsv_na.head()

Unnamed: 0,Review,Like
0,wow loved this place,1.0
1,crust is not good,0.0
2,not tasty and the texture was just nasty,0.0
3,stopped by during the late may bank holiday of...,1.0
4,the selection on the menu was great and so wer...,1.0


In [14]:
# Reload the original "dirty" TSV so the mass cleaning function below
# starts from untouched data.
# NOTE(review): this load uses line_length=0 while the first load used 2 —
# confirm the difference is intentional.
df_test = clean.load_text_to_df(
    ["../data/Restaurant_Reviews_Test.tsv"],
    columns=["Review", "Like"],
    line_length=0,
)
df_tsv_dirty = df_test["Restaurant_Reviews_Test"]
df_tsv_dirty.head()

Unnamed: 0,Review,Like
0,Wow... Loved this place.,1.0
1,Crust is not good .,0.0
2,Wow... Loved this place.,1.0
3,"""""""""",
4,Not tasty and the texture was just nasty.,0.0


In [15]:
# Mass cleaning function: one call that removes duplicates, normalizes the
# text and handles missing values on the "Review" column.
cleaned_df_tsv = clean.clean_dataframe(
    df_tsv_dirty,
    "Review",
)
cleaned_df_tsv.head()

Unnamed: 0,Review,Like
0,wow loved this place,1.0
1,crust is not good,0.0
2,not tasty and the texture was just nasty,0.0
3,stopped by during the late may bank holiday of...,1.0
4,the selection on the menu was great and so wer...,1.0


# Examples of the data transformation module


In [16]:
# Tokenization example: split each cleaned review into a list of tokens.
# The preview shows the tokens in a "Tokenized Text" column.
cleaned_df_tsv_tokenized = transform.tokenize_dataframe(
    cleaned_df_tsv,
    "Review",
)
cleaned_df_tsv_tokenized.head()

Unnamed: 0,Review,Like,Tokenized Text
0,wow loved this place,1.0,"[wow, loved, this, place]"
1,crust is not good,0.0,"[crust, is, not, good]"
2,not tasty and the texture was just nasty,0.0,"[not, tasty, and, the, texture, was, just, nasty]"
3,stopped by during the late may bank holiday of...,1.0,"[stopped, by, during, the, late, may, bank, ho..."
4,the selection on the menu was great and so wer...,1.0,"[the, selection, on, the, menu, was, great, an..."


In [17]:
# Stopword removal example: write the filtered tokens to a new
# "Review No Stopwords" column. No custom stopword list is supplied.
cleaned_df_tsv_stopword = transform.remove_stopwords(
    cleaned_df_tsv_tokenized,
    "Review",
    custom_stopword=None,
    new_column="Review No Stopwords",
)
cleaned_df_tsv_stopword.head()

Unnamed: 0,Review,Like,Tokenized Text,Review No Stopwords
0,wow loved this place,1.0,"[wow, loved, this, place]","[wow, loved, place]"
1,crust is not good,0.0,"[crust, is, not, good]","[crust, not, good]"
2,not tasty and the texture was just nasty,0.0,"[not, tasty, and, the, texture, was, just, nasty]","[not, tasty, texture, just, nasty]"
3,stopped by during the late may bank holiday of...,1.0,"[stopped, by, during, the, late, may, bank, ho...","[stopped, during, late, may, bank, holiday, of..."
4,the selection on the menu was great and so wer...,1.0,"[the, selection, on, the, menu, was, great, an...","[selection, menu, great, so, prices]"


In [18]:
# Sentiment labelling example: add a "Review Sentiment" column derived from
# the "Review" text.
# NOTE(review): in the preview the labels disagree with the "Like" column —
# "crust is not good" (Like 0.0) is labelled Positive and "wow loved this
# place" (Like 1.0) is labelled Neutral. Check the labeller before relying on it.
cleaned_df_tsv_labeled = transform.label_data_sentiment(
    cleaned_df_tsv_stopword,
    "Review",
    new_column="Review Sentiment",
)
cleaned_df_tsv_labeled.head()

Unnamed: 0,Review,Like,Tokenized Text,Review No Stopwords,Review Sentiment
0,wow loved this place,1.0,"[wow, loved, this, place]","[wow, loved, place]",Neutral
1,crust is not good,0.0,"[crust, is, not, good]","[crust, not, good]",Positive
2,not tasty and the texture was just nasty,0.0,"[not, tasty, and, the, texture, was, just, nasty]","[not, tasty, texture, just, nasty]",Neutral
3,stopped by during the late may bank holiday of...,1.0,"[stopped, by, during, the, late, may, bank, ho...","[stopped, during, late, may, bank, holiday, of...",Neutral
4,the selection on the menu was great and so wer...,1.0,"[the, selection, on, the, menu, was, great, an...","[selection, menu, great, so, prices]",Positive
