# Mexican Restaurant Reviews Text pre-processing

#### Module Imports

In [1]:
import pandas as pd 
import numpy as np 
import re 
import matplotlib.pyplot as plt 
import seaborn as sns 
from scipy.stats import stats
import string 
import nltk
from nltk.corpus import stopwords
import string 
from collections import Counter
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

#### Read data

In [2]:
#pandas to read data 

mexican_reviews = pd.read_csv(r"C:\Users\CallumO'Neill\windows_directory\data\master_mexican_restaurant_reviews.csv")


In [3]:
mexican_reviews.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,review_title,review_date,description,date_of_visit
0,0,0,Great breakfast service,8 July 2023,Awa served us breakfast and she was wonderful!...,Date of visit: July 2023
1,1,1,Nice service,8 July 2023,"Dear Guest, Thank you for the feedback and you...",Date of visit: July 2023
2,2,2,Ask for Dhaval! He was great!,8 July 2023,Alston and Caleb amazing staff Very nice food ...,Date of visit: July 2023
3,3,3,Fusion,8 July 2023,"Dear Guest, Thank you so much for your kind wo...",Date of visit: July 2023
4,4,4,Great evening meal out,8 July 2023,Dhivakar our waiter was extraordinary! The foo...,Date of visit: July 2023


#### removal of unnamed: 0 column from data frame 

In [4]:
#function to remove 'Unnamed: 0' and reset index()

def unnamed_removal(data, *columns):
    """Removing the default index column that comes when a dataframe 
    is imported. Also resetting the index """

    removed_column = data.drop(columns=[*columns])
    new_dataframe = removed_column.reset_index(drop=True)
    return new_dataframe

mexican_reviews = unnamed_removal(mexican_reviews, 'Unnamed: 0.1', 'Unnamed: 0')
mexican_reviews.head()


Unnamed: 0,review_title,review_date,description,date_of_visit
0,Great breakfast service,8 July 2023,Awa served us breakfast and she was wonderful!...,Date of visit: July 2023
1,Nice service,8 July 2023,"Dear Guest, Thank you for the feedback and you...",Date of visit: July 2023
2,Ask for Dhaval! He was great!,8 July 2023,Alston and Caleb amazing staff Very nice food ...,Date of visit: July 2023
3,Fusion,8 July 2023,"Dear Guest, Thank you so much for your kind wo...",Date of visit: July 2023
4,Great evening meal out,8 July 2023,Dhivakar our waiter was extraordinary! The foo...,Date of visit: July 2023


#### filtering out descriptions that were manager responses. They start out with 'Dear' or 'Hello'

In [5]:
#filtering out function. ~symbol indicates what "isn't"

def filter_manager_response(df, column, *conditions):
    filtered_df = df[~df[column].str.lower().str.startswith((conditions))].reset_index(drop=True)
    return filtered_df

mexican_reviews = filter_manager_response(mexican_reviews, 'description', 'dear', 'hello')
len(mexican_reviews)

14947

#### Text pre-processing: all lower case and punctuation 

In [6]:
#checking the data types

def type_check(data, *columns):
    """ function that checks the data types of each column """

    for column in columns:
        print(f"{column} data type: {data[column].dtypes}")

#* symbol unpacks the columns 
type_check(mexican_reviews, *mexican_reviews.columns)


review_title data type: object
review_date data type: object
description data type: object
date_of_visit data type: object


In [7]:
#class to convert to lower case and remove punctuation

class CaseAndPunctuationProcessor:
    @staticmethod
    def all_lower(data, *columns):
        """Method that applies to all text columns"""
        
        """ iterating through each column of the data frame """
        for column in columns:
            data[column] = data[column].str.lower()
        return data

    @staticmethod
    def remove_punctuation(text):
        punctuations = string.punctuation

        if isinstance(text, str):
            return text.translate(str.maketrans('','', punctuations))
        
        else:
            return text
    

processor = CaseAndPunctuationProcessor()
lower_data = processor.all_lower(mexican_reviews, 'review_title', 'description').applymap(processor.remove_punctuation)

lower_data.head()


        

Unnamed: 0,review_title,review_date,description,date_of_visit
0,great breakfast service,8 July 2023,awa served us breakfast and she was wonderful ...,Date of visit July 2023
1,ask for dhaval he was great,8 July 2023,alston and caleb amazing staff very nice food ...,Date of visit July 2023
2,great evening meal out,8 July 2023,dhivakar our waiter was extraordinary the food...,Date of visit July 2023
3,fabulous food and cocktails and even more fabu...,7 July 2023,dhaval explained the menu and made the best re...,Date of visit July 2023
4,great service,7 July 2023,alston and caleb were brilliant servers very a...,Date of visit July 2023


In [8]:
review_text_data = lower_data[['review_title', 'description']]
review_text_data.head()

Unnamed: 0,review_title,description
0,great breakfast service,awa served us breakfast and she was wonderful ...
1,ask for dhaval he was great,alston and caleb amazing staff very nice food ...
2,great evening meal out,dhivakar our waiter was extraordinary the food...
3,fabulous food and cocktails and even more fabu...,dhaval explained the menu and made the best re...
4,great service,alston and caleb were brilliant servers very a...


#### Text pre-processing: Removal of stop words 

In [9]:
" ".join(stopwords.words('english'))

"i me my myself we our ours ourselves you you're you've you'll you'd your yours yourself yourselves he him his himself she she's her hers herself it it's its itself they them their theirs themselves what which who whom this that that'll these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don don't should should've now d ll m o re ve y ain aren aren't couldn couldn't didn didn't doesn doesn't hadn hadn't hasn hasn't haven haven't isn isn't ma mightn mightn't mustn mustn't needn needn't shan shan't shouldn shouldn't wasn wasn't weren weren't won won't wouldn wouldn't"

In [10]:
#make stopwords into a set 

stop_words = set(stopwords.words('english'))

In [13]:
#convert review text columns into a list for indexing slicing 

review_text_column_list = review_text_data.columns.to_list()

In [17]:
#stop words removal function
def stopwords_removal(text):
    """ splits words into just one word provided they are not a stop word """

    return ' '.join([word for word in text.split() if word not in stop_words])

for text_column in review_text_column_list:
    review_text_data[text_column] = review_text_data[text_column].apply(lambda stop_word: stopwords_removal(stop_word))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_text_data[text_column] = review_text_data[text_column].apply(lambda stop_word: stopwords_removal(stop_word))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_text_data[text_column] = review_text_data[text_column].apply(lambda stop_word: stopwords_removal(stop_word))


In [18]:
review_text_data

Unnamed: 0,review_title,description
0,great breakfast service,awa served us breakfast wonderful ate restaura...
1,ask dhaval great,alston caleb amazing staff nice food spotless ...
2,great evening meal,dhivakar waiter extraordinary food incredible ...
3,fabulous food cocktails even fabulous service,dhaval explained menu made best recommendation...
4,great service,alston caleb brilliant servers attentive food ...
...,...,...
14942,reliable little stop,great little stop vegetarian one place always ...
14943,feedback,passing waterloo work stopped mi casa lunch fo...
14944,quick easy,usually get treat fridays go route meeting alw...
14945,spicy option spicy personal choice,occasional visit never disappoints ok visit ev...
