# libraries

In [1]:
import sys
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import en_core_web_sm
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher
from spacy import displacy


%matplotlib inline

# to import Database class from data_collection folder
module_path = os.path.abspath(os.path.join('../..')+'/data/data_collection')
if module_path not in sys.path:
    sys.path.append(module_path)

# now that the folder is in the path, ../data_collection/database.py can be imported
from storage_managers.database import Database

# Import reviews data and target feature

In [5]:
db = Database()

# get halal-reviews (reviews that include the word 'halal')
reviews_sql = '''SELECT * FROM reviews'''
reviews_df = db.select_df(reviews_sql)

In [6]:
# clean up review data and group by restaurant

# drop Aya Kitchen
aya_id = 'y6BfLt9Gvrq2JsJvjkjdIQ'
reviews_df.drop(reviews_df[reviews_df['restaurant_id'] == aya_id].index, inplace=True)

# group reviews per restaurant
grouped_reviews_df = reviews_df.groupby('restaurant_id').agg(lambda x: ' '.join(x)) # combine review text
grouped_reviews_df['review_count'] = grouped_reviews_df['review_date'].apply(lambda x: len(x)) # count reviews per restaurnat

In [7]:
grouped_reviews_df.head()

Unnamed: 0_level_0,review_text,review_date,review_count
restaurant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-6tSx9IXwt1btreBtcyZ0g,Mexican Corn Dogs is very tasty.\nSausage Plat...,3/8/2020 3/23/2020 1/24/2020 3/10/2019 7/11/20...,117
-7Dq_NtTyd7WV-Nt1f-o9A,Love love love their wings and mozzarella stic...,5/31/2016 9/11/2017,19
-D-ye8DU5KndJ7mqBpKU1g,Excellent halal cart (and I eat at a lot of ha...,3/21/2020 7/31/2018,19
-E5PfobEXNQ_WC8Qv2wGJA,"I ordered wings , each order is supposed to be...",5/29/2019 5/14/2019 3/22/2020 5/6/2019 4/16/20...,78
-GYT90fOCNjTbjCjj4wuiw,I feel compelled to leave a review for my very...,3/16/2019 6/3/2016 8/10/2007 5/2/2013 5/3/2010...,57


# Import FP and TN from Random Forest Model in Baseline_3

In [11]:
# read pickles
FP_list = pd.read_pickle(os.getcwd() + '/FP_list.pkl')
TN_list = pd.read_pickle(os.getcwd() + '/TN_list.pkl')

# save FP reviews to csv for easier viewing
grouped_reviews_df.loc[FP_list, 'review_text'].to_csv(os.getcwd() + '/FP_reviews.csv')

### Add 'halal cart(s)' to halal_truck feature

In [15]:
def is_halal_truck(doc_col):
    # initialize phrase matcher
    matcher = PhraseMatcher(nlp.vocab)
    # add phrases
    matcher.add('halal', None, nlp('halal guys'), nlp('halal truck'), nlp('halal cart'), nlp('halal carts'))
    # phrase match
    res = []
    count = []
    for i, doc in doc_col.iteritems():
        match = matcher(doc)
        res.append(True if len(match) else False)
        count.append(len(match))
    return res, count

In [23]:
# # read pickle of dataframe with all reviews data, including Spacy Doc, and features
# data_df = pd.read_pickle('/Users/wesamazaizeh/Desktop/Projects/halal_o_meter/src/features/feature_engineering/feature_engineering_v2_last_result.pkl')

# regenerate modified halal_truck feature that includes instances of halal cart(s)
nlp = en_core_web_sm.load()
data_df['halal_truck'], data_df['halal_truck_count'] = is_halal_truck(data_df['doc'])
data_df['halal_truck_percent'] = data_df['halal_truck_count'] / data_df['halal_review_count']
pd.crosstab( data_df['halal'], data_df['halal_truck'])

halal_truck,False,True
halal,Unnamed: 1_level_1,Unnamed: 2_level_1
False,117,108
True,415,197


- Modified halal_truck feature captures 108 out of the 225 non-halal restaurant, and 197/612 halal restaurants.
- This is compared to 41/225 non-halal and 98/612 halal restaurants previously. The 

## Pickle dataframe with features for modeling 

In [24]:
file_name = os.getcwd() + '/restaurant_cat_and_num_v2.pkl'
data_df.drop(['review_text', 'review_date', 'restaurant_name', 'doc'] ,axis=1).to_pickle(file_name)