# libraries

In [1]:
import sys
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import en_core_web_sm
from collections import Counter
from spacy.matcher import PhraseMatcher, Matcher


%matplotlib inline

# to import Database class from data_collection folder
module_path = os.path.abspath(os.path.join('../..')+'/data/data_collection')
if module_path not in sys.path:
    sys.path.append(module_path)

# now that the folder is in the path, ../data_collection/database.py can be imported
from database import Database

# Import raw features dataframe with Spacy Docs

In [2]:
# read pickle of dataframe with all reviews data, including Spacy Doc, and features
data_df = pd.read_pickle('/Users/wesamazaizeh/Desktop/Projects/halal_o_meter/src/features/feature_engineering/feature_engineering_v2_last_result.pkl')

# Import FP from XGBoost with oversampling model in Baseline_v4

In [3]:
# read pickles
FP_list = pd.read_pickle(os.getcwd() + '/FP_list_v2.pkl')
TN_list = pd.read_pickle(os.getcwd() + '/TN_list_v2.pkl')

# save FP reviews to csv for easier viewing
FP_df = data_df.loc[FP_list]
TN_df = data_df.loc[TN_list]
# FP_df.to_csv(os.getcwd() + '/FP_reviews_v2.csv')

# get word frequency
def most_common_n(text, n):
    nlp = en_core_web_sm.load()
    doc = nlp(text) # since I cannot join Doc objects easily I'll combine the strings
    #remove stopwords, punctuations, and whitespaces
    words = [token.lemma_.lower() for token in doc if token.is_stop != True and token.is_punct != True and token.is_space != True]
    word_freq = Counter(words)
    common_words = word_freq.most_common(n)
    return (common_words)
    
FP_top_50 = most_common_n(". ".join(FP_df['review_text']), 50) # 50 most common words from all FP reviews
TN_top_50 = most_common_n(". ".join(TN_df['review_text']), 50) # 50 most common words from all TN reviews
print(FP_top_50)

[('food', 58), ('halal', 51), ('good', 41), ('chicken', 36), ('burger', 35), ('place', 33), ('order', 24), ('great', 21), ('eat', 20), ('fry', 19), ('meat', 17), ('like', 17), ('try', 16), ('restaurant', 14), ('come', 14), ('$', 12), ('time', 12), ('shawarma', 11), ('2', 11), ('service', 11), ('thing', 11), ('beef', 10), ('go', 10), ('look', 10), ('ask', 10), ('price', 10), ('serve', 10), ('indian', 10), ('kosher', 9), ('lunch', 9), ('guy', 9), ('taste', 9), ('bowl', 9), ('know', 8), ('get', 8), ('3', 8), ('include', 8), ('sauce', 8), ('spice', 8), ('menu', 8), ('medium', 8), ('bacon', 8), ('entree', 7), ('sandwich', 7), ('people', 7), ('want', 7), ('salad', 7), ('say', 7), ('star', 7), ('hot', 7)]


In [4]:
print(TN_top_50)

[('food', 148), ('halal', 147), ('good', 118), ('cart', 95), ('chicken', 95), ('like', 87), ('sauce', 78), ('order', 69), ('come', 60), ('place', 55), ('$', 55), ('rice', 55), ('lamb', 53), ('try', 51), ('time', 50), ('eat', 49), ('get', 48), ('meat', 48), ('taste', 44), ('go', 44), ('great', 42), ('sandwich', 39), ('salad', 37), ('know', 36), ('service', 35), ('definitely', 35), ('white', 35), ('restaurant', 34), ('hot', 34), ('platter', 34), ('think', 32), ('guy', 31), ('lunch', 31), ('line', 30), ('delicious', 30), ('dish', 30), ('gyro', 28), ('grill', 28), ('want', 28), ('noodle', 28), ('ask', 27), ('price', 27), ('meal', 26), ('long', 26), ('burger', 26), ('pita', 26), ('day', 25), ('little', 25), ('find', 24), ('fry', 24)]


### get sentencs around the word halal

In [5]:
nlp = en_core_web_sm.load()
doc = nlp(". ".join(FP_df['review_text']))
sents = [doc[token.i:token.i-1].sent for token in doc if token.text.lower() == 'halal']

In [6]:
sents[:5]

[For the people concerned about the Halal meat, only the chicken is Halal as per their server.,
 For the people concerned about the Halal meat, only the chicken is Halal as per their server.,
 However, i live in oak park and prefer halal/kosher food.,
 To my knowledge, this is the only halal restaurant in Oak Park.,
 But, only the chicken is halal, nothing else.]

In [7]:
from spacy import displacy

displacy.render(sents[0], style="dep")

  "__main__", mod_spec)



Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


# New Features:
    - Seperate different meats in halal_relevant b/c their distribution seems to be different in FP & TN
    - Add 'only' matching if sentence also includes halal to see if it improves model. If not, match for 'only' with halal as part of token's head/ children.

### 1. Seperate halal relevant phrases

In [15]:
# initialize nlp if necessary
nlp = en_core_web_sm.load()

def halal_match_and_count(doc_col, pattern):
    # initialize phrase matcher
    matcher = PhraseMatcher(nlp.vocab)
    # add phrases
    matcher.add(pattern, None, nlp(pattern))
    # phrase match
    res = []
    count = []
    for i, doc in doc_col.iteritems():
        match = matcher(doc)
        res.append(True if len(match) else False)
        count.append(len(match))
    return res, count

def get_match_data(text):
    pattern = ' '.join(['halal', text])
    bool_col_name = pattern.replace(' ', '_')
    count_col_name = bool_col_name + '_count'
    data_df[bool_col_name], data_df[count_col_name] = halal_match_and_count(data_df['doc'], pattern)
    if (bool_col_name in data_df.columns) and (count_col_name in data_df.columns):
        print('Matched and counted "Halal {}"'.format(text))

word_list = ['chicken', 'beef', 'lamb', 'goat', 'burger', 'fried', 'meat',
             'place', 'spot', 'options', 'restaurant', 'place']
for word in word_list:
    get_match_data(word)

Matched and counted "Halal chicken"
Matched and counted "Halal beef"
Matched and counted "Halal lamb"
Matched and counted "Halal goat"
Matched and counted "Halal burger"
Matched and counted "Halal fried"
Matched and counted "Halal meat"
Matched and counted "Halal place"
Matched and counted "Halal spot"
Matched and counted "Halal options"
Matched and counted "Halal restaurant"
Matched and counted "Halal place"


In [16]:
pd.crosstab(data_df['halal'], data_df['halal_chicken'])

halal_chicken,False,True
halal,Unnamed: 1_level_1,Unnamed: 2_level_1
False,211,14
True,563,49


### 2. Redefine halal_relevant to include 'is halal' only and change name to is_halal

In [17]:
# initialize nlp if necessary
nlp = en_core_web_sm.load()

def halal_matches_and_counts(doc_col, patterns):
    # initialize phrase matcher
    matcher = PhraseMatcher(nlp.vocab)
    # add phrases
    matcher.add('matcher', None, nlp(*patterns))
    # phrase match
    res = []
    count = []
    for i, doc in doc_col.iteritems():
        match = matcher(doc)
        res.append(True if len(match) else False)
        count.append(len(match))
    return res, count

data_df['is_halal'], data_df['is_halal_count'] = halal_matches_and_counts(data_df['doc'], ('is halal', "'s halal'"))

In [18]:
pd.crosstab(data_df['halal'], data_df['is_halal'])

is_halal,False,True
halal,Unnamed: 1_level_1,Unnamed: 2_level_1
False,202,23
True,461,151


### 3. Roughly detect partial halal establishments

In [19]:
# initialize nlp if necessary
nlp = en_core_web_sm.load()

def double_match_and_count(doc_col, word_1, word_2):
    # initialize phrase matcher
    matcher_1 = Matcher(nlp.vocab)
    matcher_2 = Matcher(nlp.vocab)
    # add matching patterns
    pattern1 = [{'LOWER': word_1}]; matcher_1.add(word_1, None, pattern1)
    pattern2 = [{'LOWER': word_2}]; matcher_2.add(word_2, None, pattern2)
    # match for word_1 first, which preferably should be less common
    res = []
    count = []
    for i, doc in doc_col.iteritems():
        double_match_counter = 0 
        first_matches = matcher_1(doc)
        for match_id, start, end in first_matches:
            doc2 = nlp(doc[start].sent.text)
            second_match = matcher_2(doc2)
            if len(second_match):
                double_match_counter+=1
        res.append(True if double_match_counter>0 else False)
        count.append(double_match_counter)
    return res, count
    
data_df['partial_halal'], data_df['partial_halal_count'] = double_match_and_count(data_df['doc'], 'only', 'halal')

In [20]:
pd.crosstab(data_df['halal'], data_df['partial_halal'])

partial_halal,False,True
halal,Unnamed: 1_level_1,Unnamed: 2_level_1
False,189,36
True,480,132


## Pickle dataframe with features for modeling 

In [21]:
file_name = os.getcwd() + '/restaurant_cat_and_num_v3.pkl'
data_df.drop(['review_text', 'review_date', 'restaurant_name', 'doc'] ,axis=1).to_pickle(file_name)

In [24]:
data_df.columns

Index(['review_text', 'review_date', 'halal_review_count', 'restaurant_name',
       'total_review_count', 'halal', 'halal_in_name', 'halal_review_percent',
       'doc', 'halal_relevant_count', 'halal_relevant', 'halal_negation_count',
       'halal_negation', 'halal_truck_count', 'halal_truck',
       'non_halal_relevant_count', 'non_halal_relevant', 'creekstone_count',
       'creekstone', 'hala_bacon_count', 'halal_bacon',
       'halal_negation_percent', 'halal_chicken', 'halal_chicken_count',
       'halal_beef', 'halal_beef_count', 'halal_lamb', 'halal_lamb_count',
       'halal_goat', 'halal_goat_count', 'halal_burger', 'halal_burger_count',
       'halal_fried', 'halal_fried_count', 'halal_meat', 'halal_meat_count',
       'halal_place', 'halal_place_count', 'halal_spot', 'halal_spot_count',
       'halal_options', 'halal_options_count', 'halal_restaurant',
       'halal_restaurant_count', 'is_halal', 'is_halal_count', 'partial_halal',
       'partial_halal_count'],
      dtyp