# libraries

In [4]:
import sys
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import en_core_web_sm
from collections import Counter
from spacy.matcher import PhraseMatcher, Matcher


%matplotlib inline

# to import Database class from data_collection folder
# NOTE: '../..' is relative, so abspath resolves it against the current working directory;
# the notebook has to be run from the folder it was written for.
module_path = os.path.abspath(os.path.join('../..')+'/data/data_collection')
if module_path not in sys.path:
    sys.path.append(module_path)

# now that the folder is in the path, the storage_managers.database module can be imported
# NOTE(review): presumably this is data_collection/storage_managers/database.py - confirm
from storage_managers.database import Database

# initialize spacy by loading the en_core_web_sm model; nlp.vocab is used by the matchers below
nlp = en_core_web_sm.load()

# Import the raw features dataframe with spaCy Docs

In [2]:
# read pickle of dataframe with all reviews data, including Spacy Doc, and features
# NOTE(review): the path is absolute and machine-specific; it has to be edited to run elsewhere.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
data_df = pd.read_pickle('/Users/wesamazaizeh/Desktop/Projects/halal_o_meter/src/features/feature_engineering/features_draft_v2.2.pkl')

## Make features from the most common words before and after 'halal' in the review text

In [5]:
def common_at_position_from_halal(doc_col, position=1, n=10):
    '''
    Count the tokens found at a fixed offset from every 'halal' token.

    Input:
        - doc_col: iterable of Spacy Doc objects
        - position: token offset relative to the matched 'halal' token
          (1 = the token right after it, -1 = the token right before it)
        - n: number of most common entries to return
    Output:
        - list of (word, count) tuples, most common first. Words are lowercased. Stop words,
          punctuation and whitespace tokens are all counted together under the empty string ''
    '''
    # initialize matcher
    matcher = Matcher(nlp.vocab)
    # specify match pattern: any token whose lowercase form is 'halal'
    pattern = [{'LOWER': 'halal'}]
    # NOTE(review): (key, on_match, *patterns) is the spaCy v2 call signature; spaCy v3 expects
    # matcher.add('halal', [pattern]) - confirm against the installed version
    matcher.add('halal', None, pattern)
    word_freqs = Counter()
    for doc in doc_col:
        for match_id, start, end in matcher(doc):
            idx = start + position
            # Skip neighbours that fall outside the doc. Without the lower bound a negative
            # index (e.g. 'halal' as first token with position=-1) silently wraps around to
            # the end of the doc. `continue` rather than `break`, so later matches still count.
            if idx < 0 or idx >= len(doc):
                continue
            token = doc[idx]
            if token.is_stop or token.is_punct or token.is_space:
                # uninformative neighbours share the '' bucket
                word_freqs[''] += 1
            else:
                word_freqs[token.text.lower()] += 1

    return word_freqs.most_common(n)
    
# (word, count) pairs for the 100 most common tokens right after (position=1) and
# right before (position=-1) 'halal', over all docs in data_df['doc']
common100_after_halal = common_at_position_from_halal(data_df['doc'], position=1, n=100)
common100_before_halal = common_at_position_from_halal(data_df['doc'], position=-1, n=100)

In [6]:
# Keep only the neighbouring words that were seen more than 10 times.
# Each entry of common100_* is a (word, count) pair; the boolean mask marks the pairs to keep.
# The '' entry, when it passes the threshold, stays in the lists (no need to append it by hand).
more_than_10counts = [count > 10 for _, count in common100_after_halal]
words_after = [
    pair[0]
    for pair, keep in zip(common100_after_halal, more_than_10counts)
    if keep
]

more_than_10counts = [count > 10 for _, count in common100_before_halal]
words_before = [
    pair[0]
    for pair, keep in zip(common100_before_halal, more_than_10counts)
    if keep
]

def count_matches(doc_col, words, position=1):
    '''
    Input: 
        - doc_col: Pandas Series with Spacy Doc object for every restaurants' collection of reviews
        - words: list of lowercase words to look for next to 'halal'. An empty string '' means
          'halal' on its own, i.e. the count of all 'halal' tokens
        - position: > 0 counts 'halal X' (column 'halal_X_count'); otherwise the column is
          'X_halal_count', and 'X halal' is counted when position < 0
    Output:
        - Pandas Dataframe with columns of counts of 'halal X' for X in arg:words. The counts are
          per restaurant reviews. The index is the index of arg:doc_col
    '''
    def column_name(word):
        # single place that defines the naming scheme of the created columns
        return 'halal_' + word + '_count' if position>0 else word + '_halal_count'

    # dataframe with created columns
    df = pd.DataFrame(columns=[column_name(word) for word in words], index=doc_col.index)
    total = len(words)
    # c is only used for progress reporting
    for c, word in enumerate(words, start=1):
        # initialize matcher
        matcher = Matcher(nlp.vocab)
        # specify match pattern
        if position>0 and word:
            pattern = [{'LOWER': 'halal'}, {'LOWER': word}]
        elif position<0 and word:
            pattern = [{'LOWER': word}, {'LOWER': 'halal'}]
        else:
            # empty word (or position == 0): match 'halal' alone
            pattern = [{'LOWER': 'halal'}]
        # NOTE(review): (key, on_match, *patterns) is the spaCy v2 call signature; spaCy v3 expects
        # matcher.add('halal', [pattern]) - confirm against the installed version
        matcher.add('halal', None, pattern)
        col_name = column_name(word)
        # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
        for i, doc in doc_col.items():
            df.loc[i, col_name] = len(matcher(doc))
        print('[{}/{}]'.format(c, total), end='\r', flush=True)
    return df

# per-restaurant counts of 'halal X' (df_after) and 'X halal' (df_before) for the selected words
df_after = count_matches(data_df['doc'], words_after, position=1)
df_before = count_matches(data_df['doc'], words_before, position=-1)

[43/43]

In [7]:
def process(df):
    '''
    Turn count columns into frequencies by dividing them by the divisor column.

    Input:
        - df: Pandas Dataframe of '*_count' columns. Exactly one column must start with '_' or
          contain '__' (the column created for the empty word, e.g. 'halal__count' or
          '_halal_count'); it is used as the divisor
    Output:
        - new Pandas Dataframe without the divisor column, every other column divided row-wise
          by it and its '_count' suffix renamed to '_freq'. arg:df is not modified
    Raises:
        - ValueError if there is not exactly one divisor column
    '''
    div_col = [col for col in df.columns if col.startswith('_') or '__' in col]
    if len(div_col) != 1:
        # zero divisor columns cannot be broadcast, and several could silently divide
        # element-wise against the wrong columns - fail loudly instead
        raise ValueError(
            'expected exactly one divisor column, found {}: {}'.format(len(div_col), div_col))

    df2 = df.drop(div_col, axis=1)
    # (n_rows, 1) array: pandas broadcasts it across all remaining columns
    df2 = df2.div(df[div_col].values)

    def to_freq(name):
        # only swap the '_count' suffix; str.replace would also mangle words that
        # contain 'count' (e.g. 'halal_counter_count')
        suffix = '_count'
        if name.endswith(suffix):
            return name[:-len(suffix)] + '_freq'
        return name

    return df2.rename(columns=to_freq)
    
# replace the count dataframes with their frequency versions
df_after = process(df_after)
df_before = process(df_before)

In [8]:
# combine new features with necessary ones from version 4. Then, pickle and save
# the three dataframes are concatenated column-wise (axis=1), aligned on their index
data_df2 = pd.concat([data_df[['halal_in_name', 'halal_review_percent', 'halal']], df_after, df_before], axis=1)

# the pickle is written to the current working directory
file_name_v5 = os.getcwd() + '/restaurant_cat_and_num_v5.pkl'
data_df2.to_pickle(file_name_v5)