# libraries

In [1]:
import os, sys
import pandas as pd
import en_core_web_sm
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
from spacy.tokens import Doc
import numpy
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import json
from spacy.matcher import Matcher
import numpy as np


# to import Database class from data_collection folder
module_path = os.path.abspath(os.path.join('../..')+'/data/data_collection')
if module_path not in sys.path:
    sys.path.append(module_path)

# now that the folder is in the path, ../data_collection/database.py can be imported
from storage_managers.database import Database

# initialize spacy
nlp = en_core_web_sm.load()

## Preprocessing functions

In [2]:
def fetch_data() -> pd.DataFrame():
    db = Database()
    # get halal-reviews and its restaurant data
    data_sql = '''SELECT b.platform_id, b.name as restaurant_name, r.review_text, r.username, r.rating,
                concat(review_date,date) as review_date, r.helpful_count, b.address, b.image_url,
                b.lat, b.lng, b.total_review_count, b.total_halal_review_count
                FROM reviews r
                JOIN businesses b
                ON r.restaurant_id = b.platform_id
                WHERE r.review_text IS NOT NULL '''
    data_df = db.select_df(data_sql)
    return data_df

def format_columns(data_df) -> pd.DataFrame():
    str_to_num_cols = ['rating']
    for col in str_to_num_cols:
        data_df[col] = data_df[col].str.extract('(\d+)')
        data_df[col].fillna(-1, inplace=True)
        data_df[col] = data_df[col].astype(int)

    str_to_lower = ['restaurant_name', 'review_text', 'username']
    for col in str_to_lower:
        data_df[col] = data_df[col].str.lower()
    return data_df


def collapse_reviews(data_df, f = {}, agg_col = '') -> pd.DataFrame():
    # combine  reviews text per restaurant
    for col in data_df.columns[data_df.columns != agg_col]:
        if col not in f.keys():
            f[col] = 'first'
    collapsed_df = data_df.groupby(agg_col, as_index=False).agg(f)
    return collapsed_df
    ### ignore these for now:
    ## include date info? then have to consider classification per review and how to aggregate classifications
    #  per restaurant?


def clean_review_text(data_df, nlp, save_doc = True) -> pd.DataFrame():
    # initialize NLP tools
    punctuations = string.punctuation
    stop_words = STOP_WORDS

    # column containing review texts
    reviews_col = data_df.columns[ data_df.columns.str.contains('text')][0]
    if save_doc:
        # initialize column to save Spacy doc object
        data_df['doc'] = ''

    for i, text in data_df[reviews_col].items():
        ## clean (convert to lowercase and remove punctuations and   characters and then strip)
        text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
        ## Tokenize (convert from string to list)
        lst_text = text.split()
        ## remove Stopwords
        lst_text = [word for word in lst_text if word not in STOP_WORDS]
        ## save Spacy doc object
        doc = nlp(' '.join(lst_text)[:1000000])
        if doc == '' : 
            print('index: ', i)
        if save_doc:
            data_df.at[i, 'doc'] = doc
        ## Lemmatisation (convert the word into root word)
        lem_doc = [word.lemma_ for word in doc]
        ## back to string from list
        data_df.loc[i, 'clean_text'] = ' '.join(lem_doc)

    return data_df

def tf_idf(corpus, n=100, ngram=(1)):
    vectorizer = TfidfVectorizer(max_features=n, ngram_range=ngram)
    vectors = vectorizer.fit_transform(corpus)
    names = vectorizer.get_feature_names()
    data = vectors.todense().tolist()
    return names, data

def custom_matcher(nlp, doc_col, pattern=[], pattern_name=''):
    # initialize matcher
    matcher = Matcher(nlp.vocab)

    # specify match pattern
    matcher.add(pattern_name, pattern)

    res_df = pd.DataFrame(columns = [pattern_name, pattern_name+'_count'])
    # create two columns with counts/ presence of the match respectively
    for i, doc in doc_col.iteritems():
        match = matcher(doc)
        count = len(match)
        res_df.loc[i, pattern_name+'_count'] = count
        res_df.loc[i, pattern_name] = True if count>0 else False
    return res_df

def print_top(tfidf_dict, n=10):
    sorted_tfidf_dict = sorted(tfidf_dict, key=lambda x: sum(tfidf_dict[x]) / len(tfidf_dict[x]), reverse=True)[:n]
    for word in sorted_tfidf_dict:
        print(word)

## Extract key words from labeled restaurants

In [7]:
# get review data
test_df = fetch_data()

# add basic features
test_df['halal_in_name'] = test_df.apply(lambda row: True if 'halal' in row['restaurant_name'].lower() else False, axis=1)
test_df['percent_halal_reviews'] = test_df['total_halal_review_count'] / test_df['total_review_count']

# preprocess
test_df = format_columns(test_df)
agg_col = 'platform_id'
f = {'review_text' : lambda x: ' '.join(x), 
     'rating' : 'mean',
     'helpful_count': 'mean',}
for col in test_df.columns:
    if (col != agg_col) and col not in f.keys():
        f[col] = 'first'
test_df = collapse_reviews(test_df, f=f, agg_col=agg_col)

# keep only labeled data
target_df = pd.read_csv('/Users/wesamazaizeh/Desktop/Projects/halal_o_meter/src/features/target_feature/label_target.csv', index_col=0)
target_df['halal'] = target_df['halal'].apply(lambda row: True if row == 'TRUE' else False)
test_df = test_df.merge(target_df[['platform_id', 'halal']], on='platform_id')

# clean up text
test_df = clean_review_text(test_df, nlp=nlp)

# extract 1000 halal keywords
corpus = test_df['clean_text'][test_df['halal']].astype(str)
words, scores = tf_idf(corpus, n=1000, ngram=(1,2))
top_halal = dict( zip(words, scores))
print('Top halal restaurant keywords by avg tfidf score:')
print_top(top_halal, n=10)

lamb gyro
excited
good
halal option
gem
believe
be not
baba
good service
literally


In [13]:
# extract 1000 non-halal keywords
non_corpus = test_df['clean_text'][~test_df['halal']].astype(str)
non_words, non_scores = tf_idf(non_corpus, n=1000, ngram=(1,2))
top_non_halal = dict( zip(non_words, non_scores))
print('Top non-halal restaurant keywords by avg tfidf score:')
print_top(top_non_halal, n=10)

attitude
buy
accommodate
definitely
chili
cart pron
beef
definitely recommend
black
chicken lamb


In [None]:
# How much overlap is there

# Compare relative tf-idf scores rather than exclude

In [None]:
Docs:
    prupose
    good at highly specialized data science things
    end to end (collection, cleaning, modeling, validation, loop, productionize)
    different models, features,
    
    significance
    novelty
    novel method