In [None]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
from textblob import TextBlob
#from nltk.stem.porter import PorterStemmer

#### Method 1: Extracting aspects by lexicon matching, using lexicons built from a Yelp corpus
Reference : https://medium.com/@borgesalkan/yelp-popular-dishes-with-aspect-based-sentiment-analysis-796c191245bf

Lexicon: the vocabulary of a person, language, or branch of knowledge. It can include the technical terms of a particular subject or field, as well as how those terms are usually used.

Domain-specific lexicons extracted (using NER) from the Yelp review dataset were used.

In [None]:
# The lexicons come from a Yelp corpus, so local food names (e.g. "zi char") may be missing.
def _load_lexicon(csv_path, column):
    """Read a header-less lexicon CSV and return the lower-cased values of `column` as a set."""
    frame = pd.read_csv(csv_path, header=None, names=[column])
    return set(frame[column].str.lower())


FOOD_LEXICONS = _load_lexicon("../data/rule_mining/yelp_lexicon/yelpnlg-lexicons/food.csv", "food")
AMBIENCE_LEXICONS = _load_lexicon("../data/rule_mining/yelp_lexicon/yelpnlg-lexicons/ambience.csv", "ambience")
PRICE_LEXICONS = _load_lexicon("../data/rule_mining/yelp_lexicon/yelpnlg-lexicons/price.csv", "price")
SERVICE_LEXICONS = _load_lexicon("../data/rule_mining/yelp_lexicon/yelpnlg-lexicons/service.csv", "service")

# The two hand-written lexicons below hold aspects extracted from topic modelling.
TIME_LEXICONS = {"time", "queue", "wait"}
PORTION_LEXICONS = {"portion", "size", "serving"}

In [None]:
def _extract_ngrams(data, num):
   n_grams = TextBlob(data).ngrams(num)
   return [' '.join(grams).lower() for grams in n_grams]

def _delete_duplicate_food_n_grams(text, foods):
   foods.sort(key=lambda x: -len(x.split()))  # Sort desc by number of words
   result_foods = []
   for food in foods:
       if food in text:
           text = text.replace(food, '')
           result_foods.append(food)
   
   return result_foods

In [None]:
# Aspect names, position-aligned with the lexicon sets in ASPECTS_LEXICONS
# (extract_aspects walks the two lists together with zip).
aspects = [
    "food",
    "ambience",
    "price",
    "service",
    "time",
    "portion",
]
ASPECTS_LEXICONS = [
    FOOD_LEXICONS,
    AMBIENCE_LEXICONS,
    PRICE_LEXICONS,
    SERVICE_LEXICONS,
    TIME_LEXICONS,
    PORTION_LEXICONS,
]
# Running log of every aspect term matched so far; extract_aspects extends it
# and remove_duplicates later checks membership against it.
all_text = list()

def extract_aspects(text):
    """Find the lexicon terms mentioned in a review, grouped by aspect.

    The review is lower-cased, its 3-, 2- and 1-grams are collected once, and
    for every aspect the n-grams that appear in that aspect's lexicon are
    passed through `_delete_duplicate_food_n_grams` to drop terms covered by a
    longer match.

    Args:
        text: Raw review text.

    Returns:
        A dict mapping aspect name -> list of matched terms. Aspects with no
        match are omitted.

    Side effects:
        Every matched term is also appended to the module-level `all_text`
        list, which `remove_duplicates` consults later.
    """
    text = text.lower()

    # The n-grams depend only on the text, so build them once instead of once
    # per aspect.
    candidates = set()
    for n in range(3, 0, -1):
        candidates.update(_extract_ngrams(text, n))

    all_aspects = {}
    for aspect, lexicon in zip(aspects, ASPECTS_LEXICONS):
        matched = list(candidates.intersection(lexicon))
        matched = _delete_duplicate_food_n_grams(text, matched)
        if matched:
            all_text.extend(matched)
            all_aspects[aspect] = matched
    return all_aspects

In [None]:
# Load the non-test reviews.
reviews = pd.read_csv('../data/train_test/reviews_nontest.csv')
# Keep only rows with a review body; selecting the single "review_body" label
# means `reviews` is now that column on its own (a Series), not a DataFrame.
reviews = reviews.loc[reviews.review_body.notnull(),"review_body"]
# NOTE(review): because `reviews` is a Series here, this appears to add one
# extra *entry* labelled "aspects" (holding the whole mapped Series) rather than
# a column - confirm. The next cell reads it back as `reviews.aspects`, so any
# change here has to be made together with that cell.
reviews["aspects"] = reviews.map(extract_aspects)

In [None]:
# `reviews` is a Series of review texts that also carries the per-review
# aspects under the label "aspects" (set in the previous cell). Build the frame
# from the text entries only, so that bookkeeping entry does not end up as a
# junk row in the CSV.
aspects_by_review = reviews["aspects"]
review_text = reviews[reviews.map(lambda value: isinstance(value, str))]
df = pd.DataFrame({"review": review_text, "aspects": aspects_by_review})
df.to_csv("../data/rule_mining/aspects_yelp.csv")

#### Add Burpple-specific food names to the lexicon list

Use POS tagging to extract food names (usually nouns).

In [None]:
import nltk
import re
import pprint
from nltk import Tree
import pdb

# Grammar used to extract noun phrases (candidate food names).
# The Penn Treebank tag set has four noun tags: NN, NNS, NNP and NNPS. We want
# all of them (proper and common, singular/mass and plural), so the rules use a
# regular expression instead of one rule per tag.
# In an NLTK tag pattern `*` quantifies the preceding character, so `<NN*>`
# would only match N, NN, NNN, ...; `<NN.*>` is the form that matches every tag
# starting with NN.
patterns = """
    NP: {<NN.*>+}
    {<NN.*><CC>*<NN.*>+}
    {<NP><CC><NP>}
    {<RB><NN.*>+}
    """
# Chunking breaks a text up into user-defined units ("chunks") built from
# certain kinds of words (nouns, adjectives, verbs) or phrases (noun phrases,
# verb phrases, prepositional phrases). Unlike a plain string split, NLTK works
# on text in which every word has been tagged with its part of speech.
NPChunker = nltk.RegexpParser(patterns)

def prepare_text(input):
    """Turn raw text into one NPChunker parse result per sentence.

    Each sentence from ``nltk.sent_tokenize`` is word-tokenised, POS-tagged
    and then chunked with the module-level ``NPChunker``.
    """
    parsed = []
    for sentence in nltk.sent_tokenize(input):
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        parsed.append(NPChunker.parse(tagged))
    return parsed

# combine nouns to noun phrases
def parsed_text_to_NP(sentences):
    """Collect the text of every NP subtree found in `sentences`.

    Each element of `sentences` is passed through ``NPChunker.parse`` again;
    for every subtree of the result labelled 'NP', the words of its leaves
    are joined with single spaces. Phrases are returned in traversal order.
    """
    phrases = []
    for sentence in sentences:
        reparsed = NPChunker.parse(sentence)
        phrases.extend(
            ' '.join(token for token, _pos in node.leaves())
            for node in reparsed.subtrees()
            if node.label() == 'NP'  # only noun phrases are of interest
        )
    return phrases

def sent_parse(input):
    """Return the noun phrases of raw text: prepare_text followed by parsed_text_to_NP."""
    return parsed_text_to_NP(prepare_text(input))


In [None]:
# `reviews` is a Series of review texts that can also hold non-text entries
# (an earlier cell stores the extracted aspects under the label "aspects").
# Only string entries are parsed and written out, so sent_parse never receives
# a non-string and the CSV contains review rows only.
review_text = reviews[reviews.map(lambda value: isinstance(value, str))]
food_terms = review_text.map(sent_parse)
reviews["food"] = food_terms  # kept so `reviews.food` stays available as before
df = pd.DataFrame({"review": review_text, "Food": food_terms})
df.to_csv("../data/rule_mining/food_extraction.csv")

In [None]:
import ast
# Combine the lexicon-matched aspects with the POS-extracted noun phrases.
df1 = pd.read_csv("../data/rule_mining/aspects_yelp.csv", index_col=0)
# Read the file from the same ../data tree the extraction cell writes to, and
# restore its saved index so the assignment below lines rows up by review
# rather than by file position.
reviews_new_food = pd.read_csv("../data/rule_mining/food_extraction.csv", index_col=0)
df1["new_food"] = reviews_new_food["Food"]
# Assign the filled columns back instead of `df1.col.fillna(..., inplace=True)`:
# the chained in-place form does not update `df1` under pandas copy-on-write.
df1["new_food"] = df1["new_food"].fillna("[]")
df1["aspects"] = df1["aspects"].fillna("{}")
# The CSV stores each list as its string representation; parse it back and
# lower-case every term.
df1["new_food"] = df1["new_food"].map(lambda raw: [term.lower() for term in ast.literal_eval(raw)])

In [None]:
# Persist the merged frame (aspects plus the new_food column) as a checkpoint.
df1.to_csv("../data/rule_mining/aspects_yelp_with_new.csv")

In [None]:
# Reload the checkpoint written above, using the first CSV column as the index.
# remove_duplicates parses the "new_food" and "aspects" cells with
# ast.literal_eval, i.e. it expects them as strings as read from this file.
df1 = pd.read_csv("../data/rule_mining/aspects_yelp_with_new.csv",index_col=0)
df1.head()

In [None]:
def remove_duplicates(row):
    """Add POS-extracted food terms that the lexicon pass did not match.

    Args:
        row: A mapping/row with string cells "new_food" (a list literal of
            terms) and "aspects" (a dict literal of aspect -> terms). Both are
            parsed with ``ast.literal_eval``.

    Returns:
        The parsed aspects dict, with every term from "new_food" that is not
        in the module-level `all_text` appended to its "food" list. The "food"
        list is created if missing or None; a term is appended at most once.

    Note:
        `all_text` is filled by `extract_aspects`, so this only filters
        anything if that extraction ran earlier in the same kernel session.
    """
    foods_ner = ast.literal_eval(row["new_food"])
    aspects = ast.literal_eval(row["aspects"])
    for word in foods_ner:
        if word in all_text:
            # Already captured by the lexicon matching.
            continue
        if aspects.get("food") is None:
            aspects["food"] = []
        # The same noun phrase can occur several times in one review; keep it once.
        if word not in aspects["food"]:
            aspects["food"].append(word)
    return aspects

In [None]:
# Build the final aspect terms row by row. remove_duplicates consults the
# module-level `all_text` list, which extract_aspects fills while it runs.
df1["aspects_new"] = df1.apply(remove_duplicates,axis=1)
# Output consumed by the rule-mining step that extracts adjectives.
df1.to_csv("../data/rule_mining/final_aspects_terms.csv") # for rule mining to extract out adjectives