In [1]:
import pickle
import re
import spacy
from spacy import displacy
import pandas as pd
nlp = spacy.load("en_core_web_sm")

# Import Reviews as DataFrame

In [2]:
#data = pd.read_csv('reviews.csv')

data = "Affordable Korean Food At Funan. Beef kalbi was disappointing, kimchi pancake is on the saltier side.... Fried chicken is normal..."

# Init functions to remove and extract hashtags

In [3]:
def removeHashTags(review_str):
    
    '''
    Input: Review in string format.
    Output: String of the review with starting hashtags all removed.
            (hashtags in the middle of reviews will not be removed
            to preserve phrasing)
    
    E.g.
    
    removeHashTags("#Amazing #Delicious #Foodcoma #iHateHashTags The wagyu beef was the #BEST i've every tried")
    
    >> The wagyu beef was the BEST i've ever tried.
    '''
    
    review = review_str.split()
    
    temp = 0
    
    for i in range(len(review)):
        
        text = review[i]
        
        if text[0] == '#' and temp == i:
            review[i] = ' '
            temp += 1
            
        elif text[0] == '#':
            review[i] = review[i].lstrip('#')
    
    return ' '.join(review).lstrip()


###-------------------------------------------###
###-------------------------------------------###
###-------------------------------------------###


def getHashTags(review_str):
    
    '''
    Input: review text in string format.
    Output: List of HashTags
    
    e.g. getHashTags('The #amazing wagyu #beef was so #juicy')
    >>> [#amazing, #beef, #juicy]
    '''
    
    return re.findall('#\w+',review_str)

In [4]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp(data)

text = []
pos = []
dep = []
for token in doc:
    text.append(token.text)
    pos.append(token.pos_)
    dep.append(token.dep_)
    
df = pd.DataFrame(list(zip(text, pos, dep)), 
               columns =['text', 'pos','dep']) 

options = {"compact": True,"font": "Source Sans Pro"}
displacy.serve(doc, style="dep", options=options)
df

  "__main__", mod_spec)



Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


Unnamed: 0,text,pos,dep
0,Affordable,ADJ,amod
1,Korean,ADJ,compound
2,Food,PROPN,ROOT
3,At,ADP,prep
4,Funan,PROPN,pobj
5,.,PUNCT,punct
6,Beef,PROPN,compound
7,kalbi,PROPN,nsubj
8,was,AUX,ccomp
9,disappointing,ADJ,acomp


# Get Nouns

A 2-stage process is used to extract all nouns and adjective phrases.

<ol>
    <li>We __extract all unigram nouns__ and append it into the "results" list.</li>
<li>Next, we __extract bigram and trigram units__ that have predefined POS tag sequences, and continue appending the extracted results to the "results" list. The current implementation below uses 7 rules.</li>
 </ol>

In [8]:
# Store all results in this list
results = []
doc = nlp(data)

for w in doc:
    if w.pos_ == 'NOUN':
        results.append(str(w))

# POS and Dependency Extraction

In [23]:
# results = []

# For each review
doc = nlp(data)

text_list = []
pos_list =[]
dep_list=[]

# For each word in the review
for w in doc:

    extract = None

    text_list.append(str(w.text))
    pos_list.append(w.pos_)
    dep_list.append(w.dep_)

    if len(text_list) > 3:
        # Rule 1
        if pos_list[-1] in ['NOUN', 'PROPN'] and pos_list[-2] in ['NOUN', 'PROPN'] and pos_list[-3] == 'ADJ':
            extract = text_list[-3] + ' ' +  text_list[-2] + ' ' + text_list[-1]
            print('Rule 1: ',extract)

        # Rule 2
        elif pos_list[-1] in ['NOUN', 'PROPN'] and pos_list[-2] in ['NOUN', 'PROPN'] and pos_list[-3] == 'VERB':
            extract = text_list[-3] + ' ' + text_list[-2] + ' ' + text_list[-1]
            print('Rule 2 ',extract)

        # Rule 3
        elif pos_list[-1] in ['NOUN', 'PROPN'] and pos_list[-2] in ['NOUN', 'PROPN'] and pos_list[-3] == 'NOUN':
            extract = text_list[-3] + ' ' + text_list[-2] + ' ' + text_list[-1]
            print('Rule 3: ',extract)

        # Rule 4
        elif pos_list[-1] in ['NOUN', 'PROPN'] and pos_list[-2] in ['NOUN', 'PROPN']:
            extract = text_list[-2] + ' ' + text_list[-1]
            print('Rule 4: ',extract,'({},{})'.format(pos_list[-2], pos_list[-1]))

        # Rule 5
        elif pos_list[-1] in ['NOUN', 'PROPN'] and pos_list[-2] == 'VERB':
            extract = text_list[-2] + ' ' + text_list[-1]
            print('Rule 5: ',extract)

        # Rule 6
        elif pos_list[-1] in ['NOUN', 'PROPN'] and pos_list[-2] == 'ADJ':
            extract = text_list[-2] + ' ' + text_list[-1]
            print('Rule 6: ',extract, '({},{})'.format(pos_list[-2], pos_list[-1]))

        # Rule 7
        if pos_list[-1] in ['NOUN', 'PROPN'] and dep_list[-1]=='dobj':
            extract = text_list[-2] + ' ' + text_list[-1]
            print('Rule 7: ',extract)

        if extract != None:

            results.append(extract)
    

Rule 4:  Beef kalbi (PROPN,PROPN)
Rule 4:  kimchi pancake (PROPN,PROPN)
Rule 6:  saltier side (ADJ,NOUN)
Rule 6:  Fried chicken (ADJ,NOUN)


# Format and export results as txt file

Returns all unique elements in the "results" list, and sorts all elements from A-Z

In [19]:
unique_results = sorted(list(set(results)))

print("Export successful: {} phrases".format(len(unique_results)))
unique_results

Export successful: 6 phrases


['Beef kalbi',
 'Fried chicken',
 'chicken',
 'kimchi pancake',
 'saltier side',
 'side']

After manual cleaning to create final list of food entities (or names) => [beef,cake,chicken,fried chicken,pancake]