In [1]:
import pickle
import re
import spacy
from spacy import displacy
import pandas as pd
nlp = spacy.load("en_core_web_sm")

# Import Reviews as DataFrame

In [2]:
data = pd.read_csv('reviews.csv')

# Init functions to remove and extract hashtags

In [3]:
def removeHashTags(review_str):
    
    '''
    Input: Review in string format.
    Output: String of the review with starting hashtags all removed.
            (hashtags in the middle of reviews will not be removed
            to preserve phrasing)
    
    E.g.
    
    removeHashTags("#Amazing #Delicious #Foodcoma #iHateHashTags The wagyu beef was the #BEST i've every tried")
    
    >> The wagyu beef was the BEST i've ever tried.
    '''
    
    review = review_str.split()
    
    temp = 0
    
    for i in range(len(review)):
        
        text = review[i]
        
        if text[0] == '#' and temp == i:
            review[i] = ' '
            temp += 1
            
        elif text[0] == '#':
            review[i] = review[i].lstrip('#')
    
    return ' '.join(review).lstrip()


###-------------------------------------------###
###-------------------------------------------###
###-------------------------------------------###


def getHashTags(review_str):
    
    '''
    Input: review text in string format.
    Output: List of HashTags
    
    e.g. getHashTags('The #amazing wagyu #beef was so #juicy')
    >>> [#amazing, #beef, #juicy]
    '''
    
    return re.findall('#\w+',review_str)

# Get Nouns

A 2-stage process is used to extract all nouns and adjective phrases.

<ol>
    <li>We __extract all unigram nouns__ and append it into the "results" list.</li>
<li>Next, we __extract bigram and trigram units__ that have predefined POS tag sequences, and continue appending the extracted results to the "results" list. The current implementation below uses 7 rules.</li>
 </ol>

In [4]:
# Store all results in this list
results = []

for i in range(data.shape[0]):
    rev = data['Unnamed: 0'].iloc[i]
    rev = rev.replace('#','')
    doc = nlp(rev)

    for w in doc:
        if w.pos_ == 'NOUN':
            results.append(str(w))

# POS and Dependency Extraction

In [5]:
# results = []

# For each review
for n in range(data.shape[0]):
    
    rev = data['Review'].iloc[n]
        
    try:
        rev = rev.replace('\n', ' ')
    except:
        continue

    rev = removeHashTags(rev)
    doc = nlp(rev)

    text_list = []
    pos_list =[]
    dep_list=[]
    
    # For each word in the review
    for w in doc:
        
        extract = None
        
        text_list.append(str(w.text))
        pos_list.append(w.pos_)
        dep_list.append(w.dep_)
        
        if len(text_list) > 3:
            # Rule 1
            if pos_list[-1] in ['NOUN', 'PROPN'] and pos_list[-2] in ['NOUN', 'PROPN'] and pos_list[-3] == 'ADJ':
                extract = text_list[-3] + ' ' +  text_list[-2] + ' ' + text_list[-1]

            # Rule 2
            elif pos_list[-1] in ['NOUN', 'PROPN'] and pos_list[-2] in ['NOUN', 'PROPN'] and pos_list[-3] == 'VERB':
                extract = text_list[-3] + ' ' + text_list[-2] + ' ' + text_list[-1]

            # Rule 3
            elif pos_list[-1] in ['NOUN', 'PROPN'] and pos_list[-2] in ['NOUN', 'PROPN'] and pos_list[-3] == 'NOUN':
                extract = text_list[-3] + ' ' + text_list[-2] + ' ' + text_list[-1]

            # Rule 4
            elif pos_list[-1] in ['NOUN', 'PROPN'] and pos_list[-2] in ['NOUN', 'PROPN']:
                extract = text_list[-2] + ' ' + text_list[-1]

            # Rule 5
            elif pos_list[-1] in ['NOUN', 'PROPN'] and pos_list[-2] == 'VERB':
                extract = text_list[-2] + ' ' + text_list[-1]

            # Rule 6
            elif pos_list[-1] in ['NOUN', 'PROPN'] and pos_list[-2] == 'ADJ':
                extract = text_list[-2] + ' ' + text_list[-1]
            
            # Rule 7
            if pos_list[-1] in ['NOUN', 'PROPN'] and dep_list[-1]=='dobj':
                extract = text_list[-2] + ' ' + text_list[-1]
         
            if extract != None:
         
                results.append(extract)
    

# Format and export results as txt file

Returns all unique elements in the "results" list, and sorts all elements from A-Z

In [7]:
unique_results = sorted(list(set(results)))

# Export
with open('master_wordlist.txt', 'w', encoding='utf-8') as f:
    for text in unique_results:
        f.write(text +'\n')

print("Export successful: {} phrases".format(len(unique_results)))

Export successful: 9631 phrases
