In [0]:
!pip install scattertext

!pip install -U spacy
!python -m spacy download en_core_web_sm

Collecting scattertext
[?25l  Downloading https://files.pythonhosted.org/packages/e0/6a/d2b2af772934a946cbebb47cb068b4631ed437a264d9cfc7ef5761f95e00/scattertext-0.0.2.56-py3-none-any.whl (6.8MB)
[K     |████████████████████████████████| 6.8MB 4.1MB/s 
[?25hCollecting mock
  Downloading https://files.pythonhosted.org/packages/05/d2/f94e68be6b17f46d2c353564da56e6fb89ef09faeeff3313a046cb810ca9/mock-3.0.5-py2.py3-none-any.whl
Installing collected packages: mock, scattertext
Successfully installed mock-3.0.5 scattertext-0.0.2.56
Collecting flask_cors
  Downloading https://files.pythonhosted.org/packages/78/38/e68b11daa5d613e3a91e4bf3da76c94ac9ee0d9cd515af9c1ab80d36f709/Flask_Cors-3.0.8-py2.py3-none-any.whl
Installing collected packages: flask-cors
Successfully installed flask-cors-3.0.8
Collecting python-decouple
  Downloading https://files.pythonhosted.org/packages/c7/82/dd20cdca396f58be86c6e710a3958f4a34ca98c5dd3989ee978b6cb9f97e/python-decouple-3.3.tar.gz
Building wheels for collected

In [29]:
from flask import Flask, render_template, request, jsonify
import json
import warnings
import pandas as pd
import spacy
import scattertext as st
from lxml import html
from requests import Session
from concurrent.futures import ThreadPoolExecutor as Executor
import requests
# from flask_cors import CORS
# from decouple import config
import re
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
pd.set_option('display.max_colwidth', 1000)
nlp = spacy.load("en_core_web_sm")#if you run into problems here, 'Restart Runtime' and run all, it might fix things.
base_url = "https://www.yelp.com/biz/" 
api_url = "/review_feed?sort_by=date_desc&start="
bid = 'Rc1lxc5lSKJYd162JHNMfQ'

class Scraper():
    def __init__(self):
        self.data = pd.DataFrame()

    def get_data(self, n, bid=bid):
        with Session() as s:
            with s.get(base_url+bid+api_url+str(n*20)) as resp: #makes an http get request to given url and returns response as json
                r = json.loads(resp.content) #converts json response into a dictionary
                _html = html.fromstring(r['review_list']) #loads from dictionary

                dates = _html.xpath("//div[@class='review-content']/descendant::span[@class='rating-qualifier']/text()")
                reviews = [el.text for el in _html.xpath("//div[@class='review-content']/p")]
                ratings = _html.xpath("//div[@class='review-content']/descendant::div[@class='biz-rating__stars']/div/@title")

                df = pd.DataFrame([dates, reviews, ratings]).T

                self.data = pd.concat([self.data,df])

    def scrape(self): #makes it faster
        # multithreaded looping
        with Executor(max_workers=40) as e:
            list(e.map(self.get_data, range(10)))

s = Scraper()
s.scrape()
df = s.data


def customtokensize(text):
    return re.findall("[\w']+", str(text))

df['tokenized_text'] = df[1].apply(customtokensize)
# stopwords = ['and','was','were','had','check-in','=','= =','u','want', 'u want', 'cuz','him',"i've",'on', 'her','told','ins', '1 check','I', 'i"m', 'i', ' ', 'it', "it's", 'it.','they', 'the', 'this','its', 'l','they','this',"don't",'the ', ' the', 'it', 'i"ve', 'i"m', '!', '1','2','3','4', '5','6','7','8','9','0','/','.',',']

stopwords = [',','"','!','-','&','?']

def filter_stopwords(text):
  nonstopwords = []
  for i in text:
    if i not in stopwords:
      nonstopwords.append(i)
  return nonstopwords
df['tokenized_text'] = df['tokenized_text'].apply(filter_stopwords)
df['parts_of_speech_reference'] = df['tokenized_text'].apply(filter_stopwords)
df['parts_of_speech_reference'] = df['parts_of_speech_reference'].str.join(' ')
# df['parts_of_speech_reference'] = df['tokenized_text'].str.join(' ')
df.head(1)

Unnamed: 0,0,1,2,tokenized_text,parts_of_speech_reference
0,\n 10/2/2019\n,Came here with my mom before a broadway show for her bday! The table was approx the width of my big toe but it was super cute and the food was amazing!! Def make a reservation there was a line out the door within the hr of it opening for dinner service!!,5.0 star rating,"[Came, here, with, my, mom, before, a, broadway, show, for, her, bday, The, table, was, approx, the, width, of, my, big, toe, but, it, was, super, cute, and, the, food, was, amazing, Def, make, a, reservation, there, was, a, line, out, the, door, within, the, hr, of, it, opening, for, dinner, service]",Came here with my mom before a broadway show for her bday The table was approx the width of my big toe but it was super cute and the food was amazing Def make a reservation there was a line out the door within the hr of it opening for dinner service


In [30]:
def find_nounchunks(x):
  """noun chunks greater than 2 without stopwords"""
  noun_list = []
  doc = nlp(str(x))
  for chunk in doc.noun_chunks:
    if len(chunk) > 2:
      noun_list.append(chunk)
  return noun_list

df['nounchunk_list'] = df[1].apply(find_nounchunks)
df[[1,'nounchunk_list']].head(1)

Unnamed: 0,1,nounchunk_list
0,Came here with my mom before a broadway show for her bday! The table was approx the width of my big toe but it was super cute and the food was amazing!! Def make a reservation there was a line out the door within the hr of it opening for dinner service!!,"[(a, broadway, show), (my, big, toe)]"


In [31]:
def find_words_after_and_before_THE_A_AN(x):
  """find_words_after_and_before_THE"""
  total_lists = []
  doc = nlp(str(x))
  for token in range(len(doc)):
    try:
      word_list1 = []
      if (doc[token].pos_ == 'PRON') or ((doc[token-1].pos_ == 'VERB') and (doc[token].text == 'the')) or ((doc[token-1].pos_ == 'ADV') and (doc[token].text == 'the')) or (doc[token].text == 'The') or (doc[token].text == 'a') or (doc[token].text == 'A') or (doc[token].text == 'an') or (doc[token].text == 'An'):
        word_list1.append(doc[token])
        word_list1.append(doc[token+1])
        word_list1.append(doc[token+2])
        word_list1.append(doc[token+3])
        word_list1.append(doc[token+4])
        word_list1.append(doc[token+5])
        word_list1.append(doc[token+6])
        word_list1.append(doc[token+7])
        word_list1.append(doc[token+8])
        word_list1.append(doc[token+9])
        word_list1.append(doc[token+10])
        word_list1.append(doc[token+11])
      if len(word_list1) != 0:
        total_lists.append(word_list1)
    except IndexError or TypeError as e:
      for token in range(len(doc)):
        try:
          word_list2 = []
          if (doc[token].pos_ == 'PRON') or ((doc[token].pos_ == 'the') and (doc[token].pos_ == 'PRON')) or (doc[token].text == 'The') or (doc[token].text == 'a') or (doc[token].text == 'A') or (doc[token].text == 'an') or (doc[token].text == 'An'):
            word_list2.append(doc[token])
            word_list2.append(doc[token+1])
            word_list2.append(doc[token+2])
            word_list2.append(doc[token+3])
            word_list2.append(doc[token+4])
            word_list2.append(doc[token+5])
            word_list2.append(doc[token+6])
            word_list2.append(doc[token+7])
          if (len(word_list2) != 0) and (False for i in word_list2 for i in word_list1) and (False for i in word_list2 for i in total_lists):
            total_lists.append(word_list2) 
        except IndexError or TypeError as e:
          for token in range(len(doc)):
            try:
              word_list3 = []
              if ((doc[token-1].pos_ == 'VERB') and (doc[token].text == 'the')) or ((doc[token-1].pos_ == 'ADV') and (doc[token].text == 'the')):
                word_list3.append(doc[token-1])
                word_list3.append(doc[token])
                word_list3.append(doc[token+1])
                word_list3.append(doc[token+2])
              if (len(word_list3) != 0 ) and (False for i in word_list3 for i in word_list2) and (False for i in word_list3 for i in total_lists):
                total_lists.append(word_list3)
            except IndexError or TypeError as e:
              for token in range(len(doc)):
                try:
                  word_list4 = []
                  if (doc[token].text == 'the') or (doc[token].text == 'a') or (doc[token].text == 'an'):
                    word_list4.append(doc[token])
                    word_list4.append(doc[token+1])
                    word_list4.append(doc[token+2])
                    word_list4.append(doc[token+3])
                  if (len(word_list4) != 0 ) and (False for i in word_list4 for i in word_list3) and (False for i in word_list4 for i in total_lists):
                    total_lists.append(word_list4)
                except IndexError or TypeError as e:
                  pass
  return total_lists

df['words_around_THE_A_AN'] = df['parts_of_speech_reference'].apply(find_words_after_and_before_THE_A_AN)
df[[1,'words_around_THE_A_AN']].head(1)

Unnamed: 0,1,words_around_THE_A_AN
0,Came here with my mom before a broadway show for her bday! The table was approx the width of my big toe but it was super cute and the food was amazing!! Def make a reservation there was a line out the door within the hr of it opening for dinner service!!,"[[a, broadway, show, for, her, bday, The, table, was, approx, the, width], [The, table, was, approx, the, width, of, my, big, toe, but, it], [the, width, of, my, big, toe, but, it, was, super, cute, and], [it, was, super, cute, and, the, food, was, amazing, Def, make, a], [a, reservation, there, was, a, line, out, the, door, within, the, hr], [there, was, a, line, out, the, door, within, the, hr, of, it], [a, line, out, the, door, within, the, hr, of, it, opening, for], [a, broadway, show, for, her, bday, The, table], [The, table, was, approx, the, width, of, my], [it, was, super, cute, and, the, food, was], [a, reservation, there, was, a, line, out, the], [there, was, a, line, out, the, door, within], [a, line, out, the, door, within, the, hr], [approx, the, width, of]]"


In [32]:
def find_phrases(x):
  long_phrase_list = []
  doc = nlp(str(x))
  for token in range(len(doc)):
    sub_list = []
    try:
      if (doc[token-1].text == 'on') and (doc[token].text == 'a') or (doc[token].text == 'for') or (doc[token].text == 'food is') or (doc[token].lemma_ == 'order') or (doc[token].text == 'but') or (doc[token].text == 'not') or (doc[token].pos_ == 'VERB' and doc[token+1].pos_ == 'NOUN') or (doc[token-1].pos_ == 'NOUN' and doc[token].pos_ == 'VERB') or (doc[token].lemma_ == 'service') or (doc[token].lemma_ == 'wait') or (doc[token].pos_ == 'ADJ'and doc[token+1].pos_ =='NOUN') or (doc[token].pos_ == 'VERB'and doc[token+1].pos_ =='NOUN') or (doc[token].pos_ == 'ADJ'and doc[token+1].pos_ == 'ADJ'and doc[token+2].pos_ =='NOUN')or (doc[token].pos_ == 'ADJ'and doc[token+1].pos_ =='VERB'and doc[token+2].pos_ =='NOUN') or (doc[token].pos_ == 'ADJ'and doc[token+1].pos_ == 'NOUN'and doc[token+2].pos_ =='NOUN'):
        if doc[token-1] not in sub_list and doc[token] not in sub_list and doc[token+1] not in sub_list and doc[token+2] not in sub_list and doc[token+3] not in sub_list:
          sub_list.append(doc[token-1])
          sub_list.append(doc[token])
          sub_list.append(doc[token+1])
          sub_list.append(doc[token+2])
          sub_list.append(doc[token+3])
    except IndexError as e:
      try:
        if (doc[token-1].text == 'on') and (doc[token].text == 'a') or (doc[token].text == 'for') or (doc[token].text == 'food is') or (doc[token].lemma_ == 'order') or (doc[token].text == 'but') or (doc[token].text == 'not') or (doc[token].pos_ == 'VERB' and doc[token+1].pos_ == 'NOUN') or (doc[token-1].pos_ == 'NOUN' and doc[token].pos_ == 'VERB') or (doc[token].lemma_ == 'service') or (doc[token].lemma_ == 'wait') or (doc[token].pos_ == 'ADJ'and doc[token+1].pos_ =='NOUN') or (doc[token].pos_ == 'VERB'and doc[token+1].pos_ =='NOUN') or (doc[token].pos_ == 'ADJ'and doc[token+1].pos_ == 'ADJ'and doc[token+2].pos_ =='NOUN')or (doc[token].pos_ == 'ADJ'and doc[token+1].pos_ =='VERB'and doc[token+2].pos_ =='NOUN') or (doc[token].pos_ == 'ADJ'and doc[token+1].pos_ == 'NOUN'and doc[token+2].pos_ =='NOUN'):
          if doc[token-1] not in sub_list and doc[token] not in sub_list and doc[token+1] not in sub_list and doc[token+2] not in sub_list:
            sub_list.append(doc[token-1])
            sub_list.append(doc[token])
            sub_list.append(doc[token+1])
            sub_list.append(doc[token+2])
      except IndexError as e:
        try:
          if (doc[token-1].text == 'on') and (doc[token].text == 'a') or (doc[token].text == 'for') or (doc[token].text == 'food is') or (doc[token].lemma_ == 'order') or (doc[token].text == 'but') or (doc[token].text == 'not') or (doc[token].pos_ == 'VERB' and doc[token+1].pos_ == 'NOUN') or (doc[token-1].pos_ == 'NOUN' and doc[token].pos_ == 'VERB') or (doc[token].lemma_ == 'service') or (doc[token].lemma_ == 'wait') or (doc[token].pos_ == 'ADJ'and doc[token+1].pos_ =='NOUN') or (doc[token].pos_ == 'VERB'and doc[token+1].pos_ =='NOUN') or (doc[token].pos_ == 'ADJ'and doc[token+1].pos_ == 'ADJ'and doc[token+2].pos_ =='NOUN')or (doc[token].pos_ == 'ADJ'and doc[token+1].pos_ =='VERB'and doc[token+2].pos_ =='NOUN') or (doc[token].pos_ == 'ADJ'and doc[token+1].pos_ == 'NOUN'and doc[token+2].pos_ =='NOUN'):
            if doc[token-1] not in sub_list and doc[token] not in sub_list and doc[token+1] not in sub_list:
              sub_list.append(doc[token-1])
              sub_list.append(doc[token])
              sub_list.append(doc[token+1])
        except IndexError as e:
          try:
            if (doc[token].text == 'on') or (doc[token].text == 'for') or (doc[token].text == 'food is') or (doc[token].lemma_ == 'order') or (doc[token].text == 'but') or (doc[token].text == 'not') or (doc[token].pos_ == 'VERB' and doc[token+1].pos_ == 'NOUN') or (doc[token-1].pos_ == 'NOUN' and doc[token].pos_ == 'VERB') or (doc[token].lemma_ == 'service') or (doc[token].lemma_ == 'wait') or (doc[token].pos_ == 'ADJ'and doc[token+1].pos_ =='NOUN') or (doc[token].pos_ == 'VERB'and doc[token+1].pos_ =='NOUN') or (doc[token].pos_ == 'ADJ'and doc[token+1].pos_ == 'ADJ'and doc[token+2].pos_ =='NOUN')or (doc[token].pos_ == 'ADJ'and doc[token+1].pos_ =='VERB'and doc[token+2].pos_ =='NOUN') or (doc[token].pos_ == 'ADJ'and doc[token+1].pos_ == 'NOUN'and doc[token+2].pos_ =='NOUN'):
              if doc[token] not in sub_list and doc[token+1] not in sub_list:
                sub_list.append(doc[token])
                sub_list.append(doc[token+1])
          except IndexError as e:
            try:
              #captures words at the end of reviews
              if (doc[token].text == 'food is') or (doc[token].lemma_ == 'order') or (doc[token].text == 'but') or (doc[token].text == 'not') or (doc[token-1].pos_ == 'NOUN' and doc[token].pos_ == 'VERB') or (doc[token].lemma_ == 'service') or (doc[token].lemma_ == 'wait') or (doc[token-1].pos_ == 'ADJ'and doc[token].pos_ =='NOUN') or (doc[token-1].pos_ == 'VERB'and doc[token].pos_ =='NOUN')or (doc[token-1].pos_ == 'NOUN'and doc[token].pos_ =='NOUN'):
                if doc[token] not in sub_list and doc[token+1] not in sub_list:
                  sub_list.append(doc[token-1])
                  sub_list.append(doc[token])
            except IndexError as e:
              try:
                #captures words from the start of reviews
                if (doc[token].text == 'food is') or (doc[token].lemma_ == 'order') or (doc[token].text == 'but') or (doc[token].text == 'not') or (doc[token].pos_ == 'NOUN' and doc[token+1].pos_ == 'VERB') or (doc[token].lemma_ == 'service') or (doc[token].lemma_ == 'wait') or (doc[token].pos_ == 'ADJ'and doc[token+1].pos_ =='NOUN') or (doc[token].pos_ == 'VERB'and doc[token+1].pos_ =='NOUN')or (doc[token].pos_ == 'NOUN'and doc[token+1].pos_ =='NOUN'):
                  if doc[token] not in sub_list and doc[token+1] not in sub_list:
                    sub_list.append(doc[token])
                    sub_list.append(doc[token+1])
              except IndexError as e:
                pass
    if (len(sub_list) != 0) and (sub_list not in long_phrase_list):
      long_phrase_list.append(sub_list)
  return long_phrase_list


df['AdjNouns_NounNouns_Service_Food_For'] = df[1].apply(find_phrases)
df[[1,'nounchunk_list','AdjNouns_NounNouns_Service_Food_For','words_around_THE_A_AN']].sample(1)

Unnamed: 0,1,nounchunk_list,AdjNouns_NounNouns_Service_Food_For,words_around_THE_A_AN
13,"I went here one night to celebrate my birthday, in January. The place has a classic look with red chairs. It's spacious and intimate with dimly lit light areas. There's also an outside area that is heated so you can do that even in the colder months confortab. Staff was warm and welcoming. Loved the food as well. Had the Shepard's pie and a ceviche dish, both awesome.","[(a, classic, look), (dimly, lit, light, areas), (an, outside, area), (the, colder, months, confortab), (the, Shepard, 's, pie), (a, ceviche, dish)]","[[a, classic, look, with, red], [with, red, chairs, ., It], [lit, light, areas, ., There], [an, outside, area, that, is], [the, colder, months, confortab, .]]","[[I, went, here, one, night, to, celebrate, my, birthday, in, January, The], [The, place, has, a, classic, look, with, red, chairs, It, 's, spacious], [a, classic, look, with, red, chairs, It, 's, spacious, and, intimate, with], [It, 's, spacious, and, intimate, with, dimly, lit, light, areas, There, 's], [There, 's, also, an, outside, area, that, is, heated, so, you, can], [an, outside, area, that, is, heated, so, you, can, do, that, even], [you, can, do, that, even, in, the, colder, months, confortab, Staff, was], [the, food, as, well, Had, the, Shepard, 's, pie, and, a, ceviche], [I, went, here, one, night, to, celebrate, my], [The, place, has, a, classic, look, with, red], [a, classic, look, with, red, chairs, It, 's], [It, 's, spacious, and, intimate, with, dimly, lit], [There, 's, also, an, outside, area, that, is], [an, outside, area, that, is, heated, so, you], [you, can, do, that, even, in, the, colder], [Loved, the, food, as]]"


In [33]:
def find_word_segments(x):
  long_phrase_list = []
  doc = nlp(str(x))
  try:
    for token in range(len(doc)):
      sub_list = []
      if (token < (int(len(doc))-5)) and ((doc[token].pos_ == 'ADJ'and doc[token+1].pos_ == 'NOUN'and doc[token+2].pos_ =='NOUN') or (doc[token].text == 'food') or (doc[token].lemma_ == 'order') or (doc[token].text == 'but') or (doc[token].text == 'not') or (doc[token].pos_ == 'VERB' and doc[token+1].pos_ == 'NOUN') or (doc[token-1].pos_ == 'NOUN' and doc[token].pos_ == 'VERB') or (doc[token].lemma_ == 'service') or (doc[token].lemma_ == 'wait') or (doc[token].pos_ == 'ADJ'and doc[token+1].pos_ =='NOUN') or (doc[token].pos_ == 'VERB'and doc[token+1].pos_ =='NOUN') or (doc[token].pos_ == 'ADJ'and doc[token+1].pos_ == 'ADJ'and doc[token+2].pos_ =='NOUN')or (doc[token].pos_ == 'ADJ'and doc[token+1].pos_ =='VERB'and doc[token+2].pos_ =='NOUN')):
        sub_list.append(doc[token])
        sub_list.append(doc[token+1])
        sub_list.append(doc[token+2])
        sub_list.append(doc[token+3])
        sub_list.append(doc[token+4])
        sub_list.append(doc[token+5])
        # sub_list.append(doc[token+6])
      if len(sub_list) != 0:
        long_phrase_list.append(sub_list)
  except IndexError as e:
    pass
  try:
    for token in range(len(doc)):
      sub_list = []
      if (token < (int(len(doc))-5)) and ((doc[token].pos_ == 'PRON') or ((doc[token-1].pos_ == 'VERB') and (doc[token].text == 'the')) or ((doc[token-1].pos_ == 'ADV') and (doc[token].text == 'the')) or (doc[token].text == 'The') or (doc[token].text == 'a') or (doc[token].text == 'A') or (doc[token].text == 'an') or (doc[token].text == 'An')):
        sub_list.append(doc[token])
        sub_list.append(doc[token+1])
        sub_list.append(doc[token+2])
        sub_list.append(doc[token+3])
        sub_list.append(doc[token+4])
        sub_list.append(doc[token+5])
      if len(sub_list) != 0:
        long_phrase_list.append(sub_list)
  except IndexError as e:
    pass
  return long_phrase_list

df['word_segments'] = df[1].apply(find_word_segments)
df[[1,'word_segments']].head(1)

Unnamed: 0,1,word_segments
0,Came here with my mom before a broadway show for her bday! The table was approx the width of my big toe but it was super cute and the food was amazing!! Def make a reservation there was a line out the door within the hr of it opening for dinner service!!,"[[big, toe, but, it, was, super], [but, it, was, super, cute, and], [food, was, amazing, !, !, Def], [a, broadway, show, for, her, bday], [The, table, was, approx, the, width], [the, width, of, my, big, toe], [it, was, super, cute, and, the], [a, reservation, there, was, a, line], [there, was, a, line, out, the], [a, line, out, the, door, within], [it, opening, for, dinner, service, !]]"


In [143]:
df['word_segments_unpacked'] = df['word_segments'].apply(lambda l: [item for sublist in l for item in sublist])
df['word_segments_unpacked'] = df['word_segments_unpacked'].astype(str)
df['word_segments_unpacked'] = df['word_segments_unpacked'].apply(lambda x: ''.join([str(i) for i in x]))
df.head(1)

Unnamed: 0,0,1,2,tokenized_text,parts_of_speech_reference,nounchunk_list,words_around_THE_A_AN,AdjNouns_NounNouns_Service_Food_For,word_segments,word_segments_unpacked
0,\n 10/2/2019\n,Came here with my mom before a broadway show for her bday! The table was approx the width of my big toe but it was super cute and the food was amazing!! Def make a reservation there was a line out the door within the hr of it opening for dinner service!!,5.0 star rating,"[Came, here, with, my, mom, before, a, broadway, show, for, her, bday, The, table, was, approx, the, width, of, my, big, toe, but, it, was, super, cute, and, the, food, was, amazing, Def, make, a, reservation, there, was, a, line, out, the, door, within, the, hr, of, it, opening, for, dinner, service]",Came here with my mom before a broadway show for her bday The table was approx the width of my big toe but it was super cute and the food was amazing Def make a reservation there was a line out the door within the hr of it opening for dinner service,"[(a, broadway, show), (my, big, toe)]","[[a, broadway, show, for, her, bday, The, table, was, approx, the, width], [The, table, was, approx, the, width, of, my, big, toe, but, it], [the, width, of, my, big, toe, but, it, was, super, cute, and], [it, was, super, cute, and, the, food, was, amazing, Def, make, a], [a, reservation, there, was, a, line, out, the, door, within, the, hr], [there, was, a, line, out, the, door, within, the, hr, of, it], [a, line, out, the, door, within, the, hr, of, it, opening, for], [a, broadway, show, for, her, bday, The, table], [The, table, was, approx, the, width, of, my], [it, was, super, cute, and, the, food, was], [a, reservation, there, was, a, line, out, the], [there, was, a, line, out, the, door, within], [a, line, out, the, door, within, the, hr], [approx, the, width, of]]","[[show, for, her, bday, !], [my, big, toe, but, it], [toe, but, it, was, super], [opening, for, dinner, service, !], [dinner, service, !, !]]","[[big, toe, but, it, was, super], [but, it, was, super, cute, and], [food, was, amazing, !, !, Def], [a, broadway, show, for, her, bday], [The, table, was, approx, the, width], [the, width, of, my, big, toe], [it, was, super, cute, and, the], [a, reservation, there, was, a, line], [there, was, a, line, out, the], [a, line, out, the, door, within], [it, opening, for, dinner, service, !]]","[big, toe, but, it, was, super, but, it, was, super, cute, and, food, was, amazing, !, !, Def, a, broadway, show, for, her, bday, The, table, was, approx, the, width, the, width, of, my, big, toe, it, was, super, cute, and, the, a, reservation, there, was, a, line, there, was, a, line, out, the, a, line, out, the, door, within, it, opening, for, dinner, service, !]"


In [175]:
phrase_count = df[['word_segments_unpacked', 2]]
import numpy as np

s= phrase_count.apply(lambda x: pd.Series(x['word_segments_unpacked']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'word_segments_unpacked'

phrase_count = phrase_count.drop('word_segments_unpacked', axis=1).join(s)
phrase_count.head(1)

Unnamed: 0,2,word_segments_unpacked
0,5.0 star rating,"[big, toe, but, it, was, super, but, it, was, super, cute, and, food, was, amazing, !, !, Def, a, broadway, show, for, her, bday, The, table, was, approx, the, width, the, width, of, my, big, toe, it, was, super, cute, and, the, a, reservation, there, was, a, line, there, was, a, line, out, the, a, line, out, the, door, within, it, opening, for, dinner, service, !]"


In [243]:
phrase_count = pd.DataFrame(df['word_segments_unpacked'].str.split(',').tolist(), index=df[2]).stack()
phrase_count = phrase_count.reset_index()[[0, 2]] # var1 variable is currently labeled 0
phrase_count.columns = ['word_segments_unpacked', 'ratings'] # renaming var1
phrase_count = phrase_count.reset_index(drop=False)

# x = ''
# replaceDict = {'to' : '', '!]' : '', '-' : '' }    
# stopwords = ['to','We','had','.',"'re",'it','or','he','she','we','us','how','went','no',"'d",'"','of','has','a','by','bit','thing','NaN','place',' ','on','so','ok','i','and','they','None','was','you',"'ve",'did',"'d",'be','and','but','is','as','a','b','c','d','e','f','g','u','it','!','&','you','has','-','None',':','and','had','was','him','so','my',' ','did','would','on','her','him','it','is','by','bit','thing','NaN','place','[',']','while','check-in','=','= =','want', 'good','husband', 'want','love','something','your','they','your','cuz','him',"i've",'her','told', '1 check', 'i"m', "it's",'they', ' the','the ',' ', 'this','its','they','this',"don't",'the',',', 'it', 'i"ve', 'i"m', '!', '1','2','3','4', '5','6','7','8','9','0','/','.']
# def filter_stopwords(text):
#   for i in str(text):
#     if i not in stopwords:
#       return str(text)

# phrase_count['word_segments_unpacked'] = phrase_count['word_segments_unpacked'].map(filter_stopwords)
phrase_count['word_segments_unpacked'] = phrase_count['word_segments_unpacked'].str.replace('[','')
phrase_count['word_segments_unpacked'] = phrase_count['word_segments_unpacked'].str.replace(']','')
phrase_count['word_segments_unpacked'] = phrase_count['word_segments_unpacked'].str.replace('-','')
phrase_count['word_segments_unpacked'] = phrase_count['word_segments_unpacked'].str.replace('.','')
phrase_count['word_segments_unpacked'] = phrase_count['word_segments_unpacked'].str.replace(' ','')
phrase_count['word_segments_unpacked'] = phrase_count['word_segments_unpacked'].str.lower()
# phrase_count = phrase_count[phrase_count['word_segments_unpacked']!=i for i in stopwords]

stopwords = ['from','first','here','only','put','where','got','sure','definitely','food','yet','our','go','since','really','very','two',"n't",'with','if',"'s",'which','came','all','me','(',')','makes','make','were','immediately','get','been','ahead','also','that','one','have','see','what','to','we','had','.',"'re",'it','or','he','she','we','us','how','went','no','"','of','has','by','bit','thing','place','so','ok','and','they','none','was','you',"'ve",'did','be','and','but','is','as','&','you','has','-',':','and','had','was','him','so','my','did','would','her','him','it','is','by','bit','thing','place','[',']','while','check-in','=','= =','want', 'good','husband', 'want','love','something','your','they','your','cuz','him',"i've",'her','told', 'check', 'i"m', "it's",'they', 'this','its','they','this',"don't",'the',',', 'it', 'i"ve', 'i"m', '!', '1','2','3','4', '5','6','7','8','9','0','/','.']
def filter_stopwords(text):
  for i in str(text):
    if i not in stopwords:
      return str(text)

phrase_count = phrase_count[~phrase_count['word_segments_unpacked'].isin(stopwords)]#if item in stopwords list partially matches, delete

#full matches
phrase_count = phrase_count[phrase_count['word_segments_unpacked']!='i']
phrase_count = phrase_count[phrase_count['word_segments_unpacked']!='a']
phrase_count = phrase_count[phrase_count['word_segments_unpacked']!='an']
phrase_count = phrase_count[phrase_count['word_segments_unpacked']!='am']
phrase_count = phrase_count[phrase_count['word_segments_unpacked']!='at']
phrase_count = phrase_count[phrase_count['word_segments_unpacked']!='are']
phrase_count = phrase_count[phrase_count['word_segments_unpacked']!='in']
phrase_count = phrase_count[phrase_count['word_segments_unpacked']!='on']
phrase_count = phrase_count[phrase_count['word_segments_unpacked']!='for']
phrase_count = phrase_count[phrase_count['word_segments_unpacked']!='']
phrase_count.sample(1)

Unnamed: 0,index,word_segments_unpacked,ratings
10376,10376,other,4.0 star rating


In [244]:
#test values
phrase_count.iloc[4085]['word_segments_unpacked']

'24'

In [0]:
# phrase_count.tail(188)

In [250]:
phrase_count_pivot = pd.pivot_table(phrase_count, index='word_segments_unpacked', columns='ratings', aggfunc='count', fill_value=0)
phrase_count_pivot
phrase_count_pivot.sort_values(by=('index','1.0 star rating'), ascending=False)

Unnamed: 0_level_0,index,index,index,index,index
ratings,1.0 star rating,2.0 star rating,3.0 star rating,4.0 star rating,5.0 star rating
word_segments_unpacked,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
service,14,13,14,31,49
order,13,4,1,7,9
wait,10,2,2,5,7
not,8,24,26,22,18
,7,12,12,15,26
...,...,...,...,...,...
forward,0,0,0,0,1
fortunately,0,0,0,0,2
forgot,0,0,1,0,0
forever,0,0,2,3,0


In [251]:
phrase_count_pivot.columns = [''.join(col).strip() for col in phrase_count_pivot.columns.values]
phrase_count_pivot = pd.DataFrame(phrase_count_pivot.to_records())
phrase_count_pivot

Unnamed: 0,word_segments_unpacked,index1.0 star rating,index2.0 star rating,index3.0 star rating,index4.0 star rating,index5.0 star rating
0,$,0,1,4,7,1
1,'d,0,0,2,2,3
2,'ll,0,0,0,3,0
3,'m,1,0,2,7,3
4,+,0,0,0,1,0
...,...,...,...,...,...,...
1333,yummy,0,0,0,0,3
1334,zone,0,0,0,1,0
1335,zucchini,0,0,0,1,1
1336,,7,12,12,15,26


In [0]:
def get_rating_scores(x):
  

In [41]:
phrase_count_pivot.sort_values(by=('index','1.0 star rating'), ascending=False)

Unnamed: 0_level_0,index,index,index,index,index
ratings,1.0 star rating,2.0 star rating,3.0 star rating,4.0 star rating,5.0 star rating
word_segments_unpacked,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
was,33,29,58,82,148
and,31,25,50,129,191
it,28,12,35,69,100
to,26,17,36,70,62
we,22,40,31,47,96
...,...,...,...,...,...
happy,0,0,1,2,1
happen,0,0,1,0,0
hang,0,0,0,1,0
hands,0,0,0,1,0


In [42]:
phrase_count_pivot.sort_values(by=('index','5.0 star rating'), ascending=True)

Unnamed: 0_level_0,index,index,index,index,index
ratings,1.0 star rating,2.0 star rating,3.0 star rating,4.0 star rating,5.0 star rating
word_segments_unpacked,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
starters,0,1,0,0,0
serving,0,0,3,2,0
served,5,0,0,0,0
sense,0,0,0,4,0
send,0,0,0,1,0
...,...,...,...,...,...
we,22,40,31,47,96
it,28,12,35,69,100
food,18,19,19,45,117
was,33,29,58,82,148


In [0]:
phrase_count_pivot.sort_values(phrase_count_pivot.columns.tolist()).sort_index(level=1, ascending=False, 
                                                        sort_remaining=False)

Unnamed: 0_level_0,index,index,index,index,index
ratings,1.0 star rating,2.0 star rating,3.0 star rating,4.0 star rating,5.0 star rating
word_segments_unpacked,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
yummy,0,0,0,0,1
yelp,0,0,1,0,0
worst,1,0,0,0,0
wonderful,0,0,0,1,2
wait,0,0,0,2,0
...,...,...,...,...,...
're,0,0,0,3,1
'm,1,0,2,7,3
'll,0,0,0,3,0
'd,0,0,2,2,3


In [0]:
corpus = st.CorpusFromPandas(df, 
                          category_col=2, 
                          text_col='word_segments_unpacked',
                          nlp=nlp).build()

term_freq_df = corpus.get_term_freq_df()
term_freq_df['highratingscore'] = corpus.get_scaled_f_scores('5.0 star rating')

term_freq_df['poorratingscore'] = corpus.get_scaled_f_scores('1.0 star rating')
dh = term_freq_df.sort_values(by= 'highratingscore', ascending = False)
dh = dh[['highratingscore', 'poorratingscore']]
dh = dh.reset_index(drop=False)
dh = dh.rename(columns={'highratingscore':'score'})
dh = dh.drop(columns='poorratingscore')
dh.head()

Unnamed: 0,term,score
0,cocktails,1.0
1,was great,0.99383
2,food is,0.981038
3,by,0.979595
4,loved,0.973645


In [0]:
def extract_adj_noun_verb(x):
  doc = nlp(str(x))
  for token in doc:
    if token.pos_ == 'NOUN':
      return token.text

dh['term']= dh['term'].apply(extract_adj_noun_verb)

stopwords = ['a','by','bit','thing','NaN','place']
def filter_stopwords(text):
  try:
    for i in text:
      if i not in stopwords:
        return text
  except TypeError as e:
    pass

dh['term'] = dh['term'].apply(filter_stopwords)
dh = dh[dh['term'].notnull()]#create a new dataframe with only terms that are nouns
dh = dh.groupby(['term']).agg('min')
dh.tail()

Unnamed: 0_level_0,score
term,Unnamed: 1_level_1
yelp,0.532899
yelpers,0.5
yesterday,0.532899
zone,0.5
zucchini,0.5


In [0]:
dh = dh.reset_index(drop=False)

In [0]:
dh = dh.sort_values(by = 'score', ascending=False)
dh.tail(20)

Unnamed: 0,term,score
126,deal,0.281153
203,fries,0.281153
401,return,0.281153
298,minutes,0.28049
416,scallops,0.262622
355,plates,0.245336
77,check,0.245336
199,food,0.245336
93,cocktail,0.227305
202,friends,0.227305


In [0]:
dh['term'].iloc[-1]

'order'

In [0]:
#find where the worst terms appear in the review word segments and extract that word phrase to give context
worst_term = df[df['word_segments_unpacked'].str.contains(dh['term'].iloc[-1])]
worst_term = worst_term.sort_values(by=2, ascending=True)
worst_term.head(1)

Unnamed: 0,0,1,2,tokenized_text,parts_of_speech_reference,nounchunk_list,words_around_THE_A_AN,AdjNouns_NounNouns_Service_Food_For,word_segments,word_segments_unpacked
1,\n 8/17/2019\n,This is one of the worst restaurants I have been to. Myself and two friends had a reservation for 1145 as we had theater tickets for a 200 show . When I entered the restaurant I was greeted by a woman who had a scowl on her face. Once my friends arrived we were seated in a booth. It took at least ten to fifteen minutes before someone came to take our drink orders our drinks were delivered with minimal taste and minimal alcohol. I ordered a seltzer with pomegranate juice and I got seltzer with maybe a splash of cranberry. Our food arrived forty minutes later after we complained . The manager was less than helpful or apologetic. Our food was lukewarm at best. My eggs were like eating leftovers and too boot there was no compensation or apologies on behalf of the restaurant. Save your money and a headache and check our a classier better run establishment,1.0 star rating,"[This, is, one, of, the, worst, restaurants, I, have, been, to, Myself, and, two, friends, had, a, reservation, for, 1145, as, we, had, theater, tickets, for, a, 200, show, When, I, entered, the, restaurant, I, was, greeted, by, a, woman, who, had, a, scowl, on, her, face, Once, my, friends, arrived, we, were, seated, in, a, booth, It, took, at, least, ten, to, fifteen, minutes, before, someone, came, to, take, our, drink, orders, our, drinks, were, delivered, with, minimal, taste, and, minimal, alcohol, I, ordered, a, seltzer, with, pomegranate, juice, and, I, got, seltzer, with, maybe, a, splash, of, cranberry, ...]",This is one of the worst restaurants I have been to Myself and two friends had a reservation for 1145 as we had theater tickets for a 200 show When I entered the restaurant I was greeted by a woman who had a scowl on her face Once my friends arrived we were seated in a booth It took at least ten to fifteen minutes before someone came to take our drink orders our drinks were delivered with minimal taste and minimal alcohol I ordered a seltzer with pomegranate juice and I got seltzer with maybe a splash of cranberry Our food arrived forty minutes later after we complained The manager was less than helpful or apologetic Our food was lukewarm at best My eggs were like eating leftovers and too boot there was no compensation or apologies on behalf of the restaurant Save your money and a headache and check our a classier better run establishment,"[(the, worst, restaurants), (a, 200, show), (at, least, ten, to, fifteen, minutes), (our, drink, orders), (maybe, a, splash), (our, a, classier, better, run, establishment)]","[[I, have, been, to, Myself, and, two, friends, had, a, reservation, for], [Myself, and, two, friends, had, a, reservation, for, 1145, as, we, had], [a, reservation, for, 1145, as, we, had, theater, tickets, for, a, 200], [we, had, theater, tickets, for, a, 200, show, When, I, entered, the], [a, 200, show, When, I, entered, the, restaurant, I, was, greeted, by], [I, entered, the, restaurant, I, was, greeted, by, a, woman, who, had], [the, restaurant, I, was, greeted, by, a, woman, who, had, a, scowl], [I, was, greeted, by, a, woman, who, had, a, scowl, on, her], [a, woman, who, had, a, scowl, on, her, face, Once, my, friends], [who, had, a, scowl, on, her, face, Once, my, friends, arrived, we], [a, scowl, on, her, face, Once, my, friends, arrived, we, were, seated], [we, were, seated, in, a, booth, It, took, at, least, ten, to], [a, booth, It, took, at, least, ten, to, fifteen, minutes, before, someone], [It, took, at, least, ten, to, fifteen, minutes, before, someone, came, to], [...","[[the, worst, restaurants, I, have], [reservation, for, 1145, as, we], [tickets, for, a, 200, show], [friends, arrived, we, were, seated], [drink, orders, , our, drinks], [with, minimal, taste, and, minimal], [and, minimal, alcohol, ., ], [I, ordered, a, seltzer, with], [with, pomegranate, juice, and, I], [I, got, seltzer, with, maybe], [food, arrived, forty, minutes, later], [like, eating, leftovers, and, too], [classier, better, run, establishment], [better, run, establishment]]","[[worst, restaurants, I, have, been, to], [arrived, we, were, seated, in, a], [orders, , our, drinks, were, delivered], [minimal, taste, and, minimal, alcohol, .], [minimal, alcohol, ., , I, ordered], [ordered, a, seltzer, with, pomegranate, juice], [pomegranate, juice, and, I, got, seltzer], [got, seltzer, with, maybe, a, splash], [food, arrived, forty, minutes, later, after], [arrived, forty, minutes, later, after, we], [food, was, lukewarm, at, best, .], [eating, leftovers, and, too, boot, there], [I, have, been, to, ., ], [Myself, and, two, friends, had, a], [a, reservation, for, 1145, as, we], [we, had, theater, tickets, for, a], [a, 200, show, ., , When], [I, entered, the, restaurant, I, was], [the, restaurant, I, was, greeted, by], [I, was, greeted, by, a, woman], [a, woman, who, had, a, scowl], [who, had, a, scowl, on, her], [a, scowl, on, her, face, .], [we, were, seated, in, a, booth], [a, booth, ., , It, took], [It, took, at, least, ten, to], [someone, came, to, tak...","[worst, restaurants, I, have, been, to, arrived, we, were, seated, in, a, orders, , our, drinks, were, delivered, minimal, taste, and, minimal, alcohol, ., minimal, alcohol, ., , I, ordered, ordered, a, seltzer, with, pomegranate, juice, pomegranate, juice, and, I, got, seltzer, got, seltzer, with, maybe, a, splash, food, arrived, forty, minutes, later, after, arrived, forty, minutes, later, after, we, food, was, lukewarm, at, best, ., eating, leftovers, and, too, boot, there, I, have, been, to, ., , Myself, and, two, friends, had, a, a, reservation, for, 1145, as, we, we, had, theater, tickets, for, a, a, 200, show, ., , When, I, entered, the, restaurant, I, was, the, restaurant, I, was, greeted, by, I, was, greeted, by, a, woman, a, woman, who, had, a, scowl, who, had, a, scowl, on, her, a, scowl, on, her, face, ., we, were, seated, in, a, booth, a, booth, ., , It, took, It, took, at, least, ten, to, someone, came, to, take, our, drink, I, ordered, a, seltzer, with, pomegran..."


In [0]:
word_list_top_bottom = ''
for i in worst_term['word_segments_unpacked'][0]:
  i = i.replace(',','')
  word_list_top_bottom += i
print(word_list_top_bottom[:90])

[service was horrible . We had wait 25 minutes before someone served came in after and we 


In [0]:
for i in list(str(worst_term['word_segments_unpacked'])):
  print(i)

1
 
 
 
 
 
[
w
o
r
s
t
,
 
r
e
s
t
a
u
r
a
n
t
s
,
 
I
,
 
h
a
v
e
,
 
b
e
e
n
,
 
t
o
,
 
a
r
r
i
v
e
d
,
 
w
e
,
 
w
e
r
e
,
 
s
e
a
t
e
d
,
 
i
n
,
 
a
,
 
o
r
d
e
r
s
,
 
 
,
 
o
u
r
,
 
d
r
i
n
k
s
,
 
w
e
r
e
,
 
d
e
l
i
v
e
r
e
d
,
 
m
i
n
i
m
a
l
,
 
t
a
s
t
e
,
 
a
n
d
,
 
m
i
n
i
m
a
l
,
 
a
l
c
o
h
o
l
,
 
.
,
 
m
i
n
i
m
a
l
,
 
a
l
c
o
h
o
l
,
 
.
,
 
 
,
 
I
,
 
o
r
d
e
r
e
d
,
 
o
r
d
e
r
e
d
,
 
a
,
 
s
e
l
t
z
e
r
,
 
w
i
t
h
,
 
p
o
m
e
g
r
a
n
a
t
e
,
 
j
u
i
c
e
,
 
p
o
m
e
g
r
a
n
a
t
e
,
 
j
u
i
c
e
,
 
a
n
d
,
 
I
,
 
g
o
t
,
 
s
e
l
t
z
e
r
,
 
g
o
t
,
 
s
e
l
t
z
e
r
,
 
w
i
t
h
,
 
m
a
y
b
e
,
 
a
,
 
s
p
l
a
s
h
,
 
f
o
o
d
,
 
a
r
r
i
v
e
d
,
 
f
o
r
t
y
,
 
m
i
n
u
t
e
s
,
 
l
a
t
e
r
,
 
a
f
t
e
r
,
 
a
r
r
i
v
e
d
,
 
f
o
r
t
y
,
 
m
i
n
u
t
e
s
,
 
l
a
t
e
r
,
 
a
f
t
e
r
,
 
w
e
,
 
f
o
o
d
,
 
w
a
s
,
 
l
u
k
e
w
a
r
m
,
 
a
t
,
 
b
e
s
t
,
 
.
,
 
e
a
t
i
n
g
,
 
l
e
f
t
o
v
e
r
s
,
 
a
n
d
,
 
t
o
o
,
 
b
o
o
t
,
 
t
h
e
r
e
,
 
I
,
 
h
a
v
e
,
 
b


In [0]:
for i in worst_term['word_segments'][0]:
  for x in i:
    if 'ordered' in list(str(x)):
      print(x)

In [0]:
dh['term'].iloc[-1]

'order'

In [0]:
for i in worst_term['word_segments'][0]:
  if 'service' in list(str(i)):
    print(list(i))

In [0]:
# dh['term'].iloc[-1] in worst_term['word_segments']
# for i in worst_term[1].iloc[1]:
#   if dh['term'].iloc[-1] in worst_term[1]:
#     print(i)

for i in worst_term['word_segments'][0]:
  for list_of_words in i:
    for y in list_of_words:
      if str(y) == dh['term'].iloc[-1]:
        print(list_of_words)

TypeError: ignored