In [265]:
# Import required packages
import pandas as pd
import numpy as np
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
en = spacy.load('en_core_web_sm')
en.add_pipe('spacytextblob')
import deplacy
import stanza
stanza.download('en')
nlp = stanza.Pipeline(lang='en', processors='tokenize, sentiment')
from statistics import mean
import xml.etree.ElementTree as ET
from scipy import spatial

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 13.5MB/s]                    
2022-01-07 19:19:19 INFO: Downloading default packages for language: en (English)...
2022-01-07 19:19:20 INFO: File exists: C:\Users\vibkr\stanza_resources\en\default.zip.
2022-01-07 19:19:24 INFO: Finished downloading models and saved to C:\Users\vibkr\stanza_resources.
2022-01-07 19:19:24 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| sentiment | sstplus  |

2022-01-07 19:19:24 INFO: Use device: cpu
2022-01-07 19:19:24 INFO: Loading: tokenize
2022-01-07 19:19:24 INFO: Loading: sentiment
2022-01-07 19:19:24 INFO: Done loading processors!


In [266]:
# Load the scraped reviews and discard every row that has a missing value
df = pd.read_csv("scraped_output.csv").dropna()
df.head(3)

Unnamed: 0.1,Unnamed: 0,Review,Date,Stars,Restaurant
0,0,"(Translated by Google) Good hamburgers, delici...",5 days ago,4 stars,TFDB
3,3,"Burger was ok, not fantastic. Place was extre...",2 weeks ago,3 stars,TFDB
4,4,"One of the better burgers I've had, the bread ...",2 weeks ago,5 stars,TFDB


In [267]:
def get_feature_descriptors(text):
    """Pair each noun in ``text`` with the descriptor words that follow it.

    The text is parsed with the module-level spaCy pipeline ``en``. Tokens are
    scanned in order: every token tagged ``NOUN`` becomes a feature, and each
    following ``ADV``/``ADJ``/``VERB`` token is attached to the most recently
    seen noun until the next noun appears. The current noun is not reset at
    sentence boundaries.

    Args:
        text: Review text to analyse.

    Returns:
        A one-element list holding a dict that maps noun text to the list of
        descriptor words (in order of appearance) attached to it. Descriptors
        that occur before the first noun are dropped.
    """
    descriptor_tags = ("ADV", "ADJ", "VERB")
    fd_dict = {}
    current_noun = None  # nothing to attach descriptors to yet

    for sent in en(text).sents:
        for word in sent:
            if word.pos_ == "NOUN":
                current_noun = word.text
                # setdefault keeps the descriptors already collected for an
                # earlier mention of the same noun instead of resetting them.
                fd_dict.setdefault(current_noun, [])
            elif word.pos_ in descriptor_tags and current_noun is not None:
                fd_dict[current_noun].append(word.text)
    return [fd_dict]

In [268]:
txt = """The food was really good, and my mom thinks it is amazing"""
get_feature_descriptors(txt)


[{'food': ['really', 'good'], 'mom': ['thinks', 'amazing']}]

In [269]:
# Extract the noun -> descriptor-words mapping for every review
df["feature_descriptors"] = df["Review"].apply(get_feature_descriptors)

In [270]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,Review,Date,Stars,Restaurant,feature_descriptors
0,0,"(Translated by Google) Good hamburgers, delici...",5 days ago,4 stars,TFDB,"[{'hamburgers': ['delicious', 'great'], 'cockt..."
3,3,"Burger was ok, not fantastic. Place was extre...",2 weeks ago,3 stars,TFDB,"[{'night': [], 'room': [], 'temperature': [], ..."
4,4,"One of the better burgers I've had, the bread ...",2 weeks ago,5 stars,TFDB,"[{'burgers': ['had'], 'bread': ['soft', 'well'..."


In [271]:
def get_review_level_sentiment(feat):
    """Score the descriptor words of every feature with Stanza sentiment.

    Each descriptor word is run on its own through the module-level Stanza
    pipeline ``nlp``; one score is recorded per sentence Stanza returns.

    Args:
        feat: One-element list whose single item is a dict mapping a feature
            (noun) to its list of descriptor words.

    Returns:
        A one-element list holding a dict that maps each feature to a list of
        scores, where every score is the Stanza sentence sentiment minus 1.
        Features without descriptors map to an empty list.
    """
    comb_dict = {}
    # Dict keys are unique, so each feature is visited (and scored) once.
    for key, descriptors in feat[0].items():
        # Stanza reports 0/1/2 (negative/neutral/positive); shifting by one
        # centres the scale on zero: -1/0/1.
        comb_dict[key] = [
            sentence.sentiment - 1
            for word in descriptors
            for sentence in nlp(word).sentences
        ]
    return [comb_dict]


        


In [272]:
df.iloc[5,5]

[{'burgers': ['reminds', 'only'],
  'thing': ['missing', 'perfect'],
  'weather': ['Fantastic'],
  'char': ['patty'],
  'onion': [],
  'rings': ['nicely', 'done']}]

In [273]:
# Turn every review's descriptor words into per-feature sentiment scores
df["sentiment_feat"] = df["feature_descriptors"].apply(get_review_level_sentiment)

In [274]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,Review,Date,Stars,Restaurant,feature_descriptors,sentiment_feat
0,0,"(Translated by Google) Good hamburgers, delici...",5 days ago,4 stars,TFDB,"[{'hamburgers': ['delicious', 'great'], 'cockt...","[{'hamburgers': [1, 1], 'cocktails': [1]}]"
3,3,"Burger was ok, not fantastic. Place was extre...",2 weeks ago,3 stars,TFDB,"[{'night': [], 'room': [], 'temperature': [], ...","[{'night': [], 'room': [], 'temperature': [], ..."
4,4,"One of the better burgers I've had, the bread ...",2 weeks ago,5 stars,TFDB,"[{'burgers': ['had'], 'bread': ['soft', 'well'...","[{'burgers': [0], 'bread': [0, 1, 0], 'meat': ..."


In [275]:
# Get avg sentiment for each feature at restaurant level

# restaurants = df["Restaurant"].unique()
# restaurants = list(restaurants)

# avg_sent_list = []

# for rest in restaurants:
#     tdf = df[df["Restaurant"] == rest]

#     rmap = dict()

#     ser = tdf["sentiment_feat"].tolist()
#     ser = [item for sublist in ser for item in sublist]
#     # print(ser)
    
#     for d in ser:
#         rmap.update(d)
    
#     for key in rmap.keys():
#         if len(rmap[key]) > 0:
#             m = mean(rmap[key])
#             rmap[key] = m
#     # print(rmap)
#     avg_sent_list.append((rest, rmap))


In [276]:
# Seed vocabulary used to categorise extracted features: each aspect name maps
# to a hand-picked list of representative words. The word vectors of these
# lists are averaged further down into one centroid per aspect.
corpus = {
  'food': ['pasta', 'food', 'drink', 'cocktails', 'pizza', 'roti', 'naan', 'noodles', 'burger', 'buns', 'bread', "soup"],
  'service': ['service', "waiter", 'plate', 'delivery', 'manager', 'staff', 'chef', 'chefs'],
  'price': ['price', 'value', 'cheap', 'expensive', 'bargain', 'deal', 'money', 'affordable', 'pricey'],
  'ambience': ['light', 'table', 'carpet', 'decoration', 'view', 'ambience'],
  'location' : ['area', 'location', 'destination', 'place'],
}

# Load the pre-trained word vectors into a {word: vector} lookup.
# The file path is relative to the notebook's working directory.
embeddings_index = {}
with open('glove.6B.100d.txt', encoding="utf8") as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ..."; split off the word only once.
        word, coefs = line.split(maxsplit=1)
        # Parse the remaining space-separated numbers ("f" dtype code).
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

# Report how many words were loaded.
print("Found %s word vectors." % len(embeddings_index))

# One entry per aspect; each starts as a list of seed-word vectors and is then
# replaced by the element-wise average of that list (the aspect centroid).
vect_corpus = {
    'food':[], 'service': [], 'price': [], 'ambience':[], 'location':[]
  }

# NOTE: `key`, `arr` and `word` are module-level names and keep their last
# value after this loop finishes.
for key in corpus.keys():
  arr = corpus[key]
  for word in arr:
    # Raises KeyError if a seed word has no entry in embeddings_index.
    vect_corpus[key].append(embeddings_index[word])
  # Collapse the collected vectors into a single centroid vector.
  vect_corpus[key] = np.average(vect_corpus[key], axis=0)

def recalculate():
    """Rebuild every aspect centroid in ``vect_corpus`` from ``corpus``.

    For each aspect in the module-level ``corpus``, the vectors of its seed
    words are looked up in ``embeddings_index`` and averaged element-wise.
    Call this after the seed lists in ``corpus`` have changed.

    Returns:
        The module-level ``vect_corpus`` dict, updated in place.
    """
    for aspect, seed_words in corpus.items():
        # Always average the raw word vectors: re-averaging a stored centroid
        # would collapse it to a scalar.
        vectors = [embeddings_index[w] for w in seed_words if w in embeddings_index]
        if vectors:  # keep the previous centroid when no seed word has a vector
            vect_corpus[aspect] = np.average(vectors, axis=0)
    return vect_corpus


Found 400000 word vectors.


In [277]:

def categorise(feat):
    """Group per-feature values under the closest aspect of ``corpus``.

    Every feature word is looked up in ``embeddings_index`` and compared, by
    cosine similarity, with each aspect centroid in ``vect_corpus``. The
    feature's values are then added to the list of the most similar aspect.
    Features without a word vector are skipped.

    Args:
        feat: One-element list whose single item is a dict mapping a feature
            word to a list of values (e.g. sentiment scores).

    Returns:
        A dict mapping aspect name to the concatenated values of all features
        assigned to that aspect. The returned lists are new objects; the
        input lists are not modified.
    """
    retdict = {}

    for key, values in feat[0].items():
        vkey = embeddings_index.get(key)
        if vkey is None:
            continue  # out-of-vocabulary feature: nothing to compare

        max_sim = -1
        cat = ""
        for aspect in corpus.keys():
            # Cosine similarity = 1 - cosine distance.
            result = 1 - spatial.distance.cosine(vkey, vect_corpus[aspect])
            if result > max_sim:  # strict: the first best aspect wins ties
                max_sim = result
                cat = aspect

        # Extend a list owned by retdict so the caller's lists stay untouched.
        retdict.setdefault(cat, []).extend(values)

    return retdict

In [278]:
# Spot-check categorise() on a single cell, addressed by position (column 5).
# NOTE(review): the printed cell is row 3 but the categorised cell is row 0,
# so the two outputs below describe different reviews — confirm this is intended.
print(df.iloc[3, 5])
categorise(df.iloc[0, 5])

[{'time': [], 'date': [], 'pm': [], 'wife': ['popped'], 'way': [], 'errands': ['was', 'wonderful', 'had'], 'no.1': ['had', 'nice'], 'touch': ['had'], 'servicing': [], 'sizes': ['smaller'], 'size': [], 'discerning': []}]


{'food': ['delicious', 'great', 'Original']}

In [279]:
# Roll the per-feature sentiment scores up into the aspect categories
df["categorised_sentiment"] = df["sentiment_feat"].apply(categorise)