In [1]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt') # one time execution
import re
# Load the review dump and keep only the columns needed for summarisation.
# Every column listed here is dropped from the frame right after loading.
cols = ['business_id','cool','date','funny','review_id','stars','useful','user_id']
# Raw string: "\s" is not a valid escape sequence, so the plain literal only
# works by accident and triggers a warning on recent Python versions.
path = r'D:\s8.json'
df = pd.read_json(path)
# Drop the unused columns and renumber the rows 0..n-1 so that positional
# and label-based lookups agree.
df = df.drop(columns=cols).reset_index(drop=True)
df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zhang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,text
0,Loved the new owners of this spot! The minute ...
1,My son and I took a trip out to Phoenix Herpet...
2,Had a horrible experence with the lash extensi...
3,I love this place. Everything is so fresh and ...
4,Very average pho joint. Nothing particularly s...


In [11]:
# Show the full text of the first review (row label 0) as a sanity check.
df.loc[0, 'text']

"Loved the new owners of this spot! The minute we walked in we were greeted and very friendly. The rooms have still not been upgraded, but we saw the renovations being planned and I cannot wait to come back! The rooms were still very comfortable and had a great view. We ate at the restaurant for brunch and dinner and it did not disappoint. The waiters were very friendly and well priced. Food was very flavorful, I highly recommend the scallops!  The front desk staff is very helpful and so nice. It's refreshing to get out of vegas and experience this. I cannot wait to come back, the location is beautiful!"

In [8]:
#generate a single summary for all the texts
# 1. Split Text into Sentences
from nltk.tokenize import sent_tokenize
# tqdm.auto picks the notebook widget or the console bar as appropriate; the
# plain `from tqdm import tqdm` was dropped because it was shadowed at once.
from tqdm.auto import tqdm
# NOTE(review): tqdm_notebook is not used in this cell; kept in case another
# cell still references it -- confirm and remove if not.
from tqdm import tqdm_notebook
# Tokenize every review and flatten the per-review sentence lists in one pass,
# so no intermediate list-of-lists is materialised.
sentences = [
    sentence
    for review in tqdm(df['text'], desc = 'progress bar', leave=True)
    for sentence in sent_tokenize(review)
]
print(len(sentences))
sentences[:5]

HBox(children=(IntProgress(value=0, description='progress bar', max=21122, style=ProgressStyle(description_wid…


174938


['Loved the new owners of this spot!',
 'The minute we walked in we were greeted and very friendly.',
 'The rooms have still not been upgraded, but we saw the renovations being planned and I cannot wait to come back!',
 'The rooms were still very comfortable and had a great view.',
 'We ate at the restaurant for brunch and dinner and it did not disappoint.']

In [3]:
# Extract word vectors
# Builds a {word: float32 vector} lookup from a whitespace-separated text file
# in which each line is a word followed by its coefficients.
# NOTE(review): the file name suggests 100-dimensional GloVe vectors -- confirm.
word_embeddings = {}
# Raw string: "\w" and "\g" are not valid escape sequences, so the plain
# literal only works by accident and warns on recent Python versions.
path1 = r'E:\word_embedding\glove_100d.txt'
# The context manager closes the file even if a line fails to parse.
with open(path1, encoding='utf-8') as f:
    for line in tqdm(f, desc = 'progress bar', leave=True):
        values = line.split()
        if not values:
            # Skip blank lines (e.g. a trailing newline) instead of raising IndexError.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs
len(word_embeddings)

HBox(children=(IntProgress(value=1, bar_style='info', description='progress bar', max=1, style=ProgressStyle(d…




400000

In [4]:
#Text Preprocessing
# remove punctuations, numbers and special characters
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave every sentence unchanged.
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)
# make alphabets lowercase
clean_sentences = [s.lower() for s in clean_sentences]
#Get rid of the stopwords
nltk.download('stopwords')
# import the stopwords
from nltk.corpus import stopwords
# English stop-word list consulted by remove_stopwords.
stop_words = stopwords.words('english')
# define a function to remove stopwords
def remove_stopwords(sen):
    """Return the tokens of ``sen`` that are not in ``stop_words``, joined by single spaces.

    ``sen`` is an iterable of word strings; ``stop_words`` is the module-level
    stop-word collection. Token order is preserved.
    """
    kept_tokens = filter(lambda token: token not in stop_words, sen)
    return " ".join(kept_tokens)
# Strip stop words from every cleaned sentence: split each one on whitespace,
# filter the tokens, and re-join them into a single string.
clean_sentences = list(map(lambda text: remove_stopwords(text.split()), clean_sentences))
clean_sentences[:5]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zhang\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['loved new owners spot',
 'minute walked greeted friendly',
 'rooms still upgraded saw renovations planned cannot wait come back',
 'rooms still comfortable great view',
 'ate restaurant brunch dinner disappoint']

In [5]:
# Turn each cleaned sentence into one vector by averaging the embeddings of its
# words. Words missing from word_embeddings contribute a zero vector, and an
# empty sentence maps to an all-zero vector.

sentence_vectors = []
for sentence in tqdm(clean_sentences, desc = 'progress bar', leave=True):
    if sentence:
        tokens = sentence.split()
        token_vectors = [word_embeddings.get(token, np.zeros((100,))) for token in tokens]
        # The +0.001 in the denominator keeps the division safe from a zero count.
        v = sum(token_vectors)/(len(tokens)+0.001)
    else:
        v = np.zeros((100,))
    sentence_vectors.append(v)
# Peek at the first components of the last vector computed.
v[:5]

HBox(children=(IntProgress(value=0, description='progress bar', max=174938, style=ProgressStyle(description_wi…




array([-0.01573185,  0.19939011,  0.514905  , -0.3805214 ,  0.01422596],
      dtype=float32)

In [14]:
# similarity matrix
# Ranks sentences by running PageRank on a graph whose edge weights are the
# cosine similarities between sentence vectors, then prints the top ones.
# Explicit import instead of `import *` so the origin of the name is visible.
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

# Only the leading `length` sentences are ranked: the similarity matrix is
# length x length, so the full corpus would not fit in memory.
length = len(sentences) - 170000
if length <= 0:
    raise ValueError(
        "need more than 170000 sentences to build the similarity matrix, got %d"
        % len(sentences)
    )

# One vectorised call computes every pairwise cosine similarity at once,
# replacing length**2 separate calls on single-row arrays. The vectors are
# stacked as float64 so the result has the same dtype as the previous
# np.zeros-initialised matrix.
vectors = np.asarray(sentence_vectors[:length], dtype=np.float64)
sim_mat = cosine_similarity(vectors)
# A sentence is not compared with itself: keep the diagonal at zero.
np.fill_diagonal(sim_mat, 0.0)

#convert the similarity matrix sim_mat into a graph

nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

# extract the top N sentences based on their rankings for summary generation
# Slice with `length` (not a hard-coded count) so the sentences always line up
# with the nodes that were actually scored.
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences[:length])), reverse=True)
# Extract top 10 sentences as the summary
# Slicing tolerates fewer than 10 ranked sentences instead of raising IndexError.
for ranked in ranked_sentences[:10]:
  print(ranked[1])


HBox(children=(IntProgress(value=0, description='progress bar', max=4938, style=ProgressStyle(description_widt…


he spoke broken English so I couldn't really understand him but when I complained they informed me that the chips they give you are STARTER chips just to get you going which is why when I won I was only given $5 instead of $10...yeah that makes sense.....We did not get a chance to eat at the buffet due to incredibly long lines which btw you have a 3 hr time frame to eat else your free credit is worthless and their insistence that we charge everything to the room.
I'm by far not a regular but their staff always makes you feel like you are every time you go in.
The whole bathroom was quite large which made getting ready for the day so easy.
All in all, not a very good first-time restaurant experience...none of us plan to go back again...just doesn't live up to the hype.
I like little touches like that seem like a gimmick but to a little boy without a high chair it's a big smile and it takes such little effort for the cook (I used to be a cook) or the server to make a plate look good.
I 