In [18]:
import pandas as pd
from collections import Counter
import numpy as np
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
import torch
from allennlp.commands.elmo import ElmoEmbedder
import scipy
import random
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
import os
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
warnings.filterwarnings("ignore")
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
from bs4 import BeautifulSoup
import string, unicodedata
import re
from nltk.corpus import wordnet
import os
from langdetect import detect
from sklearn.feature_extraction.text import CountVectorizer
from preprocess import sent_filter, prep_text
from summarization_methods import KL_sum, cluster_sum, query_sum

04/04/2019 21:13:39 - INFO - allennlp.commands.elmo -   Initializing ELMo.


## 1. Data Preprocessing

### 1.1 Read data and merge tables

In [3]:
# Load the three source tables and join them into one frame holding the
# neighbourhood, its group and the review text.
reviews = pd.read_csv("Data/reviews.csv")
# Listings expose their key as "id"; rename it so it lines up with reviews.
listing = pd.read_csv("Data/listings.csv").rename(columns={"id": "listing_id"})
neighbors = pd.read_csv("Data/neighbourhoods.csv")
merged = listing.merge(reviews, on="listing_id", how="inner")
data = (
    merged
    .merge(neighbors, on="neighbourhood", how="left")
    [["neighbourhood", "neighbourhood_group", "comments"]]
    .dropna()  # drop rows missing any of the three retained columns
)

### 1.2 Filter sentences

In [None]:
# For every neighbourhood: join all of its comments, split them into
# sentences, and write the sentences accepted by `sent_filter` to
# prep_reviews/<neighbourhood>.txt, one per line.
neighbourhoods = sorted(set(data["neighbourhood"]))
# Same spelling as the directory the loading cell reads from, so the pipeline
# also works on case-sensitive filesystems; create it if it is missing.
os.makedirs("prep_reviews", exist_ok=True)
for ngb in neighbourhoods:
    ngb_text = " ".join(map(str, data.loc[data["neighbourhood"] == ngb, "comments"]))
    comments = sent_tokenize(ngb_text)
    raw_sent = []
    with open("prep_reviews/" + ngb + ".txt", "w") as file:
        # enumerate keeps the progress counter advancing even when a sentence
        # is skipped because of an error.
        for count, item in enumerate(comments):
            if count % 5000 == 0:
                print(count, len(comments))
            # Ignore fragments: need more than 10 characters and more than 4 tokens.
            if len(item) > 10 and len(item.split()) > 4:
                try:
                    if sent_filter(item):
                        raw_sent.append(item)
                        file.write(item + "\n")
                except Exception:
                    # Best effort, as before: a sentence that cannot be
                    # filtered or written is skipped. Unlike a bare `except`,
                    # this still lets KeyboardInterrupt stop the cell.
                    continue

### 1.3 Text preprocessing: load the filtered sentences

In [5]:
# Load the filtered sentences back from disk:
# reviews maps a neighbourhood name (file name without ".txt") to its list of
# sentences; keys is the alphabetically sorted list of neighbourhood names.
reviews = {}
files = os.listdir("prep_reviews")
for file in files:
    # Match on the extension only; a substring test would also accept names
    # that merely contain "txt" and then mangle the key via file[:-4].
    if file.endswith(".txt"):
        with open('prep_reviews/' + file, 'r') as f:
            # One sentence per line; skip blank lines so the trailing newline
            # does not produce an empty "sentence".
            reviews[file[:-4]] = [line for line in f.read().split("\n") if line]
keys = sorted(reviews)

## 2. Sentiment Analysis

In [16]:
# For each neighbourhood keep the 10 sentences with the lowest VADER compound
# score (fewer when the neighbourhood has fewer sentences).
sia = SIA()
neg_sents = {}
for item in keys:
    scores = [sia.polarity_scores(sent)['compound'] for sent in reviews[item]]
    # Rank once per neighbourhood, after all sentences are scored, instead of
    # re-sorting after every single sentence.
    neg_sents[item] = np.array(reviews[item])[np.argsort(scores)[:10]]

In [15]:
# Show the lowest-compound-score sentences for the third neighbourhood in the sorted `keys` list.
neg_sents[keys[2]]

array(['Steep stairs to bedrooms and basement but only a problem if you have problems climbing stairs.',
       'It was very dirty and smelled really bad.',
       'Absolutely no complaints, you cannot ask for a better host.',
       'NO noise at all, u can sleep with no problems.',
       'Nothing like a smiling beautiful little baby to make you feel at home.'],
      dtype='<U220')

## 3. Summarization

### 3.1 Cluster Sum

In [None]:
# Write a summary file per neighbourhood to Cluster_Sum/<neighbourhood>.txt.
# Neighbourhoods with more than 6 sentences are summarised with `cluster_sum`
# (called with 6 as its last argument); smaller ones are written out as they are.
os.makedirs("Cluster_Sum", exist_ok=True)
for item in keys:
    if len(reviews[item]) > 6:
        raw_sents, new_sents = prep_text(reviews[item], stopwords_removal = False)
        summ = cluster_sum(raw_sents, new_sents, 6)
    else:
        # Use this neighbourhood's own sentences. The previous code read
        # `raw_sents` here, which was either undefined or left over from an
        # earlier neighbourhood.
        summ = [sent for sent in reviews[item] if sent]
    # Open the file only once the content is known, so a failure above does
    # not leave an open handle behind.
    with open("Cluster_Sum/" + item + ".txt", "w") as file:
        for sent in summ:
            file.write(sent + "\n")

### 3.2 Query-Based Model

In [11]:
# For each neighbourhood with more than 6 sentences, ask `query_sum` for
# sentences per query term (called with 5 as its last argument) and write them
# to query_sum/<term>/<neighbourhood>.txt.
querys = ["expense", "noise", "host", "entertainment", "shopping", "nightlife", "restaurant"]
# The list used to be bound to `query` while the loop read `querys`, which
# fails with a NameError on a fresh kernel. Keep the old name as an alias.
query = querys
for term in querys:
    os.makedirs("query_sum/" + term, exist_ok=True)
for item in keys:
    if len(reviews[item]) > 6:
        raw_sents, new_sents = prep_text(reviews[item], stopwords_removal = False)
        query_dict = query_sum(raw_sents, new_sents, querys, 5)
        for term in querys:
            print(item, term)
            with open("query_sum/" + term + "/" + item + ".txt", "w") as file:
                for sent in query_dict[term]:
                    file.write(sent + "\n")

## 4. Top Key Words

In [52]:
# Topics whose relative share of mentions is measured per neighbourhood.
topic_words = [
    'noise', 'safety', 'entertainment', 'restaurant', 'host',
    'expense', 'shopping', 'nightlife', 'transit',
]


def get_nyms(word):
    """Collect WordNet synonym and antonym names for `word`.

    Every lemma of every synset returned by `wordnet.synsets(word)`
    contributes its name; a lemma that has antonyms also contributes the name
    of its first antonym.

    Returns a list made of the distinct synonym names followed by the distinct
    antonym names. The order inside each group is not defined.
    """
    lemmas = [lemma for synset in wordnet.synsets(word) for lemma in synset.lemmas()]
    synonym_names = {lemma.name() for lemma in lemmas}
    antonym_names = {lemma.antonyms()[0].name() for lemma in lemmas if lemma.antonyms()}
    return list(synonym_names) + list(antonym_names)
    
def nyms_freq(sents):
    """Return each topic word's share of topic-related term occurrences.

    The sentences are vectorised with a CountVectorizer (English stop words
    removed, terms appearing in more than 85% of the sentences dropped). For
    every entry of `topic_words`, the corpus-wide counts of all names returned
    by `get_nyms` that appear in the vocabulary are added up, and the totals
    are normalised so they sum to 1 (rounded to 3 decimals).

    Parameters
    ----------
    sents : iterable of str
        Sentences to analyse.

    Returns
    -------
    dict or float
        Mapping topic word -> proportion, or NaN when no topic-related term
        occurs or when the vectorizer ends up with no vocabulary at all.
    """
    cv = CountVectorizer(max_df=0.85, stop_words=stopwords.words('english'))
    try:
        doc_term = cv.fit_transform(sents)
    except ValueError:
        # scikit-learn raises ValueError when no term survives tokenisation
        # and pruning; report that like any other "nothing to count" case.
        return float('NaN')

    vocab = cv.vocabulary_
    # vocabulary_ maps each term to its column index, not to a count; the
    # actual occurrence counts are the column sums of the document-term matrix.
    totals = np.asarray(doc_term.sum(axis=0)).ravel()

    words_freq = {word: 0 for word in topic_words}
    for word in topic_words:
        for nym in get_nyms(word):
            col = vocab.get(nym)
            if col is not None:
                words_freq[word] += int(totals[col])

    summ = sum(words_freq.values())
    if summ > 0:
        return {word: round(freq / summ, 3) for word, freq in words_freq.items()}
    return float('NaN')

In [40]:
# Topic-word proportions for every neighbourhood that has more than one sentence.
key_words = {
    item: nyms_freq(reviews[item])
    for item in keys
    if len(reviews[item]) > 1
}

In [53]:
# Attach the per-neighbourhood topic proportions to the neighbourhoods listed
# in restaurants.csv and save the result.
dd = pd.read_csv("restaurants.csv")
word_props = (
    pd.DataFrame.from_dict(key_words)
    .transpose()                     # one row per neighbourhood
    .reset_index(level=0)
    .rename(columns={'index': 'neighbourhood'})
)
# Left join keeps every neighbourhood of `dd`, including those without proportions.
selected_columns = ['neighbourhood'] + topic_words
word_props = pd.merge(dd, word_props, how='left', on='neighbourhood')[selected_columns]
word_props.to_csv("words_prop.csv")