In [25]:
import pandas as pd
from collections import Counter
import numpy as np
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import warnings
import torch
from allennlp.commands.elmo import ElmoEmbedder
import scipy
import random
from sklearn.feature_extraction.text import CountVectorizer
from preprocess import sent_filter, prep_text
from sklearn.cluster import KMeans
import os
warnings.filterwarnings("ignore")

In [40]:
def _keep_sentence(sentence):
    """Return True when a tokenized sentence should stay in a description.

    A sentence is dropped when it is both shorter than 10 characters and has
    fewer than 4 whitespace-separated words, or when ``sent_filter`` rejects
    it. Any error raised while checking also drops the sentence (best-effort
    filtering, as in the original cell).
    """
    try:
        if len(sentence) < 10 and len(sentence.split()) < 4:
            return False
        return bool(sent_filter(sentence))
    except Exception:
        # Deliberately best-effort: a sentence that cannot be checked is discarded.
        return False


listing = pd.read_csv("Data/listings.csv")
# Only these two columns are used; rows missing either one are discarded.
overviews = listing[["neighbourhood", "neighborhood_overview"]].dropna()
# Sorted so the order of rows in the output CSV is reproducible between runs.
neighbors = sorted(set(overviews.neighbourhood))

describs = {}
for ngb, texts in overviews.groupby("neighbourhood")["neighborhood_overview"]:
    # Join with a space: concatenating overviews with "" glues the last
    # sentence of one overview to the first sentence of the next
    # ("...great.Close to..."), which the sentence tokenizer may not split.
    items = sent_tokenize(" ".join(texts))
    describs[ngb] = " ".join(sent for sent in items if _keep_sentence(sent))

# One row per neighbourhood: its name and the filtered, concatenated description.
frame = pd.DataFrame({
    "neighborhood": list(describs.keys()),
    "descriptions": list(describs.values()),
})
frame.to_csv("host_descriptions.csv", index = False)

In [14]:
# Reload the per-neighbourhood descriptions written by the previous step and
# turn them into a plain {neighborhood: description} mapping.
descriptions_frame = pd.read_csv("host_descriptions.csv")
overviews = descriptions_frame.set_index("neighborhood")["descriptions"].to_dict()
# View over the neighbourhood names; iterated by the summarisation loop.
keys = overviews.keys()

In [27]:
# Shared ELMo embedder instance used by cluster_sum below.
elmo = ElmoEmbedder()


def cluster_sum(raw_sents, new_sents, n_clusters):
    """Pick one representative raw sentence per k-means cluster.

    Pipeline: embed -> cluster -> sample.

    Each entry of ``new_sents`` is split on whitespace and embedded with
    ELMo; the embedding is averaged over axis 1 and flattened into a single
    vector (presumably averaging over tokens and concatenating the layers --
    TODO confirm against the ElmoEmbedder output layout). The vectors are
    clustered with KMeans (``random_state=0``), and for every cluster the
    sentence with the smallest distance to that cluster's centre is selected.

    Parameters
    ----------
    raw_sents : sequence
        Original sentences. Must be index-aligned with ``new_sents``,
        because positions found in ``new_sents`` are used to index it.
    new_sents : sequence of str
        Preprocessed sentences that are actually embedded.
    n_clusters : int
        Number of clusters, and therefore of returned sentences.

    Returns
    -------
    numpy.ndarray
        The ``raw_sents`` entries at the selected positions, one per cluster.
    """
    vectors = [
        np.mean(elmo.embed_sentence(sentence.split()), axis=1).reshape(-1)
        for sentence in new_sents
    ]
    model = KMeans(n_clusters=n_clusters, random_state=0).fit(vectors)
    # Rows are sentences, columns are clusters.
    dist_to_centres = model.transform(vectors)
    closest = np.argmin(dist_to_centres, axis=0)
    return np.array(raw_sents)[closest]

04/04/2019 20:59:23 - INFO - allennlp.commands.elmo -   Initializing ELMo.


In [34]:
# Build a short extractive summary per neighbourhood: up to five sentences.
summary = {}
for item in keys:
    text = overviews[item]
    # Skip non-string values (e.g. NaN read back from an empty CSV field)
    # and descriptions too short to be worth summarising.
    if not isinstance(text, str) or len(text) <= 5:
        continue
    try:
        raw_sents, new_sents = prep_text(sent_tokenize(text), stopwords_removal = False)
        # De-duplicate the (raw, preprocessed) pairs together and in order.
        # Applying set() to each list separately destroys the positional
        # correspondence cluster_sum relies on (and can leave the two lists
        # with different lengths), so the sentence picked for a cluster would
        # not be the one that was embedded. It also made results vary from
        # run to run because set ordering of strings is not stable.
        # Assumes prep_text returns two parallel lists of strings -- TODO confirm.
        unique_pairs = list(dict.fromkeys(zip(raw_sents, new_sents)))
        raw_sents = [raw for raw, _ in unique_pairs]
        new_sents = [new for _, new in unique_pairs]
        if len(raw_sents) > 5:
            summ = cluster_sum(raw_sents, new_sents, 5)
            summary[item] = summ
        else:
            # Five sentences or fewer: keep them all, no clustering needed.
            summary[item] = raw_sents
    except Exception as err:
        # Best-effort: one failing neighbourhood must not abort the run, but
        # report it instead of hiding it (warnings are globally silenced).
        print("Skipping %r: %s: %s" % (item, type(err).__name__, err))
        continue

In [68]:
# Turn the {neighbourhood: sentences} mapping into a table. Only the first
# sentence column (0) is renamed to "overviews"; any further columns keep
# their integer labels.
frame = (
    pd.DataFrame.from_dict(summary, orient='index')
    .reset_index(level=0)
    .rename(columns={'index':'neighbourhood', 0: "overviews"})
)
dd = pd.read_csv("Results/summary2.csv")

# Left-join so every row of the review summaries is kept, drop the index
# column that was saved into the CSV, and label the review summary column.
summs_merged = (
    pd.merge(dd, frame, how='left', on='neighbourhood')
    .drop("Unnamed: 0", axis = 1)
    .rename(columns={'cluster_sum':'reviews'})
)
summs_merged.to_csv("final_summary.csv", index = False)
#summs_merged.to_csv("overviews_summary.csv")

In [None]:
# Cells longer than this many characters are blanked out in the final table.
MAX_SUMMARY_CHARS = 2000


def _read_lines(path):
    """Return the lines of the text file at ``path``.

    The single empty element produced by a trailing newline is removed, so a
    last line without a terminating newline is kept rather than dropped.
    """
    with open(path, 'r') as handle:
        lines = handle.read().split("\n")
    if lines and lines[-1] == "":
        lines.pop()
    return lines


dd = pd.read_csv("restaurants.csv")

# os.listdir returns entries in arbitrary order, so slicing off "the first
# one" can discard a real folder. Select sub-directories explicitly and skip
# hidden entries instead; sort for a reproducible column order.
folders = sorted(
    name for name in os.listdir("query_sum")
    if not name.startswith(".") and os.path.isdir(os.path.join("query_sum", name))
)

# One frame per query folder, with columns "<folder>_1", "<folder>_2", ...
# holding the lines of each neighbourhood's summary file.
summs = []
for folder in folders:
    folder_path = os.path.join("query_sum", folder)
    summary = {}
    for file in sorted(os.listdir(folder_path)):
        # Match the extension exactly; file[:-4] below relies on ".txt".
        if not file.endswith(".txt"):
            continue
        sents = _read_lines(os.path.join(folder_path, file))
        summary[file[:-4]] = {
            folder + "_" + str(j + 1): sent for j, sent in enumerate(sents)
        }
    frame = (
        pd.DataFrame.from_dict(summary, orient='index')
        .reset_index(level=0)
        .rename(columns={'index':'neighbourhood'})
    )
    # Align rows with dd; the key column is dropped because it is carried by
    # cluster_summ in the final concatenation.
    summ = pd.merge(dd[['neighbourhood']], frame, how='left', on='neighbourhood').drop(['neighbourhood'], axis = 1)
    summs.append(summ)

# Cluster-based summaries: one text file per neighbourhood, lines joined by spaces.
files = os.listdir("cluster_sum")
summary = {}
for file in files:
    if not file.endswith(".txt"):
        continue
    with open(os.path.join("cluster_sum", file), 'r') as handle:
        summary[file[:-4]] = " ".join(handle.read().split("\n"))
frame = pd.DataFrame.from_dict(summary, orient='index', columns = ["cluster_sum"])
frame = frame.reset_index(level=0).rename(columns={'index':'neighbourhood'})
cluster_summ = pd.merge(dd, frame, how='left', on='neighbourhood')[['neighbourhood', "cluster_sum"]]


result = pd.concat([cluster_summ]+summs, axis=1)
# Blank out over-long text cells. Non-string cells (e.g. NaN from the left
# joins) are skipped by an explicit type check rather than a bare except.
for i in range(result.shape[0]):
    for j in range(result.shape[1]):
        value = result.iloc[i, j]
        if isinstance(value, str) and len(value) > MAX_SUMMARY_CHARS:
            result.iloc[i, j] = float('NaN')
result.to_csv("summary2.csv")