In [3]:
import pandas as pd
from collections import Counter
import numpy as np
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
import torch
warnings.filterwarnings("ignore")

In [4]:
# Help Functions
stop_words = stopwords.words('English')
lemmatizer = WordNetLemmatizer()
def is_eng(word):
    for char in word:
        if ord(char) < 32 or ord(char) > 126:
            return False
    return True

def preprocess_text(text):
    text = text.replace('[^\w\s]', '')
    text = text.replace('\d+', '')
    text = " ".join([lemmatizer.lemmatize(word.lower()) for word in nltk.word_tokenize(text) if (word not in stop_words) 
                     and is_eng(word) and word.isalpha()])
    return text

In [5]:
# Read Data and Merge Three Tables
reviews =  pd.read_csv("reviews.csv")
listing = pd.read_csv("listings.csv")
neighbors = pd.read_csv("neighbourhoods.csv")
listing = listing.rename(columns={"id": "listing_id"})
merged = pd.merge(listing, reviews, on = "listing_id", how ="inner")
data = pd.merge(merged, neighbors, on = "neighbourhood", how ="left")[['neighbourhood','neighbourhood_group','comments']]

## KL SUM

In [4]:
comments = " ".join(map(str, list(data[data["neighbourhood"] == "Harlem"]["comments"])))
raw_sent = sent_tokenize(comments)
new_sent = [preprocess_text(raw_sent[i]) for i in range(len(raw_sent))]

In [7]:
def KL(sen_words, doc_count):
    length_sen = len(sen_words)
    length_doc = sum(doc_count.values())
    sen_count = Counter(sen_words)
    
    kl_score = 0
    for item in sen_count.keys():
        p = doc_count[item]/length_doc
        q = sen_count[item]/length_sen
        kl_score += p * np.log(p / q)
    return kl_score

def KLsum(orig, doc, L):
    doc_words = [word for line in doc for word in line.split(" ")]
    doc_count = Counter(doc_words)
    sentences = [line.split(" ") for line in doc]
    sen_score = [KL(sent,doc_count) for sent in sentences]
    
    pos = []
    num_output = 0
    l = 0
    for i in np.argsort(sen_score):
        l += len(orig[i].split(" "))
        
        pos.append(i)
        if l > L:
            break
    pos = sorted(pos)
    #print(pos)
    summ = []
    summ = [orig[i] for i in pos]
    
    return ". ".join(summ)

In [8]:
KLsum(raw_sent, new_sent, 20)

'The apartment was very clean and everything was as advertised the location was great with plenty of shops and places to eat.We walked to central park in about 20 minutes and the subway was on the next corner up from the apartment all in all a great place to stay.We would stay there again.'

## LDA SUM

In [None]:
nbors = list(set(data[data["neighbourhood_group"] == "Manhattan"]['neighbourhood']))
com_docs = []
com_docs_orig = []
for i in range(len(nbors)):
    item = " ".join(map(str, list(data[data["neighbourhood"] == nbors[i]]["comments"])))
    raw_sent = sent_tokenize(item)
    new_sent = []
    for i in range(len(raw_sent)-1, -1, -1):
        wr = preprocess_text(raw_sent[i])
        if wr != "":
            new_sent.append(wr)
        else:
            del raw_sent[i]
    com_docs_orig.append("%%%".join(raw_sent))
    com_docs.append("%%%".join(new_sent))

In [40]:
vectorizer_nbors = TfidfVectorizer(sublinear_tf = True)
duc_com = vectorizer_nbors.fit_transform(com_docs)
word_com = {k:v for v,k in vectorizer_nbors.vocabulary_.items()}

In [46]:
def lda_score(doc_prob, sent, vocabulary, components):
    num_word = [vocabulary[w] for w in sent if w in vocabulary]
    sen_prob = np.sum([components[:,num] for num in num_word], axis = 0)
    
    doc_pd = doc_prob / np.sum(doc_prob)
    sen_pd = sen_prob / np.sum(sen_prob)

    score = 0
    try:
        iter(sen_prob)
    except TypeError:
        res = 0
    else:
        for p, q in zip(doc_pd, sen_pd):
            score += p*np.log(p/q)
    #print(score)
    return score

def ldasum(doc_prob, orig, doc, vocabulary, components, L):
    orig = orig.split("%%%")
    doc = doc.split("%%%")
    #print(len(orig), len(doc))
    scores = [lda_score(doc_prob, sen.split(" "),vocabulary, components) for sen in doc if len(sen)!=1]
    
    pos = []
    l = 0
    for i in np.argsort(scores):
        l += len(orig[i].split(" "))
        pos.append(i)
        #print(pos)
        if l > L:
            break
    pos = sorted(pos)
    
    summ = []
    summ = [orig[i] for i in pos]
    
    return ". ".join(summ)

In [42]:
n_topic = 20
lda_com = LDA(n_components=n_topic)
lda_com.fit(duc_com)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=20, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [47]:
lda_sum_com = []
for i in range(26):
    if com_docs_orig[i] != "":
        lda_sum_com.append(ldasum(lda_com.transform(duc_com)[i], com_docs_orig[i], com_docs[i], vectorizer_nbors.vocabulary_, lda_com.components_, 50))
    else:
        lda_sum_com.append("")
LDA_sum = {nbors[i]:lda_sum_com[i] for i in range(len(nbors))}

In [52]:
LDA_sum.keys()

dict_keys(['Flatiron District', 'Battery Park City', "Hell's Kitchen", 'Harlem', 'West Village', 'East Village', 'Greenwich Village', 'Washington Heights', 'Chinatown', 'Civic Center', 'Nolita', 'Kips Bay', 'Marble Hill', 'Lower East Side', 'Midtown', 'Upper East Side', 'Murray Hill', 'Morningside Heights', 'Upper West Side', 'Financial District', 'Tribeca', 'Inwood', 'Chelsea', 'Little Italy', 'East Harlem', 'Roosevelt Island'])

In [58]:
LDA_sum['East Harlem']

'- We felt very secure in the neighborhood.. Everything was just as described.. Mimi provided towels and a box of toiletries for if you have forgotten anything.. I enjoyed my stay!. Bei Christopher und Co. Hat man wirklich eine super nette Truppe, die einen offen und sehr lieb empfängt.. There is an elevator in the building so we had no issues getting our luggage in and out.'

## ELMO

In [8]:
! 

Password:


In [1]:
from allennlp.commands.elmo import ElmoEmbedder
elmo = ElmoEmbedder()

ModuleNotFoundError: No module named 'allennlp'