In [2]:
import pandas as pd

In [1]:
# Load the listing descriptions: the first row supplies the column names and the
# first column becomes the index.
# NOTE(review): absolute, machine-specific path; this cell also carries execution
# count 1 while `import pandas as pd` carries 2, so the saved run order was not top-to-bottom.
nyc_desc = pd.read_csv('/Users/Xiao/Documents/nyc_desc.csv', header=0, index_col=0)

In [3]:
nyc_desc['descriptions']

1        Step into our artistic spacious apartment and ...
2        An adorable, classic, clean, light-filled one-...
3        This is a spacious, clean, furnished master be...
4        Make your NY story in our welcoming, home away...
5        Urban retreat: enjoy 500 s.f. floor in 1899 br...
6        2 BR apartment, with 1 kitchen, 1 bathroom and...
7        This listing is NO LONGER active.  ___________...
8        A beautiful home away from home for your stay ...
9        Please don’t expect the luxury here just a bas...
10       Ask about discounts for January to March!  Ren...
11       Beautiful 2 Bedroom Apartment, Great for Group...
12       Live like a New Yorker instead of a stuffy, ti...
13       Great, fun, comfortable loft apartment, fantas...
14       rent a room in a brick house rather than just ...
15       Freshly painted, bright open space! Excellent ...
16       A great location and super quiet apartment in ...
17       Meander down our beautiful tree lined street u.

In [4]:
nyc_desc.shape

(11824, 1)

In [5]:
import nltk

In [6]:
import gensim
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.models.ldamodel import LdaModel

import itertools
from collections import Counter
from collections import defaultdict

import json


In [7]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

In [189]:
nltk.download('words')

[nltk_data] Downloading package words to /Users/Xiao/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [190]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/Xiao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [191]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/Xiao/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
import string
string.digits

'0123456789'

In [9]:
# Strip every ASCII digit from the descriptions. The vectorised `.str` accessor
# leaves missing values as NaN instead of raising on them, and `string.digits`
# replaces the hand-typed digit literal.
nyc_desc['descriptions'] = nyc_desc['descriptions'].str.translate(str.maketrans('', '', string.digits))

In [10]:
nyc_desc.tail()

Unnamed: 0,descriptions
11820,"Hello, My listing is located a block away from..."
11821,"Beautifully lit, newly renovated bedroom apar..."
11822,"You will adore this sunny, lovely, and fully e..."
11823,"Hello! Clean, renovated room, mins away from ..."
11824,"Charming, cozy atmosphere The whole bottom flo..."


In [11]:
# Free-text columns that feed the topic model; `corpus` is the matching
# single-column slice of the listings frame.
nlp_feats = ['descriptions']
corpus = nyc_desc.loc[:, nlp_feats]

In [40]:
def preprocess_text(corpus):
    """Tokenize, normalise and lemmatize every document in ``corpus``.

    Steps applied to each document, in order:

    1. Split into runs of word characters and ``!``.
    2. Lower-case every token.
    3. Keep purely alphabetic tokens only when they occur in the NLTK
       ``words`` corpus (compared case-insensitively); tokens containing any
       non-letter character are kept unconditionally.
    4. Drop NLTK English stop words plus a small custom list (spelled-out
       numbers, ``also``, ``new``, ``york``).
    5. Lemmatize the remaining tokens with ``WordNetLemmatizer``.

    Uses the NLTK ``words``, ``stopwords`` and WordNet resources.

    Parameters
    ----------
    corpus : iterable of str
        One document per element.

    Returns
    -------
    list of list of str
        One list of processed tokens per input document, in input order.
    """
    processed_corpus = []

    # Tokens are lower-cased before the vocabulary lookup, so the word list is
    # folded to lower case too; otherwise entries stored capitalised could
    # never match.
    english_words = {w.lower() for w in nltk.corpus.words.words()}
    customized_set = {'one','two','three','four','five','six','seven','eight','nine','ten','also', 'new','york'}
    customized_stopwords = set(stopwords.words('english')) | customized_set

    wordnet_lemmatizer = WordNetLemmatizer()
    # Inside a character class `|` is a literal pipe, not alternation, so it is
    # left out: only word characters and `!` form tokens.
    tokenizer = RegexpTokenizer(r'[\w!]+')
    for row in corpus:
        tokens = [t.lower() for t in tokenizer.tokenize(row)]
        # NOTE(review): this filter runs before lemmatization, so an inflected
        # form survives only if that exact form is in the word list - confirm
        # this is intended.
        tokens = [t for t in tokens if t in english_words or not t.isalpha()]
        processed_corpus.append(
            [wordnet_lemmatizer.lemmatize(t) for t in tokens if t not in customized_stopwords]
        )
    return processed_corpus

In [41]:
def nlp_model_pipeline(processed_corpus):
    """Build the term dictionary and bag-of-words corpus for tokenized documents.

    Parameters
    ----------
    processed_corpus : list of list of str
        One token list per document.

    Returns
    -------
    tuple
        ``(dictionary, doc_term_matrix)``: the gensim ``Dictionary`` built from
        the documents, and a list holding the ``doc2bow`` result for each
        document in input order. No model is trained here.
    """
    # Index every unique term first; doc2bow needs the finished mapping.
    term_index = Dictionary(processed_corpus)
    bags_of_words = list(map(term_index.doc2bow, processed_corpus))
    return term_index, bags_of_words

In [42]:

def LDA_topic_modelling(doc_term_matrix, dictionary, num_topics=3, passes=2, random_state=None):
    """Train a gensim ``LdaModel`` on a bag-of-words corpus.

    Parameters
    ----------
    doc_term_matrix : list
        Bag-of-words documents, as produced by ``Dictionary.doc2bow``.
    dictionary : gensim.corpora.dictionary.Dictionary
        Mapping passed to the model as ``id2word``.
    num_topics : int, default 3
        Number of topics to fit.
    passes : int, default 2
        Number of training passes over the corpus.
    random_state : int, numpy.random.RandomState or None, default None
        Forwarded to ``LdaModel``. Pass a fixed value to make training
        repeatable; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    gensim.models.ldamodel.LdaModel
        The trained model.
    """
    return LdaModel(
        doc_term_matrix,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )

In [43]:
def add_topics_to_df(ldamodel, doc_term_matrix, df, num_topics):
    """Return the most probable topic id for every document.

    Despite the name, nothing is merged into ``df``; only its index is used to
    label the result.

    Parameters
    ----------
    ldamodel : object
        Anything for which ``ldamodel[doc_term_matrix]`` yields, per document,
        an iterable of ``(topic_id, probability)`` pairs.
    doc_term_matrix : list
        The documents to score, in the same order as the rows of ``df``.
    df : pandas.DataFrame
        Frame whose index labels the documents.
    num_topics : int
        Number of topics; topic ids are expected in ``range(num_topics)``.

    Returns
    -------
    pandas.Series
        Indexed like ``df``; each value is the topic id with the highest
        probability for that document.
    """
    import numpy as np  # local import: the notebook does not import numpy at the top

    # Fill a plain float array instead of writing single cells through `.iloc`
    # on an object-dtype frame. Topics the model does not report stay NaN,
    # which `idxmax` skips.
    probabilities = np.full((len(df.index), num_topics), np.nan, dtype='float64')
    for row, doc in enumerate(ldamodel[doc_term_matrix]):
        for topic_id, probability in doc:
            probabilities[row, topic_id] = probability

    doc_topic_prob_df = pd.DataFrame(probabilities, index=df.index, columns=range(0, num_topics))
    return doc_topic_prob_df.idxmax(axis=1)

In [44]:
corpus

Unnamed: 0,descriptions
1,Step into our artistic spacious apartment and ...
2,"An adorable, classic, clean, light-filled one-..."
3,"This is a spacious, clean, furnished master be..."
4,"Make your NY story in our welcoming, home away..."
5,Urban retreat: enjoy s.f. floor in brownston...
6,"BR apartment, with kitchen, bathroom and wo..."
7,This listing is NO LONGER active. ___________...
8,A beautiful home away from home for your stay ...
9,Please don’t expect the luxury here just a bas...
10,Ask about discounts for January to March! Ren...


In [45]:
# Pull the text column out as a Series and coerce it to str before tokenizing.
corpus_description = corpus['descriptions'].astype(str)

In [46]:
# Tokenize, filter and lemmatize each description; yields one token list per listing.
processed_corpus_description = preprocess_text(corpus_description)

In [47]:
# Build the term dictionary and the bag-of-words form of every description.
dictionary_description, doc_term_matrix_description = nlp_model_pipeline(processed_corpus_description)

In [48]:
# Fit a 3-topic LDA model with a single training pass (the function's default is 2).
# NOTE(review): the topic count is repeated in the add_topics_to_df call; keep the two in sync.
ldamodel_description = LDA_topic_modelling(doc_term_matrix_description, dictionary_description, num_topics=3, passes=1)

In [53]:
# Prepare the pyLDAvis visualisation data from the model, the bag-of-words corpus and the dictionary.
p = pyLDAvis.gensim.prepare(ldamodel_description, doc_term_matrix_description, dictionary_description)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [None]:
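# Topic interpretation (presumably as numbered in the pyLDAvis view; confirm): 1 = luxury, 2 = budget, 3 = location.
# NOTE(review): these labels may not line up with the 0-based topic ids returned by add_topics_to_df; verify before relying on them.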

In [55]:
# Write the prepared visualisation to an HTML file.
# NOTE(review): absolute, machine-specific output path.
pyLDAvis.save_html(p, '/Users/Xiao/Documents/lda.html')

In [50]:
# Assign each listing its most probable topic id. Despite the name, `new_df` holds the
# idxmax result returned by add_topics_to_df (indexed like nyc_desc), not a merged DataFrame.
new_df = add_topics_to_df(ldamodel_description, doc_term_matrix_description,nyc_desc, num_topics=3)

In [51]:
# Persist the per-listing topic assignment as CSV.
# NOTE(review): absolute, machine-specific output path.
new_df.to_csv("/Users/Xiao/Documents/nyc_topic.csv")

  """Entry point for launching an IPython kernel.
