In [None]:
# ! pip install pyLDAvis


In [None]:
# import libraries  
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re, random, os
import string, pprint
import matplotlib.pyplot as plt
import seaborn as sns
import spacy

# gensim for LDA 
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
# import pyLDAvis.gensim  # don't skip this
import pyLDAvis.gensim_models

#from pyLDAvis import gensim_models as pg
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from collections import Iterable


In [None]:
# Load the Airbnb Texas rental listings and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="Airbnb_Texas_Rentals.csv")
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,average_rate_per_night,bedrooms_count,city,date_of_listing,description,latitude,longitude,title,url
0,1,$27,2,Humble,May 2016,Welcome to stay in private room with queen bed...,30.020138,-95.293996,2 Private rooms/bathroom 10min from IAH airport,https://www.airbnb.com/rooms/18520444?location...
1,2,$149,4,San Antonio,November 2010,"Stylish, fully remodeled home in upscale NW – ...",29.503068,-98.447688,Unique Location! Alamo Heights - Designer Insp...,https://www.airbnb.com/rooms/17481455?location...
2,3,$59,1,Houston,January 2017,'River house on island close to the city' \nA ...,29.829352,-95.081549,River house near the city,https://www.airbnb.com/rooms/16926307?location...
3,4,$60,1,Bryan,February 2016,Private bedroom in a cute little home situated...,30.637304,-96.337846,Private Room Close to Campus,https://www.airbnb.com/rooms/11839729?location...
4,5,$75,2,Fort Worth,February 2017,Welcome to our original 1920's home. We recent...,32.747097,-97.286434,The Porch,https://www.airbnb.com/rooms/17325114?location...


In [None]:
# tokenize using gensim simple_preprocess
def sent_to_words(sentences, deacc=True):
    """Lazily tokenize each document with gensim's ``simple_preprocess``.

    Args:
        sentences: iterable of raw documents; every item is coerced with
            ``str()`` before tokenizing.
        deacc: forwarded to ``simple_preprocess``. When true, accent marks are
            stripped from the tokens (punctuation is discarded by the
            tokenizer regardless of this flag).

    Yields:
        list[str]: the tokens of one document, in input order.
    """
    for sentence in sentences:
        # `deacc` used to be accepted but silently ignored; pass it through so
        # the flag actually controls accent stripping.
        yield gensim.utils.simple_preprocess(str(sentence), deacc=deacc)


# convert to list
# Missing descriptions are dropped first: a NaN would otherwise be turned into
# the string "nan" by str() and end up in the vocabulary as a real word.
data = df['description'].dropna().tolist()
data_words = list(sent_to_words(data))

# create list of stop words (NLTK English stop words plus single punctuation characters)
stop_words = stopwords.words('english') + list(string.punctuation)

# functions for removing stopwords and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document and drop every token found in ``stop_words``.

    Args:
        texts: iterable of documents; each one is passed through ``str()`` and
            ``simple_preprocess`` before filtering.

    Returns:
        list[list[str]]: one filtered token list per input document, in order.
    """
    # Build the lookup once per call: the module-level `stop_words` is a list,
    # so testing membership against it directly scans it for every token.
    blocked = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in blocked] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Every token list in ``texts`` is joined with single spaces and run through
    the module-level spaCy pipeline ``nlp``; the result for that document is the
    list of ``lemma_`` values of the tokens whose ``pos_`` is in
    ``allowed_postags``.

    Tag reference: https://spacy.io/api/annotation
    """
    def _kept_lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_kept_lemmas(tokens) for tokens in texts]
  
# strip stop words from the tokenized descriptions
data_words_nostops = remove_stopwords(data_words)

# load the small English spaCy pipeline with the parser and NER components disabled
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# lemmatize, keeping nouns, adjectives, verbs and adverbs only
data_lemmatized = lemmatization(
    data_words_nostops,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

# build the lemma dictionary
id2word = corpora.Dictionary(data_lemmatized)

# bag-of-words representation of every document
corpus = list(map(id2word.doc2bow, data_lemmatized))

In [None]:
# Hyperparameter tuning - number of topics (alpha can be set per sweep)

def compute_coherence_values(dictionary, corpus, texts, num_topics_range,
                             passes=10, alpha='symmetric', random_state=100):
    """Train one LDA model per candidate topic count and score its coherence.

    Args:
        dictionary: gensim dictionary, passed as ``id2word`` to the model and as
            ``dictionary`` to the coherence scorer.
        corpus: bag-of-words corpus the models are trained on.
        texts: tokenized documents used by the 'c_v' coherence measure.
        num_topics_range: iterable of topic counts to try.
        passes: number of training passes for every model (default 10, the
            value that was previously hard-coded).
        alpha: document-topic prior forwarded to ``LdaModel``. The default
            'symmetric' is gensim's own default, i.e. what was used implicitly
            before; call the function once per alpha value to compare priors.
        random_state: seed forwarded to ``LdaModel`` so the scores can be
            reproduced and compared between runs. Pass ``None`` to get the old
            unseeded behaviour.

    Returns:
        tuple: ``(model_list, coherence_values)`` where ``model_list`` holds the
        trained models in the order of ``num_topics_range`` and
        ``coherence_values`` is a list of ``(num_topics, coherence)`` pairs in
        the same order.
    """
    model_list = []
    coherence_values = []
    for num_topics in num_topics_range:
        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                    id2word=dictionary,
                                                    num_topics=num_topics,
                                                    passes=passes,
                                                    alpha=alpha,
                                                    # without a seed every run ranks the
                                                    # candidates slightly differently
                                                    random_state=random_state,
                                                    per_word_topics=True)
        model_list.append(lda_model)

        coherence_model = CoherenceModel(model=lda_model, texts=texts,
                                         dictionary=dictionary, coherence='c_v')
        coherence_values.append((num_topics, coherence_model.get_coherence()))

    return model_list, coherence_values
    
# candidate topic counts to compare
num_topics_range = [2, 4, 6, 8, 10, 15]

# train one model per candidate and collect its coherence score
model_list, coherence_values = compute_coherence_values(
    id2word,
    corpus,
    data_lemmatized,
    num_topics_range,
)

# tabulate the scores, best first
coherence_df = pd.DataFrame(data=coherence_values, columns=['num_topics', 'coherence_value'])
coherence_df.sort_values(by='coherence_value', ascending=False)

Unnamed: 0,num_topics,coherence_value
1,4,0.603839
3,8,0.588813
2,6,0.571889
4,10,0.553117
0,2,0.541754
5,15,0.505937


In [None]:
# Build the final LDA model with 4 topics, the count that scored the highest
# coherence in the comparison above.
Final_LDA_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=4,
    passes=10,
    chunksize=100,
    update_every=1,
    alpha='auto',
    random_state=100,
    per_word_topics=True,
)

# show the top-weighted words of every topic
pprint.pprint(Final_LDA_model.print_topics())


[(0,
  '0.031*"beach" + 0.028*"pool" + 0.022*"view" + 0.019*"enjoy" + 0.018*"relax" '
  '+ 0.014*"water" + 0.014*"beautiful" + 0.013*"patio" + 0.012*"deck" + '
  '0.011*"fishing"'),
 (1,
  '0.048*"room" + 0.042*"bedroom" + 0.039*"private" + 0.038*"bed" + '
  '0.025*"kitchen" + 0.022*"bathroom" + 0.021*"full" + 0.021*"bath" + '
  '0.014*"living" + 0.014*"area"'),
 (2,
  '0.043*"place" + 0.033*"downtown" + 0.031*"close" + 0.030*"minute" + '
  '0.029*"restaurant" + 0.021*"good" + 0.021*"neighborhood" + 0.021*"business" '
  '+ 0.020*"family" + 0.020*"couple"'),
 (3,
  '0.055*"home" + 0.019*"locate" + 0.017*"stay" + 0.013*"perfect" + '
  '0.012*"away" + 0.011*"guest" + 0.010*"need" + 0.010*"mile" + 0.009*"enjoy" '
  '+ 0.009*"house"')]


In [None]:
# render the interactive pyLDAvis view of the final model inside the notebook
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(topic_model=Final_LDA_model, corpus=corpus, dictionary=id2word)
vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [None]:
"""Topic Modeling"""

# ! pip install pyLDAvis

# import libraries  
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re, random, os
import string, pprint
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from nltk.corpus import stopwords

# gensim for LDA 
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models # don't skip this

#from pyLDAvis import gensim_models as pg
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

class TopicModelling():
  """Fit a gensim LDA topic model on one text column of a DataFrame.

  Topic modeling is an unsupervised machine learning technique that's capable of
  scanning a set of documents, detecting word and phrase patterns within them,
  and automatically clustering word groups and similar expressions that best
  characterize a set of documents.

  Args:
    df : DataFrame holding the documents
    text_column : name of the column that contains the text
    num_topics : number of topics to extract
    passes : number of training passes over the corpus (default 10)
    chunksize : number of documents per training chunk (default 2000, which is
      gensim's own default, so results are unchanged unless it is set)

  After ``run_all`` the intermediate artifacts are available as
  ``data_lemmatized``, ``id2word``, ``corpus`` and ``lda_model`` so the model
  can be scored or visualized without repeating the preprocessing.

  Returns (from ``run_all``):
    model : trained LDA topic model
  """

  def __init__(self, df, text_column, num_topics, passes=10, chunksize=2000):
    """Store the data source and the training settings; nothing is computed here."""
    self.df = df
    self.text_column = text_column
    self.num_topics = num_topics
    self.passes = passes
    self.chunksize = chunksize
    # filled in by run_all()
    self.data_lemmatized = None
    self.id2word = None
    self.corpus = None
    self.lda_model = None

  def sent_to_words(self, sentences, deacc=True):
    """Lazily tokenize each document with gensim's ``simple_preprocess``.

    ``deacc`` is forwarded to ``simple_preprocess`` (accent stripping); it used
    to be accepted but ignored. Every item is coerced with ``str()`` first.
    """
    for sentence in sentences:
      yield gensim.utils.simple_preprocess(str(sentence), deacc=deacc)

  def remove_stopwords(self, texts, stop_words):
    """Re-tokenize each document and drop every token found in ``stop_words``.

    Returns one filtered token list per input document, in order.
    """
    # a set makes each membership test constant-time instead of a list scan
    blocked = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in blocked] for doc in texts]

  # perform the lemmatization
  def lemmatization(self, texts, spacy_en_model, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each token list is joined with spaces and run through ``spacy_en_model``;
    the lemmas of tokens whose ``pos_`` is in ``allowed_postags`` are kept.
    Tag reference: https://spacy.io/api/annotation
    """
    texts_out = []
    for sent in texts:
      doc = spacy_en_model(" ".join(sent))
      texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

  def run_all(self):
    """Run the whole pipeline and return the trained LDA model.

    Steps: tokenize -> remove stop words -> lemmatize -> build dictionary and
    bag-of-words corpus -> train the model. The lemmas, dictionary, corpus and
    model are also stored on the instance.
    """
    # Missing texts are dropped first: a NaN would otherwise be turned into the
    # string "nan" by str() and end up in the vocabulary as a real word.
    data = self.df[self.text_column].dropna().tolist()
    data_words = list(self.sent_to_words(data))

    # NLTK English stop words plus the single punctuation characters from the
    # 'string' module
    stop_words = stopwords.words('english') + list(string.punctuation)

    # remove stop words
    data_words_nostops = self.remove_stopwords(data_words, stop_words)

    # initialize the spaCy English model with the parser and NER disabled
    # (install it with: python -m spacy download en_core_web_sm)
    spacy_en_model = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    # lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = self.lemmatization(data_words_nostops, spacy_en_model,
                                         allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # create dictionary and bag-of-words corpus
    id2word = corpora.Dictionary(data_lemmatized)
    corpus = [id2word.doc2bow(text) for text in data_lemmatized]

    # keep the artifacts so callers can compute coherence or feed pyLDAvis
    self.data_lemmatized = data_lemmatized
    self.id2word = id2word
    self.corpus = corpus

    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word,
                                                num_topics=self.num_topics,
                                                random_state=100, update_every=1,
                                                chunksize=self.chunksize,
                                                passes=self.passes, alpha='auto',
                                                per_word_topics=True)
    self.lda_model = lda_model

    return lda_model


# load the listings and fit a 4-topic model on the `description` column (10 passes)
df = pd.read_csv("Airbnb_Texas_Rentals.csv")

topicModelling = TopicModelling(df=df, text_column='description', num_topics=4, passes=10)
lda_model = topicModelling.run_all()

# show the top-weighted words of every topic
pprint.pprint(lda_model.print_topics())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[(0,
  '0.039*"room" + 0.031*"bed" + 0.031*"bedroom" + 0.026*"private" + '
  '0.021*"kitchen" + 0.020*"bathroom" + 0.019*"full" + 0.013*"bath" + '
  '0.012*"home" + 0.012*"size"'),
 (1,
  '0.015*"stay" + 0.014*"guest" + 0.010*"available" + 0.010*"night" + '
  '0.009*"day" + 0.009*"time" + 0.009*"home" + 0.007*"need" + 0.007*"house" + '
  '0.006*"book"'),
 (2,
  '0.051*"place" + 0.031*"close" + 0.026*"restaurant" + 0.024*"downtown" + '
  '0.023*"couple" + 0.022*"good" + 0.022*"business" + 0.021*"traveler" + '
  '0.021*"minute" + 0.019*"family"'),
 (3,
  '0.024*"home" + 0.014*"enjoy" + 0.013*"locate" + 0.013*"beach" + '
  '0.012*"view" + 0.010*"beautiful" + 0.010*"pool" + 0.010*"minute" + '
  '0.009*"relax" + 0.009*"private"')]
