In [1]:
from pathlib import Path
import pandas as pd
import gensim, spacy
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from joblib import Memory
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim_models
import warnings
# NOTE(review): no category or module filter is given, so this silences every
# warning for the rest of the session, deprecation warnings included —
# consider narrowing it to the specific warnings that are known to be noise.
warnings.filterwarnings("ignore")

In [2]:
# Project root: the parent of the current working directory.
BASE_DIR = Path.cwd().parent
# English stop-word list from NLTK (the NLTK "stopwords" corpus must be installed locally).
stop_words = stopwords.words('english')
# joblib on-disk cache; 'cache' is resolved relative to the current working directory.
cachedir = 'cache'
memory = Memory(cachedir, verbose=0)

In [3]:
# Load the cleaned host-review table from the staging area.
df_hosts_reviews_en = pd.read_parquet(BASE_DIR / 'staging_data' / 'hosts_reviews_en_cleaned.parquet')

In [4]:
def sent_to_words(sentences):
    """Lazily tokenize every entry of ``sentences``.

    Each entry is coerced to ``str`` and run through gensim's
    ``simple_preprocess`` with ``deacc=True``. One token list is yielded per
    entry, in input order; nothing is computed until the generator is consumed.
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        for raw_text in sentences
    )

# Pull the review texts out of the 'comments' column as a plain Python list.
data = df_hosts_reviews_en['comments'].values.tolist()
# Tokenize every review up front (list() consumes the sent_to_words generator).
data_words = list(sent_to_words(data))
# Sanity check: show the tokens of the second review.
print(data_words[1])

['nice', 'flat', 'great', 'area', 'host', 'sorted', 'things', 'make', 'stay', 'smooth', 'easy', 'leaving', 'key', 'neighbours', 'etc', 'thanks']


In [5]:
# spaCy English pipeline; used by process_words_cached for POS tags and lemmas.
nlp = spacy.load("en_core_web_sm")
# Phrase models learned from the tokenized reviews.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
# The trigram model is trained on the bigram-transformed corpus, not on the raw tokens.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
# Phraser wrappers around the trained models; these are what the preprocessing step applies.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# !python3 -m spacy download en_core_web_sm  # run once in a terminal; this is the model passed to spacy.load
@memory.cache
def process_words_cached(texts, stop_words=stop_words, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Form bigrams/trigrams, lemmatize, then drop stop words.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents.
    stop_words : iterable of str
        Words removed from the lemmatized output. Defaults to the
        module-level ``stop_words`` as it was when this function was defined.
    allowed_postags : collection of str
        Values of spaCy's ``token.pos_`` to keep; tokens with any other tag
        are discarded before lemmas are collected.

    Returns
    -------
    list of list of str
        One list of cleaned tokens per input document, in input order.

    Notes
    -----
    Stop words are only removed *after* lemmatization (the pre-phrasing
    removal is not performed). Results are memoized on disk through
    ``memory.cache``. The function also reads the module-level ``bigram_mod``,
    ``trigram_mod`` and ``nlp`` objects, which are not arguments.
    NOTE(review): presumably the cache key does not cover those globals, so
    clear the cache after retraining the phrase models or changing the spaCy
    pipeline — confirm against joblib's behaviour.
    """
    # Hash-based membership tests instead of scanning a list for every token.
    stop_set = set(stop_words)
    allowed = set(allowed_postags)

    # NOTE(review): bigram_mod is applied here and again inside the trigram
    # step, exactly as before; the second application looks redundant —
    # confirm before removing it.
    texts = [bigram_mod[doc] for doc in texts]
    phrased = (" ".join(trigram_mod[bigram_mod[doc]]) for doc in texts)

    texts_out = []
    # nlp.pipe streams the documents through spaCy in batches instead of
    # invoking the full pipeline once per document.
    for parsed in nlp.pipe(phrased):
        lemmas = [token.lemma_ for token in parsed if token.pos_ in allowed]
        # simple_preprocess re-tokenizes the stringified lemma list (applying
        # gensim's own normalisation) before the stop-word filter runs.
        texts_out.append(
            [word for word in simple_preprocess(str(lemmas)) if word not in stop_set]
        )
    return texts_out

# Run the preprocessing; because the function is wrapped in memory.cache,
# a repeated call with the same input is served from the on-disk cache.
data_ready = process_words_cached(data_words)

In [6]:
# Vocabulary built from the cleaned, tokenized reviews.
id2word = corpora.Dictionary(data_ready)

# Bag-of-words encoding of every document against that vocabulary.
corpus = list(map(id2word.doc2bow, data_ready))

# Fit a 3-topic LDA model with a fixed random seed.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=3,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='symmetric',
    iterations=100,
    per_word_topics=True,
)

# Cell output: the highest-weighted terms of each topic.
lda_model.print_topics()

[(0,
  '0.088*"host" + 0.052*"stay" + 0.049*"great" + 0.037*"place" + 0.036*"apartment" + 0.033*"location" + 0.026*"recommend" + 0.025*"nice" + 0.022*"clean" + 0.019*"really"'),
 (1,
  '0.038*"close" + 0.033*"walk" + 0.029*"apartment" + 0.028*"station" + 0.025*"city" + 0.025*"restaurant" + 0.023*"minute" + 0.020*"nice" + 0.017*"quiet" + 0.015*"area"'),
 (2,
  '0.019*"bed" + 0.017*"room" + 0.016*"kitchen" + 0.013*"small" + 0.012*"good" + 0.011*"apartment" + 0.009*"bathroom" + 0.009*"also" + 0.009*"work" + 0.008*"get"')]

In [7]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    """Build a table holding the dominant topic of every document.

    Parameters
    ----------
    ldamodel : trained topic model
        Must support ``ldamodel[corpus]`` and ``show_topic(topic_id)`` and
        expose a ``per_word_topics`` attribute. Required, despite the
        ``None`` default kept for backward compatibility.
    corpus : iterable
        Corpus handed to ``ldamodel[...]``. Defaults to the module-level
        ``corpus`` as it was when this function was defined.
    texts : sequence
        One entry per document, attached as the ``Text`` column. Defaults to
        the module-level ``data`` as it was when this function was defined.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_topic``, ``Percent_contribution``,
        ``Topic_keywords`` and ``Text``, indexed by document position.

    Raises
    ------
    TypeError
        If ``ldamodel`` is ``None``.
    """
    if ldamodel is None:
        raise TypeError("format_topics_sentences() requires a trained `ldamodel`")

    keywords_by_topic = {}  # topic id -> "kw1, kw2, ..." (computed once per topic)
    doc_positions = []
    records = []

    for doc_pos, row_list in enumerate(ldamodel[corpus]):
        # With per_word_topics enabled the model returns a tuple whose first
        # element is the document-level topic distribution.
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            # No topic reported for this document: skip it, but keep the
            # positions of later documents intact so they still line up
            # with `texts` in the merge below.
            continue

        # First highest-probability topic (same tie-breaking as a stable
        # descending sort followed by taking the first element).
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)

        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )

        doc_positions.append(doc_pos)
        records.append(
            (topic_num, round(float(prop_topic), 3), keywords_by_topic[topic_num])
        )

    # Build the frame in one go: DataFrame.append no longer exists in
    # pandas >= 2.0, and growing a frame row by row is quadratic.
    sent_topics_df = pd.DataFrame(
        records,
        columns=['Dominant_topic', 'Percent_contribution', 'Topic_keywords'],
        index=doc_positions,
    )

    # Add original text to the end of the output
    contents = pd.Series(texts, name='Text')
    return sent_topics_df.merge(contents, left_index=True, right_index=True)


# Label every review with its dominant topic. `texts=data_ready` is passed, so the
# 'Text' column holds the preprocessed token lists rather than the raw review strings.
df_dominant_topic = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_ready)
# Rename the columns to the names used in the exported table.
df_dominant_topic.columns = ['Dominant_topic', 'Topic_percent_contribution', 'Keywords', 'Text']
df_dominant_topic.head()

Unnamed: 0,Dominant_topic,Topic_percent_contribution,Keywords,Text
0,1,0.477,"close, walk, apartment, station, city, restaur...","[great, little, apartment, perfect, spot, town..."
1,0,0.589,"host, stay, great, place, apartment, location,...","[nice, flat, great, area, host, sort, thing, m..."
2,2,0.587,"bed, room, kitchen, small, good, apartment, ba...","[great, location, host, responsive, email, cou..."
3,1,0.443,"close, walk, apartment, station, city, restaur...","[host, place, super, great, close, quite, cent..."
4,0,0.562,"host, stay, great, place, apartment, location,...","[nice, apartment, great, location, great, pric..."


In [8]:
# Persist the per-review topic assignments. The output directory is created
# first so that a run on a fresh checkout does not fail on a missing folder.
(BASE_DIR / 'processed_data').mkdir(parents=True, exist_ok=True)
df_dominant_topic.to_parquet(BASE_DIR / 'processed_data' / 'hosts_reviews_en_topics.parquet')

In [9]:
# Switch pyLDAvis to notebook mode so the result renders as the cell output.
pyLDAvis.enable_notebook()
# Prepare the visualisation data from the fitted model, its corpus and its dictionary.
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis