### Importing Necessary Packages

In [None]:
# ! pip install gensim
# ! pip install spacy 
# ! pip install wordcloud
# ! pip install pyLDAvis
# ! pip install num2words

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from num2words import num2words
sns.set_style('whitegrid')
%matplotlib inline

In [None]:
# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from pprint import pprint

In [None]:
#pickle
import pickle

In [None]:
#LdaVis
import pyLDAvis.gensim_models

In [None]:
#nltk
import nltk
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
# from nltk import pos_tag
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
# Fetch the NLTK resources this notebook relies on: WordNet (lemmatizer),
# Punkt (tokenizers), the perceptron POS tagger, the tagset help files, and the
# stopword lists read by `stopwords.words('english')` during preprocessing.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
nltk.download('stopwords')

### Loading Fake_ Dataset

In [None]:
# Read the apartment listings CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='data/appartment_descriptions_eng.csv')

### Check for Na's

In [None]:
df.head()

In [None]:
# Report how many distinct (non-null) values each column holds.
for column_name, distinct_count in df.nunique().items():
    print(column_name, "--------->", distinct_count)

In [None]:
# Count the missing values in every column at once. Slicing with
# `df.loc[:col]` selects *rows* by label, so it cannot be used to inspect a column.
A = df.isnull().sum()
print(A)
print("Text length =", len(df.descr))

In [None]:
# Keep only the rows that have at least three non-null fields.
df = df.dropna(thresh=3)
df.shape

In [None]:
# Preserve the raw numeric-looking fields before the text columns are rewritten:
# rooms as-is, floor without the word 'floor', price without spaces.
df['num_rooms'] = df['rooms']
df['num_floor'] = df['floor'].map(lambda raw_floor: raw_floor.replace('floor', ''))
df['num_price'] = df['price'].map(lambda raw_price: raw_price.replace(' ', ''))

In [None]:
def replace_numbers_to_oridinals(text, category=None, wordify=True):
    """Spell out, or strip, every run of digits in ``text``.

    Args:
        text: String to process.
        category: num2words conversion type, forwarded as its ``to`` argument
            (e.g. ``'ordinal'``). ``None`` selects plain cardinal numbers;
            ``True`` is accepted as a synonym for ``'ordinal'``.
        wordify: When true, each digit run is replaced by the wording that
            num2words produces; when false, digit runs are removed.

    Returns:
        The processed string.
    """
    if not wordify:
        return re.sub(r"(\d+)", "", text)

    conversion = 'ordinal' if category is True else (category or 'cardinal')

    def _spell_out(match):
        # Select the conversion by keyword: passed positionally it would land on
        # num2words' boolean `ordinal` flag instead of choosing a conversion type.
        return num2words(int(match.group(0)), to=conversion)

    return re.sub(r"(\d+)", _spell_out, text)

In [None]:
# Normalise the listing columns into plain text:
#   addr  - every non-word character becomes a space
#   rooms - digits spelled out
#   ruler - digits spelled out, then 'mtwo' (what 'm2' turns into) -> 'square metres'
#   floor - digits spelled out as ordinals, commas become spaces
#   price - commas and spaces removed
#   descr - digits removed
df['addr'] = df['addr'].map(lambda value: re.sub(r'[^\w]', ' ', value))
df['rooms'] = df['rooms'].map(replace_numbers_to_oridinals)
df['ruler'] = df['ruler'].map(
    lambda value: replace_numbers_to_oridinals(value).replace('mtwo','square metres'))
df['floor'] = df['floor'].map(
    lambda value: replace_numbers_to_oridinals(value, category='ordinal').replace(',', ' '))
df['price'] = df['price'].map(lambda value: value.replace(',', '').replace(' ', ''))
df['descr'] = df['descr'].map(lambda value: replace_numbers_to_oridinals(value, wordify=False))

In [None]:
df.head()

### Preprocessing The Dataset

### NLTK Preprocessing

In [None]:
%%time
stemmer = SnowballStemmer('english')
stops = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
# Frozen copy of the stopword list so the token filter does O(1) lookups.
_stop_lookup = frozenset(stops)


def _clean_tokens(sentence, normalise):
    """Tokenize ``sentence``, drop stopwords and non-alphanumeric tokens,
    apply ``normalise`` to each remaining token and join them with spaces."""
    kept = [
        normalise(token)
        for token in word_tokenize(sentence)
        # Compare in lowercase: the stopword list is lowercase, so a
        # case-sensitive test would let capitalised stopwords ("The") through.
        if token.isalnum() and token.lower() not in _stop_lookup
    ]
    return ' '.join(kept)


def stemming_(Sentence):
    """Return ``Sentence`` with stopwords/punctuation removed and each token stemmed."""
    return _clean_tokens(Sentence, stemmer.stem)


def lemmatize_(Sentence):
    """Return ``Sentence`` with stopwords/punctuation removed and each token lemmatized."""
    return _clean_tokens(Sentence, lemmatizer.lemmatize)

In [None]:
%%time
# Create the two output columns up front, filled with a 0 placeholder.
for placeholder_column in ("stem_text", "lemm_text"):
    df[placeholder_column] = 0

In [None]:
%%time
# Derive the stemmed and the lemmatized variants of every description.
df['stem_text'] = df['descr'].map(stemming_)
df['lemm_text'] = df['descr'].map(lemmatize_)

In [None]:
df[["descr", "stem_text", "lemm_text"]].head()

In [None]:
# Concatenate the cleaned fields, separated by single spaces (with a trailing
# space), into the one document string used by every model below.
text_columns = ['addr', 'rooms', 'ruler', 'floor', 'price', 'lemm_text']
combined_text = df[text_columns[0]]
for text_column in text_columns[1:]:
    combined_text = combined_text + " " + df[text_column]
df['text'] = combined_text + " "

In [None]:
df.text.values.tolist()[0]

### WordCloud

In [None]:
# Join all documents into one comma-separated string and render it as a word cloud.
long_string2 = ','.join(df['text'].tolist())
wordcloud = WordCloud(
    background_color="white",
    width=500,
    height=500,
    max_words=5000,
    contour_width=20,
)
wordcloud.generate(long_string2)
wordcloud.to_image()

### Unigram and Trigram: 

In [None]:
def plot_15_most_common_words(count_data, count_vectorizer):
    """Plot a bar chart of the 15 terms with the largest summed weight.

    Args:
        count_data: Document-term matrix returned by the vectorizer's
            ``fit_transform`` (one row per document, one column per term).
        count_vectorizer: The fitted vectorizer that produced ``count_data``;
            only its feature names are read.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
    # get_feature_names(); prefer the new accessor, fall back on old installs.
    get_names = getattr(count_vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = count_vectorizer.get_feature_names
    words = list(get_names())

    # Column sums give every term's total over all documents in one pass,
    # without densifying the matrix row by row.
    total_counts = np.asarray(count_data.sum(axis=0), dtype=float).ravel()

    top_terms = sorted(zip(words, total_counts), key=lambda pair: pair[1], reverse=True)[:15]
    words = [term for term, _ in top_terms]
    counts = [total for _, total in top_terms]
    x_pos = np.arange(len(words))

    plt.figure(2, figsize=(15, 15/1.6180))
    plt.subplot(title='15 most common words')
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    # seaborn >= 0.12 accepts x and y as keyword arguments only.
    sns.barplot(x=x_pos, y=counts, palette='husl')
    plt.xticks(x_pos, words, rotation=90)
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()

In [None]:
# Two bag-of-words vectorizers: default single-word tokens, and word trigrams
# whose token pattern also keeps one-character tokens.
Unigram_vectorizer = CountVectorizer()
trigram_options = dict(ngram_range=(3, 3), token_pattern=r'\b\w+\b', min_df=1)
Trigram_vectorizer = CountVectorizer(**trigram_options)

In [None]:
%%time
# Learn each vocabulary from the combined text and build the document-term matrices.
Unigram_data = Unigram_vectorizer.fit_transform(df['text'])
Trigram_data = Trigram_vectorizer.fit_transform(df['text'])

## CountVectorizer - How it works

In [None]:
# Show every 100th vocabulary term. get_feature_names() was removed in
# scikit-learn 1.2, so use get_feature_names_out() when it is available.
_unigram_names = getattr(Unigram_vectorizer, 'get_feature_names_out', None) or Unigram_vectorizer.get_feature_names
[str(term) for term in _unigram_names()][::100]

In [None]:
# One accumulator slot per vocabulary term, i.e. per column of the
# document-term matrix (avoids get_feature_names(), removed in scikit-learn 1.2).
total = np.zeros(Unigram_data.shape[1])
print(total, len(total))
for t in Unigram_data:
    # Each `t` is a single-row sparse matrix; toarray()[0] is that document's
    # dense count vector, added term by term into the running total.
    total += t.toarray()[0]
print(total, len(total))

In [None]:
%%time
plot_15_most_common_words(Unigram_data, Unigram_vectorizer)

In [None]:
%%time
plot_15_most_common_words(Trigram_data, Trigram_vectorizer)

### TF-IDF on Unigrams, Bigrams and Trigrams

In [None]:
%%time
# TF-IDF weights over 1- to 3-word n-grams of the combined text, followed by a
# chart of the 15 terms with the highest summed weight.
tfidf_options = {
    'ngram_range': (1, 3),
    'token_pattern': r'\b\w+\b',
    'min_df': 1,
}
TF_Vectorizer = TfidfVectorizer(**tfidf_options)
Data = TF_Vectorizer.fit_transform(df['text'])
plot_15_most_common_words(Data, TF_Vectorizer)

In [None]:
import warnings
warnings.simplefilter("ignore", DeprecationWarning)
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [None]:
def print_topics(model, count_vectorizer, n_top_words):
    """Print the ``n_top_words`` highest-weighted terms of every topic.

    Args:
        model: Fitted topic model exposing ``components_`` (one weight row per topic).
        count_vectorizer: Fitted vectorizer whose feature names label the columns
            of ``components_``.
        n_top_words: Number of terms to print per topic.
    """
    # scikit-learn 1.2 removed get_feature_names(); prefer get_feature_names_out()
    # and fall back to the old accessor on installs that predate it.
    get_names = getattr(count_vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = count_vectorizer.get_feature_names
    words = get_names()

    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        # argsort is ascending, so walk it backwards to get the heaviest terms first.
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [None]:
number_topics = 3
number_words = 4

# Seed the model so the printed topics are reproducible on Restart & Run All
# (the same seed value is used for the gensim models further down).
lda = LDA(n_components=number_topics, n_jobs=None, random_state=76)
lda.fit(Data)

print("Topics found via LDA:")
print_topics(lda, TF_Vectorizer, number_words)

### More Detailed LDA 

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start, step):
    """Train one gensim LDA model per candidate topic count and score each one.

    Args:
        dictionary: gensim dictionary mapping token ids to tokens.
        corpus: Bag-of-words corpus built with ``dictionary``.
        texts: Tokenized documents used to compute the ``c_v`` coherence.
        limit: Exclusive upper bound of the topic counts to try.
        start: First topic count to try.
        step: Increment between consecutive topic counts.

    Returns:
        ``(model_list, coherence_values)``: the trained models and their
        coherence scores, both in the order of ``range(start, limit, step)``.
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus,
                                                num_topics=num_topics,
                                                id2word=dictionary,
                                                passes=25,
                                                alpha='auto',
                                                update_every=0,
                                                random_state=76)
        model_list.append(model)
        # Score against the `texts` argument rather than a notebook-level
        # variable, so the function works for whatever documents it is given.
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary,
                                        corpus=corpus, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

Gensim creates a unique id for each word in the document. The produced corpus shown above is a mapping of (word_id, word_frequency).

For example, (0, 1) above implies, word id 0 occurs once in the first document. Likewise, word id 1 occurs twice and so on.

This is used as the input by the LDA model.

If you want to see what word a given id corresponds to, pass the id as a key to the dictionary.

### How to find the optimal number of topics for LDA?

The overall coherence score of a topic is the average of the distances between words.

0.3 is bad

0.4 is low (you probably have the wrong number of topics)

0.55 is okay

0.65 might be as good as it is going to get

0.7 is nice

0.8 is unlikely and

0.9 is probably wrong

How to interpret this?

Topic 0 is represented as '0.014*"The" + 0.014*"said" + 0.006*"police" + 0.006*"law" + 0.005*"case"'.

The weights reflect how important a keyword is to that topic.

Looking at these keywords, can you guess what this topic could be?

So how do we interpret pyLDAvis’s output?

* Each bubble on the left-hand side plot represents a topic. The larger the bubble, the more prevalent is that topic.

* A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant.

* A model with too many topics will typically have many overlapping, small-sized bubbles clustered in one region of the chart.

* Salient keywords form the selected topic.


### Topic Modeling with POS tagging

In [None]:
def retrive_needed_words(text):
    """Keep only the tokens whose POS tag starts with NN, JJ, VB or RB.

    The text is tokenized and POS-tagged; every token whose tag begins with one
    of those two-letter prefixes is lowercased and kept, in original order.

    Returns:
        The kept tokens joined by single spaces.
    """
    wanted_prefixes = ('NN', 'JJ', 'VB', 'RB')
    tagged_tokens = pos_tag(word_tokenize(text))
    kept_tokens = []
    for token, tag in tagged_tokens:
        if tag[:2] in wanted_prefixes:
            kept_tokens.append(token.lower())
    return ' '.join(kept_tokens)

In [None]:
# Reduce every document to its content-bearing words and print each
# before/after pair for inspection.
simplified_text_data = []
for original_text in df['text']:
    reduced_text = retrive_needed_words(original_text)
    simplified_text_data.append(reduced_text)
    print("Normal text: ", original_text, '\nSimplified text: ', reduced_text)
    print()

In [None]:
# TF-IDF over 1- to 3-grams of the simplified documents, then the 25 terms with
# the largest summed weight (weights rounded to two decimals).
TF_Vectorizer_for_modelling = TfidfVectorizer(ngram_range = (1, 3))
Data = TF_Vectorizer_for_modelling.fit_transform(simplified_text_data)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old accessor on installs that predate it.
_modelling_names = (getattr(TF_Vectorizer_for_modelling, 'get_feature_names_out', None)
                    or TF_Vectorizer_for_modelling.get_feature_names)
words = list(_modelling_names())

# Column sums give each term's total weight without densifying row by row.
total_counts = np.asarray(Data.sum(axis=0), dtype=float).ravel()

count_dict = (zip(words, total_counts))
count_dict = sorted(count_dict, key=lambda x:x[1], reverse=True)[:25]
words = [w[0] for w in count_dict]
counts = [round(float(w[1]),2) for w in count_dict]

In [None]:
# Map each of the top terms to its rounded summed weight.
top_frequent_words = dict(zip(words, counts))

In [None]:
# Tokenize the simplified documents and build the gensim dictionary plus the
# bag-of-words corpus, i.e. a list of (token_id, count) pairs per document.
text_data = [word_tokenize(i) for i in simplified_text_data]

dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# Persist both artefacts; the context manager closes (and flushes) the pickle
# file instead of leaving the handle open.
with open('gensim/corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('gensim/dictionary.gensim')

In [None]:
# Slow: trains and scores one LDA model for every topic count in range(2, 6).
coherence_search = dict(
    dictionary=dictionary,
    corpus=corpus,
    texts=text_data,
    start=2,
    limit=6,
    step=1,
)
model_list, coherence_values = compute_coherence_values(**coherence_search)

In [None]:
# Topic counts start at 2, the `start` used for the coherence search. Sizing
# the range from the results keeps the labels in step with the scores.
x = range(2, 2 + len(coherence_values), 1)
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
%%time
# Train the final three-topic model, save it, and print the five
# highest-weighted words of every topic.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    alpha='auto',
    passes=30,
    random_state=76,
)
ldamodel.save('gensim/model.gensim')

topics = ldamodel.print_topics(num_words=5)
for topic_summary in topics:
    print(topic_summary)

In [None]:
%%time
# Reload the saved dictionary, corpus and model, then render the interactive
# pyLDAvis view of the topics.
dictionary = gensim.corpora.Dictionary.load('gensim/dictionary.gensim')
# Only unpickle files written by this notebook: pickle.load can execute
# arbitrary code from an untrusted file. The context manager closes the handle.
with open('gensim/corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('gensim/model.gensim')

lda_display = pyLDAvis.gensim_models.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

### Dominant Topic in Each Comment

In [None]:
def format_topics_sentences(ldamodel=ldamodel, corpus=corpus):
    """Tabulate the dominant topic of every document in ``corpus``.

    Args:
        ldamodel: Trained gensim LDA model (defaults to the notebook's model
            as it exists when this cell is run).
        corpus: Bag-of-words corpus to score (defaults to the notebook's corpus).

    Returns:
        DataFrame with one row per document that received at least one topic,
        in corpus order, with columns ``dominant_topic``,
        ``percentage_contribution`` (rounded to 4 decimals) and
        ``topic_keywords`` (the topic's words from ``show_topic``, comma separated).
    """
    column_names = ['dominant_topic', 'percentage_contribution', 'topic_keywords']
    keywords_by_topic = {}
    records = []

    for doc_topics in ldamodel[corpus]:
        if len(doc_topics) == 0:
            continue
        # max() returns the first highest-probability pair, the same element a
        # stable descending sort would put first.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num))
        records.append((int(topic_num), round(float(prop_topic), 4), keywords_by_topic[topic_num]))

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic.
    return pd.DataFrame(records, columns=column_names)

In [None]:
df_topic_sents_keywords = format_topics_sentences()

In [None]:
# Renumber the rows, give the columns their report names, then display the
# table ordered by dominant topic.
df_dominant_topic = (
    df_topic_sents_keywords
    .reset_index(drop=True)
    .set_axis(['dominant_topic', 'topic_percentage_contrib', 'keywords'], axis=1)
)

df_dominant_topic.sort_values(by=['dominant_topic'])

In [None]:
# `df` keeps its original row labels after dropna(), whereas df_dominant_topic
# is numbered 0..n-1 in document order. Merging on the index as-is would pair
# topics with the wrong listings (and drop some) whenever rows were removed, so
# give the topic table df's index first (row i of one describes row i of the other).
new_df = pd.merge(df, df_dominant_topic.set_index(df.index), left_index=True, right_index=True)

In [None]:
new_df.to_csv("data/appartment_descriptions_eng_with_coordinates.csv")

In [None]:
new_df.head(5)