# Latent Dirichlet Allocation
---
  
**LDA**  
- Generative Probabilistic Topic Model. 
- Assumes documents are a mixture of topics and that each word in the document is attributable to a topic  
- The topic probabilities provide an explicit representation of a document

## Env Preparation

In [1]:
import os
import sys
from os.path import join as JP

# Work from the container's project folder so the relative paths used
# further down ('config.yaml', 'data', ...) resolve against it.
os.chdir('/home/jovyan/work/')
print('Workdir: ', os.getcwd())

# Put the working directory and its 'utils' / 'scripts' sub-folders on the import path.
sys.path.append(os.getcwd())
for subfolder in ('utils', 'scripts'):
    sys.path.append(JP(os.getcwd(), subfolder))

Workdir:  /home/jovyan/work


In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Register tqdm's pandas integration (provides the `progress_apply` used further down).
tqdm.pandas()
# IPython magic selecting the interactive 'notebook' matplotlib backend.
%matplotlib notebook
# Print floats with three decimals in NumPy and pandas output
# (the pandas format also inserts a thousands separator).
np.set_printoptions(precision=3)
pd.options.display.float_format = '{:,.3f}'.format

In [3]:
import pickle
from pprint import pprint
from collections import defaultdict

from utils.nlp_utils import preproces
from utils.general import parse_yaml, ensure_directories

from scripts.catalog import (
    Catalog, Document, Corpus,
    load_catalog, load_corpus)

# Read the project settings; the 'paths' entry is indexed with 'checkpoints'
# further down when models are saved and loaded.
config = parse_yaml('config.yaml')
paths = config['paths']
# presumably creates the configured directories when missing -- TODO confirm in utils.general
ensure_directories(paths)

ModuleNotFoundError: No module named 'utils'

## Data Preparation

In [None]:
# Load data/bbc-text.csv and store, per row, the number of space-separated
# chunks of its text (the column name is kept exactly as spelled).
raw_csv_path = JP('data','bbc-text.csv')
data = pd.read_csv(raw_csv_path)
data['lenght'] = data.text.apply(lambda text: len(text.split(' ')))
data.head()

In [None]:
import spacy
from spacy import displacy

nlp = spacy.load('en_core_web_sm') # spaCy's 'en_core_web_sm' English pipeline; applied to every article below

def spacy_cleaning(
    document,
    tags_to_keep=('JJ', 'NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'),
    entities_to_remove=('ORG', 'NORP', 'GPE', 'PERSON')):
    """Reduce a parsed document to a string of lower-cased lemmas.

    A token is kept only when its fine-grained tag (`tag_`) is in
    `tags_to_keep`, it is neither punctuation nor a stop word, and its
    entity label (`ent_type_`) is not in `entities_to_remove`.

    Args:
        document: iterable of token-like objects exposing `tag_`,
            `is_punct`, `is_stop`, `ent_type_`, `lemma_` and `lower_`.
        tags_to_keep: iterable of tags to retain.
        entities_to_remove: iterable of entity labels to drop.  A single
            string or comma-separated entries (e.g. ['ORG,NORP']) are
            also accepted and split into individual labels.

    Returns:
        The surviving tokens, lemmatised and lower-cased, joined by a
        single space ('' when nothing survives).
    """
    allowed_tags = set(tags_to_keep)

    if isinstance(entities_to_remove, str):
        entities_to_remove = [entities_to_remove]
    banned_entities = {
        label.strip()
        for entry in entities_to_remove
        for label in entry.split(',')}
    # Tokens outside any entity carry an empty label; never treat '' as banned.
    banned_entities.discard('')

    def pass_test(w):
        if w.tag_ not in allowed_tags or w.is_punct or w.is_stop:
            return False
        return w.ent_type_ not in banned_entities

    words = [word for word in document if pass_test(word)]
    # Tokens lemmatised to the "-PRON-" placeholder keep their surface form.
    tokens = [
        word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_
        for word in words]
    return ' '.join(tokens)

In [None]:
# Parse and clean one article at a time, so the intermediate spaCy objects are
# not all kept alive in a temporary Series (single pass, single progress bar).
data['processed'] = data['text'].progress_apply(
    lambda text: spacy_cleaning(nlp(text)))
# `to_csv` is called without `index=False`, so the index is written as the first column.
data.to_csv(JP('data','bbc-text-processed.csv'))

In [None]:
# Reload the processed corpus; `.iloc[:,1:]` discards the first CSV column
# (presumably the index written by `to_csv` -- confirm against the file).
data = pd.read_csv(JP('data','bbc-text-processed.csv')).iloc[:,1:]
data.head()

In [None]:
# Document count plus total / distinct space-separated words, first for the
# raw text column and then for the cleaned one.
for heading, column in (('\nBefore Processing', 'text'),
                        ('\nAfter Processing', 'processed')):
    print(heading)
    doc_list = list(data[column])
    print('Total documents: ', len(doc_list))
    docs_toguether = ' '.join(doc_list)
    all_words = docs_toguether.split(' ')
    unique_words = np.unique(all_words)
    print('All words: {}. Unique words: {}'.format(len(all_words), len(unique_words)))

## TFIDF

In [None]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Vocabulary size cap, passed as `max_features` to the TF-IDF vectorizer below.
EMBED_SIZE = 10000 
# One cluster per distinct value of the 'category' column.
NUM_CLUSTERS = data['category'].nunique()
# NOTE(review): not referenced anywhere else in the visible notebook.
WORDS_PER_CLUSTER = None
print(NUM_CLUSTERS)

In [None]:
# One Document per row, carrying that row's cleaned text.  Iterating the column
# directly (instead of `data['processed'][d]` with d in range(len(...))) avoids
# label-based lookups, so it also works when the index is not 0..n-1.
documents = []
for processed_text in data['processed']:
    document = Document()
    document.processed_text = processed_text
    documents.append(document)

In [None]:
# Wrap the documents in a Catalog, the project container used below to
# collect the corpus and build the TF-IDF matrix.
catalog = Catalog()
catalog.documents = documents

In [None]:
# TF-IDF settings gathered in one mapping and unpacked into the vectorizer.
tfidf_settings = dict(
    min_df=.05,                # lower document-frequency cut-off
    max_df=.8,                 # upper document-frequency cut-off
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    max_features=EMBED_SIZE,   # vocabulary size cap
    ngram_range=(1,3),         # unigrams up to trigrams
    lowercase=True,
    stop_words=stopwords.words('english'),
)
vectorizer = TfidfVectorizer(**tfidf_settings)

In [None]:
# presumably gathers the `processed_text` of every document as a list -- TODO
# confirm in scripts.catalog; the return value is deliberately discarded.
_ = catalog.collect_corpus(attr='processed_text', form=list)
# Build the TF-IDF model with the vectorizer configured above.
# presumably `representation` is a documents x terms DataFrame -- TODO confirm
tfidf = catalog.to_matrix(
    vectorizer=vectorizer,
    modelname='TFIDF',
    max_docs=None)
print(tfidf.representation.shape)
tfidf.representation.head()

## Sklearn Topic Modelling

- 1: Latent Dirichlet Allocation
- 2: Non Negative Matrix Factorization

In [None]:
# Use one topic per distinct category for the scikit-learn models.
NUM_TOPICS = NUM_CLUSTERS

In [None]:
# Function for printing the keywords of each topic
def selected_topics(model, vectorizer, top_n=10):
    """Print the `top_n` highest-weighted terms of every topic in `model`.

    Args:
        model: fitted decomposition exposing `components_`, one row of
            term weights per topic.
        vectorizer: fitted vectorizer exposing `get_feature_names_out()`
            or, on older scikit-learn releases, `get_feature_names()`.
        top_n: number of terms printed per topic.
    """
    # Resolve the vocabulary once rather than once per printed word, and
    # prefer the current accessor while still supporting the legacy one.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending: take the last `top_n` indices, highest first.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        # str() keeps the printed list free of NumPy scalar wrappers.
        print([str(feature_names[i]) for i in top_indices])

### Latent Dirichlet Allocation

In [None]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
# LDA with one topic per category, trained with the 'online' learning method.
# NOTE(review): no `random_state` is passed, so the fitted topics (and their
# order) may differ between runs -- confirm whether reproducibility is needed.
lda = LatentDirichletAllocation(
    n_components=NUM_TOPICS, 
    max_iter=10, 
    learning_method='online',
    verbose=True)
lda


In [None]:
# Fit LDA on the TF-IDF matrix; `data_lda` is whatever `fit_transform`
# returns for the documents (their topic weights).
print('Latent Dirichlet Allocation')
data_lda = lda.fit_transform(tfidf.representation)

In [None]:
# Print the 8 highest-weighted terms of each Latent Dirichlet Allocation topic
print("LDA Model:")
selected_topics(lda, vectorizer, 8)

**Save Model**

In [None]:
# Pickle the fitted scikit-learn LDA model into the configured checkpoints folder.
with open(JP(paths['checkpoints'], 'lda_sklearn.pkl'), 'wb') as obj:
    pickle.dump(lda,obj)

**Load Model**

In [None]:
# Restore the pickled model.  Unpickling can execute arbitrary code, so only
# open checkpoint files that were written by this notebook.
with open(JP(paths['checkpoints'], 'lda_sklearn.pkl'), 'rb') as obj:
    lda = pickle.load(obj)
lda

#### Visualization

In [None]:
import pyLDAvis
# NOTE(review): newer pyLDAvis releases appear to have renamed this submodule
# (to `pyLDAvis.lda_model`) -- confirm against the installed version.
import pyLDAvis.sklearn
# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

In [None]:
# Interactive topic view for the scikit-learn LDA model, laid out with t-SNE.
# NOTE(review): the third argument is expected to be the fitted vectorizer;
# confirm that `tfidf.mapping` is that object (project attribute, not visible here).
pyLDAvis.sklearn.prepare(lda, tfidf.representation, tfidf.mapping, mds='tsne')

## Non-Negative Matrix Factorization

In [None]:
# NMF with the same number of components as the LDA model above.
nmf = NMF(n_components=NUM_TOPICS)
nmf

In [None]:
# Fit NMF on the same TF-IDF matrix; `data_nmf` is what `fit_transform`
# returns for the documents.
print('Non-Negative Matrix Factorization')
data_nmf = nmf.fit_transform(tfidf.representation)

In [None]:
# Print the 8 highest-weighted terms of each Non-Negative Matrix Factorization component
print("NMF Model:")
selected_topics(nmf, vectorizer, 8)

**Save Model**

In [None]:
# Pickle the fitted NMF model into the configured checkpoints folder.
with open(JP(paths['checkpoints'], 'NNMF.pkl'), 'wb') as obj:
    pickle.dump(nmf,obj)

**Load Model**

In [None]:
# Restore the pickled model.  Unpickling can execute arbitrary code, so only
# open checkpoint files that were written by this notebook.
with open(JP(paths['checkpoints'], 'NNMF.pkl'), 'rb') as obj:
    nmf = pickle.load(obj)
nmf

## Gensim

In [None]:
from gensim import corpora, models, similarities 

In [None]:
# Rebinds NUM_TOPICS (set from NUM_CLUSTERS earlier) to a fixed 5 for the gensim models.
NUM_TOPICS = 5

In [None]:
# Split each cleaned text on whitespace into a list of tokens.
data['processed_token'] = data['processed'].apply(lambda t: t.split())
data['processed_token']

In [None]:
# Create a Gensim dictionary from the texts
dictionary = corpora.Dictionary(data['processed_token']) 
# NOTE(review): gensim documents `no_below` as an absolute number of documents
# and `no_above` as a fraction of the corpus; if so, 0.1 removes nothing at the
# low end -- confirm whether a document count was intended.
dictionary.filter_extremes(no_below=0.1, no_above=0.7)
# Bag-of-words representation of every document, built with the filtered dictionary.
corpus = [dictionary.doc2bow(text) for text in data['processed_token']]

In [None]:
import time
start = time.time()
# gensim LDA trained on the bag-of-words corpus: 300 passes, chunks of 1000 documents.
ldaModel = models.LdaModel(
    corpus, 
    num_topics=NUM_TOPICS,
    id2word=dictionary, 
    update_every=5, 
    chunksize=1000, 
    passes=300)
# Elapsed wall-clock time in seconds.
print('It took me {}'.format(time.time()-start))

In [None]:
import time
start = time.time()
# Same corpus, dictionary, chunk size and passes, trained with gensim's
# LdaMulticore class for a timing comparison.
ldaModelMulticore = models.LdaMulticore(
    corpus, 
    num_topics=NUM_TOPICS,
    id2word=dictionary, 
    chunksize=1000, 
    passes=300)
# Elapsed wall-clock time in seconds.
print('It took me {}'.format(time.time()-start))

**Save Model**

In [None]:
# Pickle both gensim models into the configured checkpoints folder.
with open(JP(paths['checkpoints'], 'lda_gensim.pkl'), 'wb') as obj:
    pickle.dump(ldaModel,obj)
    
with open(JP(paths['checkpoints'], 'lda_multicore_gensim.pkl'), 'wb') as obj:
    pickle.dump(ldaModelMulticore,obj)

**Load Model**

In [None]:
# Restore the single-core model only (the multicore one is left disabled below).
# Unpickling can execute arbitrary code, so only open checkpoint files that
# were written by this notebook.
with open(JP(paths['checkpoints'], 'lda_gensim.pkl'), 'rb') as obj:
    ldaModel = pickle.load(obj)
    
# with open(JP(paths['checkpoints'], 'lda_multicore_gensim.pkl'), 'rb') as obj:
#     ldaModelMulticore = pickle.load(obj)
    
print(ldaModel)
# print(ldaModelMulticore)

### Aggregate Results into Pandas

In [None]:
# Peek at the two top words of each topic, unformatted.
ldaModel.show_topics(formatted=False, num_words=2)

In [None]:
from functools import reduce

# One (word, score) frame per topic; the score column is named with the
# 1-based topic number.
per_topic_frames = [
    pd.DataFrame(ldaModel.show_topic(topic_id,topn=None), columns=['word',topic_id+1])
    for topic_id in range(NUM_TOPICS)]

# Join all frames on 'word': one row per word, one score column per topic.
scores = reduce(lambda left, right: left.merge(right, on='word'), per_topic_frames)
scores.head(3)

#### Visualization

In [None]:
import pyLDAvis.gensim as gensimvis

In [None]:
# Prepare the pyLDAvis data for the gensim model.
# NOTE(review): `vis_data` is never displayed in the visible notebook -- confirm
# whether a display call is missing.
vis_data = gensimvis.prepare(ldaModel, corpus, dictionary)


---

# Plotting the WordClouds


In [None]:
# IPython magic switching matplotlib to the inline backend for the figures below.
%matplotlib inline
from wordcloud import WordCloud

#### Helper Functions for Plotting

In [None]:
def define_subplots(n_cols,n_plots,figsize=None):
    '''Create a subplot grid with `n_cols` columns and as many rows as
    are needed to hold `n_plots` plots.

    Prints the grid shape (rows, columns). When `figsize` is falsy the
    figure is sized at 5x5 per grid cell. Returns the `(fig, axs)`
    pair produced by `plt.subplots`.'''
    full_rows, leftover = divmod(n_plots, n_cols)
    # A partially filled last row still needs a row of its own.
    n_rows = full_rows + (1 if leftover != 0 else 0)
    print(n_rows,n_cols)

    size = figsize if figsize else (n_cols*5,n_rows*5)
    return plt.subplots(
        nrows=n_rows, ncols=n_cols, sharex=False, sharey=False,
        figsize=size)


In [None]:
def cluster_to_wordcloud(
    df, max_words=200, use_mask=False, bgcolor='black', mask=None):
    ''' Convert one cluster into a WordCloud.

    Args:
        df: table with a `word` and a `score` column; each word is drawn
            with a size driven by its score.
        max_words: maximum number of words in the cloud.
        use_mask: when True, shape the cloud with `mask`.
        bgcolor: background colour of the cloud.
        mask: mask handed to WordCloud; only used when `use_mask` is True.
            If omitted, a module-level `mask_` is used when one exists.

    Returns:
        The generated WordCloud object.

    Raises:
        ValueError: if `use_mask` is True and no mask is available
            (previously this surfaced as a NameError on `mask_`).
    '''
    if use_mask and mask is None:
        # Backward compatibility: earlier versions read a global `mask_`.
        mask = globals().get('mask_')
        if mask is None:
            raise ValueError(
                'use_mask=True needs a mask: pass `mask=` or define `mask_`')

    frequencies = dict(zip(df.word, df.score))
    wordcloud = WordCloud(
        max_words=max_words,
        mask=mask if use_mask else None,
        background_color=bgcolor).generate_from_frequencies(
            frequencies=frequencies)
    return wordcloud


In [None]:
def plot_centroids_as_wordclouds(
    word_scores,
    NUM_CLUSTERS = None,
    max_words_per_cloud=100, 
    use_mask=False, n_cols=2, figsize=(15,15)):
    '''Draw one word cloud per cluster on a grid of subplots.

    Args:
        word_scores: long-format table with `cluster`, `word` and `score`
            columns; clusters are expected to be numbered from 1.
        NUM_CLUSTERS: number of clusters to draw; defaults to the number
            of distinct values in `word_scores.cluster`.
        max_words_per_cloud: word limit for each cloud.
        use_mask: forwarded to `cluster_to_wordcloud`.
        n_cols: number of grid columns.
        figsize: figure size forwarded to `define_subplots`.
    '''
    if not NUM_CLUSTERS:
        NUM_CLUSTERS = word_scores.cluster.nunique()

    _, axs = define_subplots(n_cols, NUM_CLUSTERS, figsize)
    # `plt.subplots` hands back a single Axes, a 1-D array or a 2-D array
    # depending on the grid shape; flatten so every grid can be indexed alike.
    flat_axes = np.asarray(axs, dtype=object).reshape(-1)

    for c, ax in enumerate(flat_axes):
        if c < NUM_CLUSTERS:
            wordcloud = cluster_to_wordcloud(
                df=word_scores[word_scores.cluster == c+1],
                max_words=max_words_per_cloud,
                use_mask=use_mask)
            ax.imshow(wordcloud)
        # Hide the frame of every cell, including unused trailing ones.
        ax.axis('off')
    plt.tight_layout()
    plt.show()


In [None]:
data = pd.melt(scores.set_index('word').T.rename_axis('cluster').reset_index(), 
               id_vars=['cluster'], var_name='word', value_name='score')
data.head()

In [None]:
plot_centroids_as_wordclouds(data, n_cols=2)

# Validation

**Based on the most important words of each cluster, let's assign a category label to each one:**

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

In [None]:
# Hand-assigned category label for each cluster, in cluster order.
cluster_preds = ['sport', 'entertainment', 'business', 'tech', 'politics']  
# Map 0-based cluster ids to those labels (note the word-score table built
# earlier numbers its clusters from 1).
d = dict(zip(range(NUM_CLUSTERS), cluster_preds))
d

In [None]:
# Translate each document's cluster id into its hand-assigned label.
# NOTE(review): `clusters` is not defined anywhere in the visible notebook --
# confirm which fitted model's per-document assignments were meant here.
data['predict'] = [d[i] for i in clusters.labels_]

In [None]:
# Number of documents per true category.
print(data.category.value_counts())

In [None]:
# Number of documents per predicted label.
print(data.predict.value_counts())

In [None]:
# Confusion matrix of true category vs. predicted label, with rows and
# columns ordered as in `cluster_preds`.
cm = confusion_matrix(
    y_true=data.category, 
    y_pred = data.predict, 
    labels=cluster_preds)

# Label both axes (rows: true category, columns: predicted label).
pd.DataFrame(cm, columns=cluster_preds, index=cluster_preds)