In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from pathlib import Path
from termcolor import colored
from wordcloud import WordCloud
from numpy.random import dirichlet
from ipywidgets import interact, FloatSlider

In [None]:
% matplotlib inline
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
from jupyterthemes import jtplot

# Configure matplotlib styling through jupyterthemes (dark 'onedork' theme, large fonts).
jtplot.style(
    theme='onedork',
    context='talk',
    fscale=1.4,
    spines=False,
    gridlines='--',
    ticks=True,
    grid=False,
    figsize=(14, 8),
)

### Simulate Dirichlet Distribution

In [None]:
f = FloatSlider(value=1, min=1e-2, max=1e2, step=1e-2,
                continuous_update=False, description='Alpha')


@interact(alpha=f)
def sample_dirichlet(alpha):
    """Draw samples from a symmetric Dirichlet distribution and plot each one as a bar chart.

    Parameters
    ----------
    alpha : float
        Concentration value used for every component of the Dirichlet parameter vector.
    """
    topics = 10
    draws = 9
    ncols = 3
    nrows = -(-draws // ncols)  # ceiling division so every draw gets its own panel

    alphas = np.full(shape=topics, fill_value=alpha)
    samples = dirichlet(alpha=alphas, size=draws)

    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, sharex=True, sharey=True)
    axes = axes.flatten()
    plt.setp(axes, ylim=(0, 1))

    # Derive bar positions and colours from `topics` so they always match the sample length.
    positions = list(range(topics))
    palette = sns.color_palette("Set2", topics)
    for ax, sample in zip(axes, samples):
        ax.bar(x=positions, height=sample, color=palette)
    # Hide panels left over when `draws` does not fill the grid.
    for ax in axes[draws:]:
        ax.set_visible(False)

    fig.suptitle('Dirichlet Allocation | {} Topics, {} Samples'.format(topics, draws))
    fig.tight_layout()
    plt.subplots_adjust(top=.95)

### Load BBC data

Most of the following steps are not shown in the video since this is a preview of the results that will be introduced throughout this section. The code will be explained in the following segments.

In [None]:
def load_bbc_documents(root):
    """Read every ``<root>/<topic>/<name>.txt`` article into ``[topic, heading, body]`` records.

    Parameters
    ----------
    root : str or pathlib.Path
        Directory whose immediate sub-directories are named after the article topic.

    Returns
    -------
    list of [str, str, str]
        One ``[topic, heading, body]`` entry per article, ordered by file path. The
        heading is the stripped first line; the body is the remaining lines, each
        stripped and joined with single spaces.
    """
    root = Path(root)
    records = []
    # glob() yields files in filesystem order; sorting keeps the document order (and
    # therefore the seeded train/test split downstream) identical across machines.
    for file in sorted(root.glob('**/*.txt')):
        relative_parts = file.relative_to(root).parts
        # Only <topic>/<file> entries are articles; skip anything else (such as a
        # top-level text file) instead of failing on the unpack.
        if len(relative_parts) != 2:
            continue
        topic = relative_parts[0]
        with file.open(encoding='latin1') as handle:
            lines = handle.readlines()
        if not lines:  # an empty file has no heading line
            continue
        heading = lines[0].strip()
        body = ' '.join([line.strip() for line in lines[1:]])
        records.append([topic, heading, body])
    return records


path = Path('bbc')
doc_list = load_bbc_documents(path)

### Convert to DataFrame

In [None]:
# One row per article: its BBC category, headline and body text.
docs = pd.DataFrame(doc_list, columns=['topic', 'heading', 'article'])
# DataFrame.info() writes its summary to stdout and returns None, so it is called
# directly rather than wrapped in print(), which would add a stray "None" line.
docs.info()

### Vectorize train & test sets

In [None]:
# Hold out 50 articles for testing. Stratifying on the topic label keeps the category
# mix similar in both splits; the fixed seed makes the split repeatable.
train_docs, test_docs = train_test_split(
    docs,
    test_size=50,
    stratify=docs.topic,
    random_state=42,
)

In [None]:
# Bag-of-words counts: drop English stop words, terms found in more than 20% of the
# articles or in fewer than 3 of them, and keep at most 2,000 terms.
vectorizer = CountVectorizer(max_df=.2, min_df=3, stop_words='english', max_features=2000)
train_dtm = vectorizer.fit_transform(train_docs.article)
# Newer scikit-learn releases replace get_feature_names() with get_feature_names_out();
# use whichever this installation provides so the cell runs on both.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()
train_dtm

In [None]:
# Display names for the five LDA topics: 'Topic 1' ... 'Topic 5'.
topic_labels = list(map('Topic {}'.format, range(1, 6)))

In [None]:
# Five-topic LDA model fitted with the batch learning method on the training counts.
# verbose=1 together with evaluate_every=5 reports progress while fitting; the fixed
# random_state keeps the fitted topics repeatable.
lda = LatentDirichletAllocation(
    n_components=5,
    learning_method='batch',
    max_iter=500,
    evaluate_every=5,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
lda.fit(train_dtm)

In [None]:
# Persist the fitted model to disk, then read it back so the cells below work from
# the stored copy.
# NOTE(review): `joblib` is imported from `sklearn.externals`, which newer scikit-learn
# releases no longer ship — confirm the pinned version or import `joblib` directly.
model_file = 'lda_demo.pkl'
joblib.dump(lda, model_file)
lda = joblib.load(model_file)

In [None]:
# Topic weights of every training article: one row per article (indexed by its BBC
# category), one column per LDA topic.
train_topic_weights = lda.transform(train_dtm)
train_result = pd.DataFrame(train_topic_weights,
                            index=train_docs.topic,
                            columns=topic_labels)

In [None]:
# Dominant LDA topic of every training article, indexed by the article's BBC category.
# A row-wise idxmax gives the same assignment as grouping first, without relying on
# DataFrameGroupBy.idxmax(axis=1) (deprecated in newer pandas) or on the shape of the
# index it returns. Selecting `topic_labels` keeps the cell re-runnable after the text
# columns have been added to `train_result`.
df = train_result[topic_labels].idxmax(axis=1)
# Share of each category's articles assigned to each topic.
assignment_share = df.groupby(level='topic').value_counts(normalize=True).unstack(-1)
sns.heatmap(assignment_share, annot=True, fmt='.1%', cmap='Blues', square=True)
plt.title('Train Data: Topic Assignments');

In [None]:
# Normalise each topic's word weights so that every topic sums to 1 over the vocabulary.
topic_totals = lda.components_.sum(axis=1, keepdims=True)
topics_prob = lda.components_ / topic_totals
# Transpose to words x topics so each column holds one topic's word probabilities.
topics = pd.DataFrame(topics_prob.T, index=words, columns=topic_labels)
topics.head()

In [None]:
w = WordCloud()
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(25, 12))
axes = axes.flatten()
# One word cloud per topic, with words sized by their probability under that topic.
for ax, (topic, freq) in zip(axes, topics.items()):
    w.generate_from_frequencies(freq.to_dict())
    ax.imshow(w, interpolation='bilinear')
    ax.set_title(topic, fontsize=18)
    ax.axis('off')
# The 2x3 grid has one more panel than there are topics; hide the spare one.
axes[5].set_visible(False)
plt.tight_layout()

In [None]:
# Attach the raw text so articles can be inspected next to their topic weights.
train_result['article'] = train_docs['article'].values
train_result['heading'] = train_docs['heading'].values
# Keep only the articles in which every topic has a weight above 0.1.
mixed_topics = (train_result[topic_labels] > .1).all(axis=1)
sample = train_result[mixed_topics]
sample

In [None]:
from collections import OrderedDict

# termcolor keyword arguments (text colour and background) used to highlight each topic.
color_dict = OrderedDict([
    ('Topic 1', {'color': 'white', 'on_color': 'on_blue'}),
    ('Topic 2', {'color': 'white', 'on_color': 'on_green'}),
    ('Topic 3', {'color': 'white', 'on_color': 'on_red'}),
    ('Topic 4', {'color': 'white', 'on_color': 'on_magenta'}),
    ('Topic 5', {'color': 'blue', 'on_color': 'on_yellow'}),
])

In [None]:
# Bar chart of the topic weights for the selected articles, with the x-axis label and
# ticks removed.
ax = sample[topic_labels].plot.bar(title='Topics Distribution - Sample Article')
ax.set_xlabel('')
ax.set_xticks([])
plt.tight_layout()

In [None]:
# Work on the first article of `sample`, addressing its text and heading by column
# name rather than by position so the cell does not depend on the column order.
first_article = sample.iloc[0]

colored_text = []
for word in first_article['article'].split():
    token = word.strip().lower()
    # Words outside the vectorizer vocabulary have no topic probabilities and stay
    # uncoloured. The explicit membership test replaces a bare `except:` that would
    # also have hidden unrelated errors.
    if token in topics.index:
        topic = topics.loc[token].idxmax()
        colored_text.append(colored(word, **color_dict[topic]))
    else:
        colored_text.append(word)

# Legend: each topic name printed in its own highlight colours.
print(' '.join([colored(k, **v) for k, v in color_dict.items()]))
print('\n', first_article['heading'], '\n')
text = ' '.join(colored_text)
print(text)