In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from jupyterthemes import jtplot
from matplotlib.ticker import FuncFormatter
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

try:
    # sklearn.externals.joblib was removed in scikit-learn 0.23; joblib ships
    # as its own package (and is a scikit-learn dependency)
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [None]:
% matplotlib inline
pd.options.display.float_format = '{:,.2f}'.format
jtplot.style(theme='onedork', context='talk', fscale=1.4, spines=False, gridlines='--', ticks=True, grid=False, figsize=(14, 8))


### Load BBC data

In [None]:
# Collect [topic, heading, body] for every article stored as bbc/<topic>/<name>.txt
path = Path('bbc')
# sorted() pins the document order (glob order depends on the filesystem), so
# the seeded train/test split below picks the same articles on every machine
files = sorted(path.glob('**/*.txt'))
doc_list = []
for file in files:
    relative_parts = file.relative_to(path).parts
    if len(relative_parts) != 2:
        # not a <topic>/<file> entry, e.g. a text file next to the topic folders
        continue
    topic = relative_parts[0]
    with open(str(file), encoding='latin1') as f:
        lines = f.readlines()
    if not lines:
        # empty file: there is no heading line to read
        continue
    # first line is the heading, the remaining lines form the article body
    heading = lines[0].strip()
    body = ' '.join([l.strip() for l in lines[1:]])
    doc_list.append([topic, heading, body])

### Convert to DataFrame

In [None]:
# One row per article; the columns follow the [topic, heading, body] triples in doc_list
docs = pd.DataFrame(doc_list, columns=['topic', 'heading', 'article'])
# info() writes its report itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output
docs.info()

### Train-test split

In [None]:
# Hold out 50 articles for testing, stratified on the topic label; seeded split
train_docs, test_docs = train_test_split(docs,
                                         test_size=50,
                                         stratify=docs['topic'],
                                         random_state=42)

In [None]:
# (rows, columns) of the training and the test frame
train_docs.shape, test_docs.shape

In [None]:
# Number of held-out articles per topic
test_docs['topic'].value_counts()

### Vectorize train & test sets

In [None]:
# Bag-of-words counts: drop English stop words and terms that appear in more
# than 20% of the articles or in fewer than 3 of them; cap the vocabulary at
# 2,000 terms
vectorizer = CountVectorizer(max_df=.2, min_df=3, stop_words='english',
                             max_features=2000)
train_dtm = vectorizer.fit_transform(train_docs.article)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use the replacement when available and fall back on older installations
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()
train_dtm

In [None]:
# Encode the held-out articles with the vocabulary fitted on the training set
test_dtm = vectorizer.transform(test_docs.article)
test_dtm

### Latent Semantic Analysis

In [None]:
# LSA: truncated SVD of the training document-term matrix with 5 components
svd = TruncatedSVD(n_components=5, n_iter=20, random_state=42)
svd.fit(train_dtm)

#### Explore Topics 

In [None]:
# Column labels 'Topic 1' .. 'Topic 5' shared by all models in this notebook
topic_labels = list(map('Topic {}'.format, range(1, 6)))

In [None]:
# LSA scores of the training articles, indexed by their true topic,
# then averaged per topic and shown as grouped bars
svd_result = pd.DataFrame(svd.transform(train_dtm),
                          index=train_docs.topic,
                          columns=topic_labels)
svd_result.groupby(level='topic').mean().plot.bar();

In [None]:
# Word loadings of each LSA component: build topics x words, then flip so that
# words are the rows and topics the columns
topics = pd.DataFrame(svd.components_,
                      index=topic_labels,
                      columns=words).T
topics.head()

In [None]:
# The 10 highest-weighted words of every topic, one column per topic
top_words = {label: weights.nlargest(10).index.tolist()
             for label, weights in topics.items()}
pd.DataFrame(top_words)

In [None]:
# LSA scores of the held-out articles, indexed by their true topic
test_eval = pd.DataFrame(svd.transform(test_dtm),
                         index=test_docs.topic,
                         columns=topic_labels)

# Average score per true topic
test_eval.groupby(level='topic').mean().plot.bar(title='Avg. Topic Scores');

### Probabilistic Latent Semantic Analysis

#### Equivalent to Non-Negative Matrix Factorization with Kullback-Leibler Divergence objective

In [None]:
# NMF with the multiplicative-update solver and Kullback-Leibler loss,
# i.e. the pLSA-equivalent setup named in the heading of this section
nmf = NMF(n_components=5, random_state=42, solver='mu',
          beta_loss='kullback-leibler', max_iter=1000)
nmf.fit(train_dtm)
# Divergence between the training matrix and its reconstruction from the fitted model
nmf.reconstruction_err_

In [None]:
# NMF topic weights of the training articles, indexed by their true topic,
# averaged per topic and shown as grouped bars
nmf_res = pd.DataFrame(nmf.transform(train_dtm),
                       index=train_docs.topic,
                       columns=topic_labels)
nmf_res.groupby(level='topic').mean().plot.bar();

In [None]:
# Word weights of each NMF component: build topics x words, then flip so that
# words are the rows and topics the columns
topics = pd.DataFrame(nmf.components_,
                      index=topic_labels,
                      columns=words).T
topics.head()

In [None]:
# The 10 highest-weighted words of every topic, one column per topic
top_words = {label: weights.nlargest(10).index.tolist()
             for label, weights in topics.items()}
pd.DataFrame(top_words)

In [None]:
# NMF topic weights of the held-out articles, indexed by their true topic
test_eval = pd.DataFrame(nmf.transform(test_dtm),
                         index=test_docs.topic,
                         columns=topic_labels)

# Average weight per true topic
avg_by_topic = test_eval.groupby(level='topic').mean()
avg_by_topic.plot.bar(title='Avg. Topic Probabilities');

### LDA with sklearn

In [None]:
# Baseline LDA with the default number of iterations. random_state is fixed so
# that the fit (and every plot derived from it) is reproducible, in line with
# the seeded SVD, NMF and lda_opt models in this notebook.
lda_base = LatentDirichletAllocation(n_components=5, n_jobs=-1,
                                     learning_method='batch', random_state=42)
lda_base.fit(train_dtm)

#### Persist model

In [None]:
# Save the fitted baseline model to disk
joblib.dump(lda_base, 'lda_10_iter.pkl')

In [None]:
# Reload the persisted model (joblib.load unpickles, so only load files you trust)
lda_base = joblib.load('lda_10_iter.pkl') 
lda_base

#### Explore topics & word distributions

In [None]:
# components_ holds one row per topic with unnormalized word weights (pseudo counts)
topics_count = lda_base.components_
print(topics_count.shape)
topics_count[:5]

In [None]:
# Divide each topic's pseudo counts by their total, so every topic row sums to 1
topics_prob = topics_count / topics_count.sum(axis=1, keepdims=True)
# words as rows, topics as columns
topics = pd.DataFrame(topics_prob,
                      index=topic_labels,
                      columns=words).T
topics.head()

In [None]:
# Sanity check: every word has a strictly positive probability under every topic
len(topics[topics.gt(0).all(axis=1)]) == len(topics)

In [None]:
# Heatmap of the word-by-topic probabilities (rows: words, columns: topics)
sns.heatmap(topics, cmap='Blues')

In [None]:
# The 10 most probable words of every topic, one column per topic
top_words = {label: probs.nlargest(10).index.tolist()
             for label, probs in topics.items()}
pd.DataFrame(top_words)

In [None]:
# One histogram of word probabilities per topic, stacked vertically with shared axes.
# squeeze=False keeps `axes` an array even for a single topic.
fig, axes = plt.subplots(nrows=topics.shape[1], sharey=True, sharex=True,
                         figsize=(10, 15), squeeze=False)
for ax, (topic, prob) in zip(axes.ravel(), topics.items()):
    # sns.distplot is deprecated since seaborn 0.11. With kde=False and
    # norm_hist=False it only drew a count histogram labelled with the series
    # name, which plain matplotlib reproduces.
    ax.hist(prob, bins=100, alpha=.4)
    ax.set_xlabel(topic)
    # counts span several orders of magnitude
    ax.set_yscale('log')
    # probabilities on the x-axis are shown as percentages
    ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: '{:.1%}'.format(x)))
fig.suptitle('Topic Distributions')
fig.tight_layout()

### Evaluate fit

In [None]:
# Topic weights of every training article under the baseline LDA model
train_preds = lda_base.transform(train_dtm)
train_preds.shape

In [None]:
# Label the topic weights with the topic names and each article's true topic
train_eval = pd.DataFrame(train_preds,
                          index=train_docs.topic,
                          columns=topic_labels)
train_eval.head()

In [None]:
# Average topic weight per true topic on the training set
train_eval.groupby(level='topic').mean().plot.bar(title='Avg. Topic Probabilities');

In [None]:
# Most probable topic per training article. idxmax(axis=1) works row by row,
# so no grouping is needed, and the `axis` argument of GroupBy.idxmax is
# deprecated since pandas 2.1.
df = train_eval.idxmax(axis=1)
# Share of each true topic (rows) that is assigned to each LDA topic (columns)
sns.heatmap(df.groupby(level='topic').value_counts(normalize=True)
            .unstack(-1), annot=True, fmt='.1%', cmap='Blues', square=True)
plt.title('Train Data: Topic Assignments');

### Test Set 

In [None]:
# Topic weights of the held-out articles under the baseline LDA model,
# labelled with the topic names and each article's true topic
test_preds = lda_base.transform(test_dtm)
test_eval = pd.DataFrame(test_preds,
                         index=test_docs.topic,
                         columns=topic_labels)
test_eval.head()

In [None]:
# Average topic weight per true topic on the test set
test_eval.groupby(level='topic').mean().plot.bar(title='Avg. Topic Probabilities');

In [None]:
# Most probable topic per test article. idxmax(axis=1) works row by row,
# so no grouping is needed, and the `axis` argument of GroupBy.idxmax is
# deprecated since pandas 2.1.
df = test_eval.idxmax(axis=1)
# Share of each true topic (rows) that is assigned to each LDA topic (columns)
sns.heatmap(df.groupby(level='topic').value_counts(normalize=True)
            .unstack(-1), annot=True, fmt='.1%', cmap='Blues', square=True)
plt.title('Topic Assignments');

### Retrain until perplexity no longer decreases

In [None]:
# Longer LDA run: up to 500 iterations with perplexity evaluated every 5
# iterations (verbose=1 reports progress); seeded for reproducibility
lda_opt = LatentDirichletAllocation(n_components=5, n_jobs=-1, max_iter=500,
                                learning_method='batch', evaluate_every=5, 
                                verbose=1, random_state=42)
lda_opt.fit(train_dtm)

In [None]:
# Save the longer-trained model to disk
joblib.dump(lda_opt, 'lda_opt.pkl')

In [None]:
# Topic weights of the training articles under lda_opt, indexed by true topic
train_opt_eval = pd.DataFrame(lda_opt.transform(train_dtm),
                              index=train_docs.topic,
                              columns=topic_labels)

In [None]:
# Topic weights of the held-out articles under lda_opt, indexed by true topic
test_opt_eval = pd.DataFrame(lda_opt.transform(test_dtm),
                             index=test_docs.topic,
                             columns=topic_labels)

### Compare Train & Test Topic Assignments 

In [None]:
# Side-by-side heatmaps: share of each true topic (rows) assigned to each LDA
# topic (columns), for the training and the test articles
fig, axes = plt.subplots(ncols=2)
source = ['Train', 'Test']
for ax, label, scores in zip(axes, source, [train_opt_eval, test_opt_eval]):
    # Most probable topic per article. idxmax(axis=1) is row-wise, so the
    # frame is used directly; the `axis` argument of GroupBy.idxmax is
    # deprecated since pandas 2.1.
    assigned = scores.idxmax(axis=1)
    shares = (assigned.groupby(level='topic')
              .value_counts(normalize=True)
              .unstack(-1))
    sns.heatmap(shares, annot=True, fmt='.1%', cmap='Blues', square=True, ax=ax)
    ax.set_title('{} Data: Topic Assignments'.format(label));

### Explore misclassified articles 

In [None]:
# Most probable LDA topic for every test article. idxmax(axis=1) is taken on
# the frame itself so the rows stay in test_docs order. Going through
# groupby() returned them regrouped by topic, which paired the positionally
# assigned headings and articles below with the wrong predictions.
test_assignments = (test_opt_eval
                    .idxmax(axis=1)
                    .to_frame('predicted')
                    .reset_index())
# test_opt_eval was built from test_docs row by row, so positions line up
test_assignments['heading'] = test_docs.heading.values
test_assignments['article'] = test_docs.article.values
test_assignments.head()

In [None]:
# Sport articles whose most probable topic is 'Topic 3'
is_sport = test_assignments.topic.eq('sport')
is_topic_3 = test_assignments.predicted.eq('Topic 3')
misclassified = test_assignments[is_sport & is_topic_3]
misclassified.heading

In [None]:
# Full text of the selected articles
misclassified.article.tolist()