# __Step 4.2: DistilBERT-based topic modeling__

https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6

## ___Set up___

### Module import

Importing the modules raised a warning:
- `TqdmWarning: IProgress not found. Please update jupyter and ipywidgets`
- Resolved by following [this guide](https://ipywidgets.readthedocs.io/en/stable/user_install.html) to install ipywidgets.

In [8]:
import os
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from umap import UMAP

### Key variables

In [2]:
# Reproducibility: one seed value for the whole notebook
seed = 20220609

# Project root and this step's working directory (created if missing)
proj_dir = Path.home().joinpath("projects/plant_sci_hist")
work_dir = proj_dir.joinpath("4_topic_model/4_2_distillbert")
work_dir.mkdir(parents=True, exist_ok=True)

# Plant science corpus
dir25 = proj_dir.joinpath("2_text_classify/2_5_predict_pubmed")
corpus_file = dir25.joinpath("corpus_plant_421658.tsv.gz")

# Processed docs
dir41 = proj_dir.joinpath("4_topic_model/4_1_get_topics")
docs_clean_file = dir41.joinpath("corpus_plant_421658_proc_txt.pkl")

# Output paths, both inside the working directory
topic_model_file = work_dir.joinpath("topic_model_distillbert")
topics_file = work_dir.joinpath("topics_distillbert.pickle")

## ___Load data and get embeddings___

### Load cleaned data

In [5]:
# Read the pickled, pre-processed documents back into memory.
with docs_clean_file.open("rb") as pkl_in:
    docs_clean = pickle.load(pkl_in)

In [6]:
# Number of documents loaded from the pickle (cell output shown below).
len(docs_clean)

421658

### Get doc embeddings 

In [7]:
# Load the pretrained sentence-embedding model by name; on this run the model
# files were downloaded (progress bars below).
model_bert = SentenceTransformer('distilbert-base-nli-mean-tokens')

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [None]:
# Embed every document in docs_clean with the model, showing a progress bar.
embeddings_bert = model_bert.encode(docs_clean, show_progress_bar=True)

## ___Document clustering___

### Initialize UMAP and HDBSCAN

In [None]:
# Reduce the document embeddings to 20 dimensions before clustering.
# random_state ties the otherwise stochastic UMAP fit to the notebook-wide
# `seed`, which was declared for reproducibility but never passed anywhere.
embeddings_umap = UMAP(n_neighbors=15,
                       n_components=20,
                       metric='cosine',
                       random_state=seed).fit_transform(embeddings_bert)

In [None]:
# Cluster the UMAP-reduced embeddings; the fitted estimator is kept so its
# labels_ can be read by later cells.
cluster_hdbscan = HDBSCAN(
    cluster_selection_method='eom',
    metric='euclidean',
    min_cluster_size=15,
).fit(embeddings_umap)

### Visualize resulting clusters

In [None]:
# Reduce dimensionality to two, purely for plotting. The stated rationale is
# to distinguish the main data from outliers (label -1). Q: not exactly sure
# why a separate 2-D projection is needed for that.
# random_state=seed keeps the layout identical across reruns.
embeddings_umap_2 = UMAP(n_neighbors=15, n_components=2,
                         min_dist=0.0, metric='cosine',
                         random_state=seed).fit_transform(embeddings_bert)
# Fix: the frame was built from an undefined name `umap_data` (NameError);
# the 2-D projection computed just above is what gets plotted.
result_umap2 = pd.DataFrame(embeddings_umap_2, columns=['x', 'y'])
result_umap2['labels'] = cluster_hdbscan.labels_

# Visualize clusters: outliers in grey, clustered documents colored by label.
fig, ax = plt.subplots(figsize=(20, 10))
outliers = result_umap2.loc[result_umap2.labels == -1, :]
clustered = result_umap2.loc[result_umap2.labels != -1, :]
ax.scatter(outliers.x, outliers.y, color='#BDBDBD', s=0.05)
points = ax.scatter(clustered.x, clustered.y, c=clustered.labels,
                    s=0.05, cmap='hsv_r')
# The colorbar is tied explicitly to the clustered points and this axes.
fig.colorbar(points, ax=ax)

## ___Topic creation___

### c-TF-IDF

In [None]:
# Create a dataframe with one row per document: its text, its topic
# (HDBSCAN cluster ID) and a running doc ID.
docs_df = pd.DataFrame(docs_clean, columns=["Doc"]).assign(
    Topic=cluster_hdbscan.labels_,
    Doc_ID=lambda frame: range(len(frame)),
)
# Join all documents of a topic into a single space-separated string.
docs_per_topic = (
    docs_df
    .groupby(['Topic'], as_index=False)
    .agg({'Doc': ' '.join})
)

In [None]:
def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """Compute class-based TF-IDF scores for per-class documents.

    Args:
        documents: Sequence of strings, one (joined) document per class.
        m: Numerator of the IDF term; the caller in this notebook passes the
            number of individual documents in the corpus.
        ngram_range: Forwarded to ``CountVectorizer``.

    Returns:
        Tuple ``(tf_idf, count)``: the score array with one row per
        vocabulary term and one column per class, and the fitted
        ``CountVectorizer``.
    """
    vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words="english").fit(documents)
    term_counts = vectorizer.transform(documents).toarray()

    # Term frequency: each class's counts divided by that class's word total.
    words_per_class = term_counts.sum(axis=1)
    term_freq = term_counts.T / words_per_class

    # Inverse frequency: log of m over each term's count summed over classes,
    # shaped as a column so it broadcasts across the class columns.
    term_totals = term_counts.sum(axis=0)
    inv_freq = np.log(m / term_totals)[:, np.newaxis]

    return term_freq * inv_freq, vectorizer

In [None]:
# Class-based TF-IDF over the per-topic joined documents; m is the number of
# individual documents in docs_clean.
tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(docs_clean))

### Representing topics

In [None]:
def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    """Return the ``n`` highest-scoring words for every topic.

    Args:
        tf_idf: Score array with one row per vocabulary term and one column
            per topic, as returned by ``c_tf_idf``.
        count: Fitted vectorizer that supplies the vocabulary, in the same
            order as the rows of ``tf_idf``.
        docs_per_topic: Frame whose ``Topic`` column lists the topic labels
            in the same order as the columns of ``tf_idf``.
        n: Number of words to keep per topic.

    Returns:
        Dict mapping each topic label to a list of ``(word, score)`` tuples,
        highest score first.
    """
    # scikit-learn 1.2 removed get_feature_names(); prefer the replacement
    # and fall back to the old method on vectorizers that predate it.
    get_names = getattr(count, "get_feature_names_out", None)
    if get_names is None:
        get_names = count.get_feature_names
    words = get_names()

    labels = list(docs_per_topic.Topic)
    tf_idf_transposed = tf_idf.T
    # argsort is ascending, so the last n columns hold the n largest scores.
    indices = tf_idf_transposed.argsort()[:, -n:]
    top_n_words = {
        label: [(words[j], tf_idf_transposed[i][j]) for j in indices[i]][::-1]
        for i, label in enumerate(labels)
    }
    return top_n_words

def extract_topic_sizes(df):
    """Count the documents in each topic, largest topic first.

    Args:
        df: Frame with a ``Topic`` column and a ``Doc`` column.

    Returns:
        Frame with columns ``Topic`` and ``Size`` (the count of non-null
        ``Doc`` values per topic), sorted by ``Size`` in descending order.
    """
    doc_counts = df.groupby(['Topic'])['Doc'].count()
    size_table = doc_counts.reset_index().rename(columns={"Doc": "Size"})
    return size_table.sort_values("Size", ascending=False)

In [None]:
# Top-20 scoring words for every topic
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)

# Topic sizes, largest first; display the ten biggest
topic_sizes = extract_topic_sizes(docs_df)
topic_sizes.head(10)

In [None]:
# Ten highest-scoring (word, score) pairs for topic 7.
top_n_words[7][:10]

In [None]:
# Ten highest-scoring (word, score) pairs for topic 30.
top_n_words[30][:10]

### Topic reduction

In [None]:
# Merge the smallest topic into its most similar topic, 20 times over,
# recomputing the c-TF-IDF representation after every merge.
# NOTE(review): the `topic + 1` row lookup and the `index - 1` renumbering
# both assume an outlier topic labelled -1 exists, so that topic t sits at
# row t + 1 of the similarity matrix; confirm for the data at hand.

# Fix: the loop used an undefined name `data`; the corpus size is
# len(docs_clean), matching the initial c_tf_idf call. It is loop-invariant.
m = len(docs_clean)

for _ in range(20):
    # Calculate cosine similarity between topic vectors (one row per topic).
    # cosine_similarity comes from sklearn.metrics.pairwise, which the
    # notebook never imported.
    similarities = cosine_similarity(tf_idf.T)
    np.fill_diagonal(similarities, 0)  # a topic must not match itself

    # Extract label to merge into and from where
    topic_sizes = docs_df.groupby(['Topic']).count().sort_values("Doc", ascending=False).reset_index()
    topic_to_merge = topic_sizes.iloc[-1].Topic
    topic_to_merge_into = np.argmax(similarities[topic_to_merge + 1]) - 1

    # Adjust topics: reassign the merged docs, then renumber the remaining
    # labels consecutively starting at -1
    docs_df.loc[docs_df.Topic == topic_to_merge, "Topic"] = topic_to_merge_into
    old_topics = docs_df.sort_values("Topic").Topic.unique()
    map_topics = {old_topic: index - 1 for index, old_topic in enumerate(old_topics)}
    docs_df.Topic = docs_df.Topic.map(map_topics)
    docs_per_topic = docs_df.groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})

    # Calculate new topic words
    tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m)
    top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)

In [None]:
# Topic sizes after merging, largest first; display the ten biggest
topic_sizes = extract_topic_sizes(docs_df)
topic_sizes.head(10)