# BERTopic - Tutorial
We start by installing BERTopic from PyPI before preparing the data.

**NOTE**: Make sure to select a GPU runtime. Otherwise, the model can take quite some time to create the document embeddings!

In [1]:
!pip install bertopic



# Prepare data
For this example, we use the famous 20 Newsgroups dataset, which contains roughly 18,000 newsgroup posts on 20 topics. Here we load only its training subset (11,314 posts).

In [1]:
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
 
# Fetch the training subset and keep only its 'data' entry (the documents).
docs = fetch_20newsgroups(subset='train')['data']

In [2]:
# docs is a list of strings; display how many documents were loaded
len(docs)

11314

In [3]:
import os
from pathlib import Path

import pandas as pd

# Location of the SEELANGS CSV export. Set the SEELANGS_CSV environment
# variable to point at your own copy instead of editing the notebook; the
# fallback is the original author's local path.
SEELANGS_CSV = Path(os.environ.get(
    "SEELANGS_CSV",
    "/home/ajanco/projects/slavic_review/SEELANGS/SEELANGS.csv",
))

df = pd.read_csv(SEELANGS_CSV)
# From here on `docs` holds the "text" column of the CSV, replacing whatever
# `docs` contained before this cell ran.
docs = list(df["text"])
# Display the first document as a quick sanity check of the raw content.
docs[0]

'\n<div align="left">\n<table cellpadding="0" cellspacing="0" border="0">\n<tbody><tr>\n<td><a href="javascript:print()"><img src="/archives/images/b-print.png" alt="Print" title="Print" border="0"></a></td>\n<td><img src="/archives/images/b-blank.gif" alt="" width="5" height="1"></td>\n<td nowrap=""><p><a href="javascript:print()" style="font-family: Arial, Helvetica, sans-serif; font-size: 12px; font-weight: bold; color: #3366CC; text-decoration: none">Print</a></p></td>\n</tr>\n</tbody></table>\n<hr>\n</div>\n<pre> \n\nDear SEELANGERS!\n\n \n\nI am excited to present to you the latest addition to the SRAS Online lineup. Please share with students, friends, colleagues, other departments, even family members who enjoy learning!\n\n \n\nНачинаем! Getting Started with Russian\n\nWhen: 4 meetings + asynchronous work\n\nCost: $99\n\nWhat: Looking to “sample” Russian and determine whether it is the language for you? This is a no stress, no test way for you to familiarize yourself with a la

# Create Topics
We use the **distilbert-base-nli-mean-tokens** model, as it is the model recommended for creating sentence embeddings by the authors of the [sentence-transformers](https://www.sbert.net/docs/pretrained_models.html) package. However, you can use any of the pre-trained embedding models currently available in that package.

In [4]:
# Create a BERTopic model that uses the named pre-trained embedding model.
# verbose=True presumably enables the progress log lines seen in the output.
model = BERTopic("distilbert-base-nli-mean-tokens", verbose=True)
# Fit the model on `docs` and keep what fit_transform returns.
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt,
# i.e. this run was interrupted; re-run it to completion before relying on it.
topics = model.fit_transform(docs)

2020-11-15 19:47:56,987 - BERTopic - Loaded BERT model
INFO:BERTopic:Loaded BERT model


KeyboardInterrupt: 

In [16]:
# Get most frequent topics (at most the first 10 rows of the frequency table)
model.get_topics_freq()[:10]

Unnamed: 0,Topic,Count
0,1,59051
1,-1,878
2,0,48


In [18]:
# Get a topic: the (word, score) pairs of topic 0, capped at 100 entries
model.get_topic(0)[:100]

[('2020', 0.014756688930008855),
 ('hanna', 0.01176757780860469),
 ('chuchvaha', 0.009217095125575654),
 ('https', 0.009173698782757837),
 ('volha', 0.009114942784486036),
 ('heidelberg', 0.008340001453894068),
 ('virtual', 0.008017114644771093),
 ('wixsite', 0.0070440696481119365),
 ('lesbian', 0.006728334244283046),
 ('aug', 0.006030023108767534),
 ('sras', 0.00542157678931076),
 ('aksyonov', 0.005351767001042392),
 ('zoom', 0.0053329133874397685),
 ('nafta', 0.004989702176733887),
 ('poetry', 0.004885032950130228),
 ('adjunct', 0.004722709715659518),
 ('phd', 0.0047159575324253146),
 ('online', 0.004713860817263869),
 ('august', 0.0045312911068235194),
 ('ires', 0.004507260852800454)]

In [23]:
import numpy as np
import pandas as pd

import umap
import hdbscan
from sentence_transformers import SentenceTransformer
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import matplotlib.pyplot as plt


def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """Calculate a class-based TF-IDF (c-TF-IDF) matrix.

    Every entry of ``documents`` is treated as one class: term frequencies are
    normalised by that entry's total word count, and the inverse document
    frequency is ``log(m / total count of the term over all entries)``.

    Args:
        documents: Iterable of strings, one per class.
            NOTE(review): presumably each string is all documents of one topic
            joined together; confirm against the caller.
        m: Number of total documents, used as the numerator of the IDF term.
        ngram_range: ``(min_n, max_n)`` passed on to ``CountVectorizer``.

    Returns:
        A tuple ``(tf_idf, count)`` where ``tf_idf`` is an array indexed as
        ``[term, class]`` and ``count`` is the fitted ``CountVectorizer``.
    """
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english").fit(documents)
    t = count.transform(documents).toarray()  # rows: classes, columns: terms
    w = t.sum(axis=1)  # total number of counted words per class

    # A class whose text is empty or consists only of stop words has w == 0.
    # Dividing by it would produce NaN/inf scores, so such classes get a term
    # frequency of 0 instead.
    tf = np.divide(t.T, w, out=np.zeros(t.T.shape, dtype=float), where=w != 0)

    # Every vocabulary term occurs at least once in the fitted documents, so
    # sum_t is never zero here.
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count

def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    """Return the ``n`` highest-scoring words of every topic.

    Args:
        tf_idf: 2-D array of scores indexed as ``[term, topic]``; it is
            transposed internally so that each row describes one topic.
        count: Fitted vectorizer providing the vocabulary through
            ``get_feature_names_out()`` or the older ``get_feature_names()``.
        docs_per_topic: Object with a ``Topic`` attribute listing the topic
            labels; label ``i`` is paired with column ``i`` of ``tf_idf``.
        n: Number of words to keep per topic.

    Returns:
        Dict mapping each topic label to a list of ``(word, score)`` tuples,
        ordered from highest to lowest score.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back for older versions.
    if hasattr(count, "get_feature_names_out"):
        words = list(count.get_feature_names_out())
    else:
        words = count.get_feature_names()

    labels = list(docs_per_topic.Topic)
    scores_by_topic = tf_idf.T

    # Sort ascending, reverse to descending, then keep the first n columns.
    # Unlike slicing with [:, -n:], this yields nothing (not everything) for n == 0.
    top_indices = scores_by_topic.argsort()[:, ::-1][:, :n]

    return {
        label: [(words[j], scores_by_topic[i][j]) for j in top_indices[i]]
        for i, label in enumerate(labels)
    }

def extract_topic_sizes(df):
    """Count the documents of every topic.

    Args:
        df: DataFrame with a ``Topic`` column and a ``Doc`` column.

    Returns:
        DataFrame with the columns ``Topic`` and ``Size`` (number of non-null
        ``Doc`` values per topic), sorted by ``Size`` from largest to smallest.
    """
    docs_per_label = df.groupby(['Topic'])['Doc'].count()
    size_table = docs_per_label.reset_index()
    size_table = size_table.rename(columns={"Topic": "Topic", "Doc": "Size"})
    return size_table.sort_values("Size", ascending=False)

In [20]:
!pip install matplotlib



## Model serialization
The model and its internal settings can easily be saved. Note that the documents and embeddings will not be saved. However, UMAP and HDBSCAN will be saved. 

In [None]:
# Save model under the name "my_model"
model.save("my_model")

In [None]:
# Load model back from "my_model" into a separate variable
my_model = BERTopic.load("my_model")

In [None]:
# Show at most the first 10 (word, score) pairs of topic 4 from the loaded model
my_model.get_topic(4)[:10]

[('baseball', 0.01534818753609341),
 ('players', 0.01113384693242755),
 ('cubs', 0.010651317673247482),
 ('game', 0.01064425481072388),
 ('braves', 0.010439585241772109),
 ('pitching', 0.009477156669897367),
 ('games', 0.009166144809830891),
 ('runs', 0.009154570979537589),
 ('year', 0.008982491530594413),
 ('team', 0.00894693731063402)]