In [None]:
!pip install bertopic datasets openai datamapplot

In [None]:
from datasets import load_dataset

# Load the "train" split of the arxiv_nlp dataset directly
dataset = load_dataset("maartengr/arxiv_nlp", split="train")

In [None]:
# Display the dataset summary (feature names and number of rows)
dataset

Dataset({
    features: ['Titles', 'Abstracts', 'Years', 'Categories'],
    num_rows: 44949
})

In [None]:
# Pull the abstract and title columns out of the dataset as plain Python lists
abstracts, titles = (
    list(dataset[column]) for column in ("Abstracts", "Titles")
)

### A common Pipeline for Text Clustering

#### **1. Embedding Documents**

In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model by name
model_name = 'all-MiniLM-L6-v2'
embedding_model = SentenceTransformer(model_name)

# Encode every abstract into an embedding, showing a progress bar while it runs
embeddings = embedding_model.encode(abstracts, show_progress_bar=True)

In [None]:
# Check the shape of the embedding array produced for the abstracts
embeddings.shape

#### **2. Reducing the Dimensionality of Embeddings**

In [None]:
from umap import UMAP

# Settings for reducing the embeddings to 5 components using cosine distance;
# the random state is fixed so repeated runs use the same seed.
umap_params = dict(
    n_components=5,
    n_neighbors=15,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
umap_model = UMAP(**umap_params)

In [None]:
# Fit the UMAP model on the document embeddings and keep the reduced
# (n_components=5) representation that is clustered in the next step
reduced_embeddings = umap_model.fit_transform(embeddings)

In [None]:
# Check the shape of the reduced embeddings
reduced_embeddings.shape

#### **3. Cluster the Reduced Embeddings**

In [None]:
from hdbscan import HDBSCAN

# Fit HDBSCAN on the UMAP-reduced embeddings and extract one label per document
hdbscan_model = HDBSCAN(
    min_cluster_size=50, metric='euclidean', cluster_selection_method='eom'
).fit(reduced_embeddings)
clusters = hdbscan_model.labels_

# How many clusters did we generate?
# The label -1 marks outliers (documents assigned to no cluster), so it is
# excluded from the count.
len(set(clusters) - {-1})

### Inspecting the Clusters

In [None]:
import numpy as np

# Show the first 300 characters of (up to) the first three documents
# that were assigned to cluster 0
cluster = 0
(member_indices,) = np.where(clusters == cluster)
for index in member_indices[:3]:
    preview = abstracts[index][:300]
    print(preview + "... \n")

#### Next, we reduce our embeddings to two dimensions so that we can plot them and get a rough understanding of the generated clusters.



In [None]:
import pandas as pd

# Reduce 384-dimensional embeddings to 2 dimensions for easier visualization.
# NOTE: this overwrites the 5-component `reduced_embeddings` that was used for
# clustering; re-run the earlier UMAP cell before re-running the clustering cell.
umap_2d = UMAP(n_components=2, min_dist=0.0, metric='cosine', random_state=42)
reduced_embeddings = umap_2d.fit_transform(embeddings)

# One row per document: 2-D coordinates, title, and cluster label (as a string)
df = pd.DataFrame(reduced_embeddings, columns=["x", "y"]).assign(
    title=titles,
    cluster=list(map(str, clusters)),
)

# Split the documents into clustered points and outliers (label "-1")
is_outlier = df["cluster"] == "-1"
clusters_df = df.loc[~is_outlier, :]
outliers_df = df.loc[is_outlier, :]

### Static Plot

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots()

# Plot outliers and non-outliers separately: outliers as faint grey points,
# clustered documents colored by their cluster id
ax.scatter(outliers_df.x, outliers_df.y, alpha=0.05, s=2, c="grey")
ax.scatter(
    clusters_df.x,
    clusters_df.y,
    c=clusters_df.cluster.astype(int),
    alpha=0.6,
    s=2,
    cmap='tab20b',
)
ax.axis('off')

# Save the graph as a .png (comment this line out to skip writing the file)
fig.savefig("matplotlib.png", dpi=300)

### From Text Clustering to Topic Modeling

### BERTopic: A Modular Topic Modeling Framework


In [None]:
from bertopic import BERTopic

# Reuse the embedding, UMAP and HDBSCAN models defined earlier as the
# building blocks of the topic model
bertopic_components = {
    "embedding_model": embedding_model,
    "umap_model": umap_model,
    "hdbscan_model": hdbscan_model,
}

# Fit on the abstracts, passing in the embeddings that were already computed
topic_model = BERTopic(verbose=True, **bertopic_components).fit(
    abstracts, embeddings=embeddings
)

In [None]:
# Show summary information about the topics in the fitted model
topic_model.get_topic_info()

In [None]:
# Inspect the representation of topic 0
topic_model.get_topic(0)

In [None]:
# Search the fitted model for topics related to the query "topic modeling"
topic_model.find_topics("topic modeling")

In [None]:
# Inspect topic 22.
# NOTE(review): the id 22 is hard-coded; presumably it is the best match
# returned by the find_topics search above. Topic ids may differ between
# runs/models, so confirm against that output.
topic_model.get_topic(22)

In [None]:
# Look up the topic assigned to the BERTopic paper, located by its exact title
bertopic_paper_title = 'BERTopic: Neural topic modeling with a class-based TF-IDF procedure'
topic_model.topics_[titles.index(bertopic_paper_title)]

#### It is! We expected it might be because there are non-LDA-specific words in the topic description, such as "clustering" and "topic".

### Visualizations
### Visualize Documents

In [None]:
# Visualize topics and documents, labelling each document by its title and
# positioning it with the 2-D reduced embeddings
document_plot_options = dict(
    reduced_embeddings=reduced_embeddings,
    width=1200,
    hide_annotations=True,
)
fig = topic_model.visualize_documents(titles, **document_plot_options)

# Use a larger font so the figure is easier to read
fig.update_layout(font={"size": 16})