# Data Generation for Topic Labeling using Top2Vec (Gutenberg version)

by Andreas Sünder

## Setup

In [5]:
# Model id handed to AutoModel / AutoTokenizer.from_pretrained.
emb_model_id = "jinaai/jina-embeddings-v2-base-de"
# Directory that is listed for the book_<n>.txt input files.
books_path = ""
# Maximum number of tokens per text chunk.
limit = 2048
# Stop-word setting passed to CountVectorizer; the value 'german' is
# replaced by a word list read from german_stopwords.txt before use.
stop_words = "english"

## Setup Model

In [None]:
from transformers import AutoModel, AutoTokenizer
import torch

# Load the tokenizer and the embedding model for the configured model id.
# trust_remote_code=True permits running the custom code shipped with the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(emb_model_id, trust_remote_code=True)
model = AutoModel.from_pretrained(
  emb_model_id,
  trust_remote_code=True,
  device_map='auto',
)

## Convert documents to vectors

In [None]:
import os
from datasets import Dataset
import re

def process_texts():
  """Yield one record per token-limited chunk of every book in ``books_path``.

  Files named ``book_<digits>.txt`` are read, the text between the
  ``** START`` marker line and the ``** END`` marker is kept, the text is
  split into chunks of at most ``limit`` tokens and every chunk is embedded
  with ``model.encode``. A missing start marker keeps the text from its
  beginning, a missing end marker keeps it up to its end.

  Yields:
    dict with the keys ``"text"`` (the decoded chunk) and ``"embedding"``
    (the value returned by ``model.encode`` for that chunk).
  """
  for filename in os.listdir(books_path):
    # The dot is escaped so only a literal ".txt" suffix matches.
    if re.fullmatch(r'book_\d*\.txt', filename) is None:
      continue

    with open(os.path.join(books_path, filename), 'r') as file:
      try:
        text = file.read()
      except (OSError, UnicodeDecodeError):
        # Best effort: books that cannot be read or decoded are skipped.
        continue

    # str.find returns -1 for a missing marker; using that value as a slice
    # position would silently drop or mangle the text, so handle it explicitly.
    marker_pos = text.find("** START")
    if marker_pos == -1:
      index_start = 0
    else:
      line_break = text.find("\n", marker_pos)
      # A marker on the very last line means there is no body after it.
      index_start = len(text) if line_break == -1 else line_break + 1

    # Only look for the end marker behind the start of the body.
    index_end = text.find("** END", index_start)
    if index_end == -1:
      index_end = len(text)
    text = text[index_start:index_end]

    if not text.strip():
      continue

    # Special tokens are left out so they do not leak into the decoded chunks
    # (and from there into the embeddings and the topic vocabulary).
    batch_dict = tokenizer(text, return_tensors="pt", add_special_tokens=False)
    # Index the single batch row instead of squeeze(): squeeze() would turn a
    # one-token text into a 0-d tensor that has no len().
    tokens = batch_dict["input_ids"][0]

    for offset in range(0, len(tokens), limit):
      part = tokenizer.decode(tokens[offset : offset + limit])
      # Only the encoding runs under no_grad. Yielding inside the context
      # would leave gradients disabled for the consumer while the generator
      # is suspended.
      with torch.no_grad():
        embedding = model.encode(part, show_progress_bar=False)
      yield {"text": part, "embedding": embedding}

# Build the dataset from the records yielded by process_texts.
dataset = Dataset.from_generator(process_texts)

## UMAP

In [None]:
from umap import UMAP

# Project the chunk embeddings down to 5 dimensions (cosine metric);
# the result is the input for the clustering step.
umap = UMAP(
  n_neighbors=15,
  n_components=5,
  metric='cosine',
  verbose=True,
)
umap_embeddings = umap.fit_transform(dataset['embedding'])

## HDBSCAN

In [None]:
from hdbscan import HDBSCAN

# Cluster the reduced embeddings; the fitted estimator is kept as `cluster`.
hdb = HDBSCAN(
  min_cluster_size=10,
  metric='euclidean',
  cluster_selection_method='eom',
)
cluster = hdb.fit(umap_embeddings)

## Visualization

In [None]:
import matplotlib.pyplot as plt

# Second, independent projection to 2 dimensions, used only for plotting.
umap = UMAP(
  n_neighbors=15,
  n_components=2,
  metric='cosine',
  verbose=True,
)
umap_data = umap.fit_transform(dataset['embedding'])

In [None]:
import pandas as pd

# Pair every 2-D point with its cluster label, then split the frame into
# points labelled -1 (outliers) and all remaining (clustered) points.
result = pd.DataFrame(umap_data, columns=['x', 'y']).assign(labels=cluster.labels_)
outliers = result[result['labels'] == -1]
clustered = result[result['labels'] != -1]

In [None]:
# Summarise the clustering result.
total = len(result)
# The label -1 marks outliers (see the split into `outliers` / `clustered`),
# so it must not be counted as a cluster of its own.
n_clusters = len(set(cluster.labels_) - {-1})
print(f'Number of clusters: {n_clusters}')
print(f'Ratio clustered/outliers: {len(clustered)/total:.1f}/{len(outliers)/total:.1f}')

In [None]:
# Scatter plot of the 2-D projection: outliers in grey, clustered points
# coloured by their cluster label.
fig, ax = plt.subplots()
ax.scatter(outliers['x'], outliers['y'], s=0.5, color='#BDBDBD')
ax.scatter(
  clustered['x'],
  clustered['y'],
  s=1,
  c=clustered['labels'],
  cmap='turbo',
)
plt.show()

## Topic Extraction

In [None]:
# Optional: save the clustered dataset (2-D coordinates, chunk text, topic label).
# The records produced by process_texts only carry "text" and "embedding";
# there is no "title" column, so the chunk text is exported instead.
dataset_clustered = pd.DataFrame(umap_data, columns=['x', 'y'])
dataset_clustered['text'] = dataset['text']
dataset_clustered['topic'] = cluster.labels_
dataset_clustered['topic'] = dataset_clustered['topic'].map('Topic {}'.format)
dataset_clustered.to_csv('mn-ds_clustered.csv', index=False)

In [None]:
import pandas as pd
# Drop the embeddings, attach the cluster label to every chunk, discard the
# chunks labelled -1 and join the remaining chunks of each topic into one
# space-separated document.
df = dataset.remove_columns(['embedding']).to_pandas()

docs_df = pd.DataFrame(df, columns=['text'])
docs_df['topic'] = cluster.labels_
docs_df = docs_df.loc[docs_df['topic'] != -1]
docs_per_topic = (
  docs_df
  .groupby(['topic'], as_index=False)
  .agg({'text': ' '.join})
)

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# The setting 'german' is replaced by the word list stored in
# german_stopwords.txt (one entry per line). The file is opened in a `with`
# block so the handle is closed again.
# NOTE(review): assumes the word list is UTF-8 encoded - confirm.
if stop_words == 'german':
  with open('german_stopwords.txt', 'r', encoding='utf-8') as stop_words_file:
    stop_words = stop_words_file.read().splitlines()

def c_tf_idf(documents, m, ngram_range=(1, 2)):
  """Compute class-based TF-IDF scores for the given documents.

  The module-level ``stop_words`` setting is passed to the vectorizer.

  Args:
    documents: sequence of strings (the caller passes one joined document
      per topic).
    m: numerator of the IDF term; it is divided by the total count of each
      term (the caller passes the number of entries in the dataset).
    ngram_range: n-gram range forwarded to ``CountVectorizer``.

  Returns:
    Tuple ``(tf_idf, count)``. ``tf_idf`` is a dense array with one row per
    vocabulary term and one column per entry of ``documents``; ``count`` is
    the fitted ``CountVectorizer``.
  """
  count = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words).fit(documents)
  # uint32 instead of uint8: a uint8 cast silently wraps every count above
  # 255, which long joined documents exceed for frequent terms.
  t = count.transform(documents).astype(np.uint32).toarray()
  w = t.sum(axis=1)
  # A document without any vocabulary term has w == 0; give it a term
  # frequency of 0 instead of the NaN that 0/0 would produce.
  tf = np.divide(t.T, w, out=np.zeros(t.T.shape, dtype=float), where=w != 0)
  sum_t = t.sum(axis=0)
  idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
  tf_idf = np.multiply(tf, idf)

  return tf_idf, count
  
# Score the joined per-topic documents; m is the number of entries in the whole dataset.
tf_idf, count = c_tf_idf(docs_per_topic.text.values, m=len(dataset))

In [None]:
def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
  """Return the ``n`` highest-scoring terms for every topic.

  Args:
    tf_idf: score array with one row per term and one column per topic.
    count: fitted vectorizer providing ``get_feature_names_out()``.
    docs_per_topic: object whose ``topic`` attribute lists the topic labels
      in the column order of ``tf_idf``.
    n: number of terms to keep per topic.

  Returns:
    dict mapping each topic label to a list of ``(term, score)`` tuples,
    highest score first.
  """
  vocabulary = count.get_feature_names_out()
  scores_by_topic = tf_idf.T
  # argsort is ascending, so the last n columns hold the best terms;
  # reversing them yields best-first order.
  best_first = scores_by_topic.argsort()[:, -n:][:, ::-1]

  top_n_words = {}
  for row, label in enumerate(list(docs_per_topic.topic)):
    top_n_words[label] = [
      (vocabulary[col], scores_by_topic[row][col]) for col in best_first[row]
    ]
  return top_n_words

# Collect the best terms per topic with the default n=20.
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic)

In [None]:
# Print the first ten terms of every topic, one line per topic.
# The lookup uses the indices 0..len-1 as topic labels.
for topic in range(len(top_n_words)):
  print(', '.join(word for word, _ in top_n_words[topic][:10]))