# Topic Extraction of News Articles

by Andreas Sünder

## Load Data

In [None]:
from datasets import load_dataset
# Load the training split as a DataFrame and keep only the two text columns
# used by the rest of the notebook.
df = (
  load_dataset('textminr/topic-labeling', 'mn-ds', split='train')
  .to_pandas()
  [['title', 'content']]
)

## Setup Model and Convert Documents to Vectors

In [None]:
import torch
from transformers import AutoModel

# Load the Jina embedding model used to turn each article into a vector.
model = AutoModel.from_pretrained(
  'jinaai/jina-embeddings-v2-small-en',
  # The checkpoint ships its own modelling code, which must be allowed to run.
  trust_remote_code=True,
  # Let the loader decide where to place the weights.
  device_map='auto',
  # Half-precision tensors combined with 4-bit loading.
  torch_dtype=torch.float16,
  load_in_4bit=True,
)

Plot the distribution of content lengths and filter out the largest documents:

In [None]:
# Character counts of every article shorter than 40,000 characters.
lengths = df.loc[
  df['content'].str.len() < 40000,
  'content',
].str.len()
# Histogram of those lengths in 100 bins.
lengths.hist(bins=100)

In [None]:
# Keep only articles shorter than 40,000 characters and renumber the rows
# from 0 so positional and label-based indexing agree.
# reset_index is chained (not inplace) so that df_filtered is a fresh frame
# rather than a slice of df; adding columns to it later then does not raise
# SettingWithCopyWarning.
df_filtered = df[df['content'].str.len() < 40000].reset_index(drop=True)

In [None]:
from tqdm import tqdm

# Encode every article, one at a time, collecting one vector per row.
embeddings = []
for position, content in enumerate(tqdm(df_filtered['content'])):
  # Release cached GPU memory every 500 documents (including the first).
  if position % 500 == 0:
    torch.cuda.empty_cache()

  vector = model.encode(content)
  embeddings.append(vector)

# Final cleanup once all documents have been encoded.
torch.cuda.empty_cache()

In [None]:
# Attach the embeddings to the frame as a new column, one entry per row
# (the list was built by iterating df_filtered in row order).
df_filtered['embeddings'] = embeddings

In [None]:
import pandas as pd

# Cache round-trip for the expensive embedding step.
# - If the embeddings were computed in this session, write them to disk.
# - Otherwise (fresh kernel, earlier cells skipped) restore them from disk.
if 'df_filtered' in globals():
  # Persist 'content' as well: the topic-extraction cells join the raw text
  # per topic, so a reload without it cannot reproduce the results.
  df_filtered[['title', 'content', 'embeddings']].to_pickle('mn-ds.pkl')
else:
  # NOTE(review): unpickling executes arbitrary code; only load a file that
  # was produced by this notebook.
  df_filtered = pd.read_pickle('mn-ds.pkl')
  # The UMAP cells read the module-level `embeddings` list, which does not
  # exist when the encoding cell was skipped, so rebuild it from the column.
  embeddings = df_filtered['embeddings'].tolist()

## UMAP

In [None]:
from umap import UMAP

# Reduce the document embeddings to 5 dimensions before clustering.
umap_embeddings = UMAP(
  metric='cosine',
  n_components=5,
  n_neighbors=15,
  verbose=True,
).fit_transform(embeddings)

## HDBSCAN

In [None]:
from hdbscan import HDBSCAN

# Cluster the reduced embeddings; the fitted estimator exposes `labels_`,
# which the later cells read (-1 is treated there as the outlier label).
cluster = HDBSCAN(
  metric='euclidean',
  min_cluster_size=15,
  cluster_selection_method='eom',
).fit(umap_embeddings)

## Visualization

In [None]:
import matplotlib.pyplot as plt

# Separate 2-D projection of the same embeddings, used only for plotting.
umap_data = UMAP(
  metric='cosine',
  n_components=2,
  n_neighbors=15,
  verbose=True,
).fit_transform(embeddings)

In [None]:
# Pair every 2-D point with the cluster label of its document.
result = pd.DataFrame(umap_data, columns=['x', 'y']).assign(labels=cluster.labels_)
# Split the points into outliers (label -1) and clustered documents.
outliers = result[result['labels'].eq(-1)]
clustered = result[result['labels'].ne(-1)]

In [None]:
# Draw outliers in grey underneath, then clustered points coloured by label.
fig, ax = plt.subplots()
ax.scatter(x=outliers['x'], y=outliers['y'], s=0.5, color='#BDBDBD')
ax.scatter(
  x=clustered['x'],
  y=clustered['y'],
  s=1,
  c=clustered['labels'],
  cmap='hsv_r',
)
plt.show()

## Topic Extraction

In [None]:
# Tag each document with its cluster label.
docs_df = pd.DataFrame(df_filtered, columns=['title', 'content']).assign(
  topic=cluster.labels_
)
# Discard documents labelled -1.
docs_df = docs_df[docs_df['topic'].ne(-1)]
# Merge all texts of a topic into one space-separated document per topic.
docs_per_topic = (
  docs_df
  .groupby(['topic'], as_index=False)
  .agg({'content': ' '.join})
)

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """Compute class-based TF-IDF scores for a set of per-topic documents.

    Each entry of ``documents`` is treated as one class: term counts are
    normalised by the class's total token count and weighted by
    ``log(m / total count of the term across all classes)``.

    Args:
        documents: Sequence of strings, one merged document per topic.
        m: Numerator of the IDF term.
        ngram_range: Forwarded to ``CountVectorizer``.

    Returns:
        A ``(tf_idf, count)`` tuple: the score matrix with one row per term
        and one column per document, and the fitted ``CountVectorizer``.
    """
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english").fit(documents)
    t = count.transform(documents).toarray()

    # Total number of counted tokens in each document.
    w = t.sum(axis=1)
    # A document made up solely of stop words has w == 0; leave its term
    # frequencies at 0 instead of producing NaN from 0 / 0, which would
    # corrupt any later ranking of the scores.
    tf = np.divide(t.T, w, out=np.zeros(t.T.shape, dtype=float), where=w != 0)

    # Every vocabulary term occurs at least once in the fitted documents,
    # so sum_t is never zero.
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count
  
# m is the number of documents that were actually embedded and clustered,
# i.e. the length-filtered corpus, not the unfiltered `df`.
tf_idf, count = c_tf_idf(docs_per_topic.content.values, m=len(df_filtered))

In [None]:
def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    """Return the highest-scoring words of every topic.

    Args:
        tf_idf: Score matrix with one row per term and one column per topic.
        count: Fitted vectorizer providing ``get_feature_names_out()``.
        docs_per_topic: Object whose ``topic`` attribute lists the topic
            labels in the same order as the columns of ``tf_idf``.
        n: Maximum number of words to keep per topic.

    Returns:
        Dict mapping each topic label to a list of ``(word, score)`` pairs,
        ordered from highest to lowest score.
    """
    vocabulary = count.get_feature_names_out()
    scores_by_topic = tf_idf.T
    # argsort is ascending, so the last n columns hold the best terms.
    ranked = scores_by_topic.argsort()[:, -n:]

    top_n_words = {}
    for row, label in enumerate(list(docs_per_topic.topic)):
        best_first = ranked[row][::-1]
        top_n_words[label] = [
            (vocabulary[col], scores_by_topic[row][col]) for col in best_first
        ]
    return top_n_words

# Keep the 20 best-scoring words for every topic.
top_n_words = extract_top_n_words_per_topic(
  tf_idf,
  count,
  docs_per_topic,
  n=20,
)

In [None]:
# Print the ten best words of each topic, one topic per line.
# Iterating the dict's own items avoids assuming the topic labels are
# exactly 0..k-1, which would raise KeyError for any other label set.
for topic, words in top_n_words.items():
  print(', '.join(word for word, _ in words[:10]))