In [1]:
import json
from typing import cast

import numpy as np
from bertopic import BERTopic
from hdbscan import HDBSCAN
from nptyping import Float64, Int64, NDArray, Float32, Shape
from pydantic import BaseModel
from scipy.spatial import distance
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from umap import UMAP

In [2]:
from modules.elastic import ArticleSearchQuery
from modules.objects import FullArticle
from modules.config import BaseConfig

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment before
# the configuration object is built.
load_dotenv()

# Project configuration object; later cells reach Elasticsearch through its
# `es_article_client` attribute.
# NOTE(review): presumably BaseConfig reads its settings from the environment
# populated above, which is why it must come after load_dotenv() — confirm in
# modules.config.
config_options = BaseConfig()

In [3]:
# Pull the article set out of Elasticsearch and display how many were returned.
# NOTE(review): limit=0 appears to mean "no limit" (the recorded output below
# is 33754 articles) — confirm against ArticleSearchQuery.
everything_query = ArticleSearchQuery(limit=0)
query_result = config_options.es_article_client.query_documents(everything_query, True)
articles = query_result[0]  # only the first element of the result is used
len(articles)

33754

In [4]:
# Encode every article's content into a dense sentence embedding (numpy array).
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
article_texts = [article.content for article in articles]
embeddings = embedding_model.encode(
    article_texts,
    show_progress_bar=True,
    convert_to_numpy=True,
)

Batches:   0%|          | 0/1055 [00:00<?, ?it/s]

In [None]:
# Sparse TF-IDF representation of the same article contents: English stop
# words are dropped and terms must occur in at least 5 documents.
tfidf_vectorizer = TfidfVectorizer(stop_words="english", min_df=5)
tfidf_corpus = [article.content for article in articles]
tfidf_embeddings = tfidf_vectorizer.fit_transform(tfidf_corpus)

In [None]:
# Project the sentence embeddings down to 20 dimensions; the result is the
# input handed to HDBSCAN for clustering.
# UMAP is stochastic, so random_state is pinned (same seed as the BERTopic
# `umap_model` further down) to keep the projection — and therefore the
# cluster labels derived from it — stable across kernel restarts.
cluster_embeddings = UMAP(
    n_neighbors=7,
    n_components=20,
    min_dist=0,
    metric="cosine",
    random_state=42,
).fit_transform(embeddings)

In [None]:
# Project the sentence embeddings down to 2 dimensions; these become the
# per-article "coordinates" written to articles.json.
# UMAP is stochastic, so random_state is pinned (same seed as the BERTopic
# `umap_model` further down) to make the exported coordinates reproducible
# across kernel restarts.
reduced_embeddings = UMAP(
    n_neighbors=7,
    n_components=2,
    min_dist=0,
    metric="cosine",
    random_state=42,
).fit_transform(embeddings)

In [None]:
# Density-based clustering of the 20-dimensional UMAP projection.
# The model is built first and fitted in a second step; `labels` holds one
# cluster id per row of `cluster_embeddings`.
cluster_model = HDBSCAN(
    metric="euclidean",
    min_cluster_size=5,
    min_samples=5,
    cluster_selection_epsilon=0.2,
    cluster_selection_method="eom",
    prediction_data=True,
)
labels = cluster_model.fit_predict(cluster_embeddings)

In [None]:
# Highest cluster id that HDBSCAN assigned (displayed as the cell output).
np.max(labels)

In [None]:
# Attach the 2-D position and the cluster id to each article, then dump the
# whole collection to ./articles.json.
for idx, article in enumerate(articles):
    point = reduced_embeddings[idx]
    # Values are cast to built-in float/int — presumably so the numpy scalars
    # serialize cleanly to JSON.
    article.ml["coordinates"] = (float(point[0]), float(point[1]))
    article.ml["cluster"] = int(labels[idx])

with open("./articles.json", "w") as out_file:
    serialized = [article.model_dump(mode="json") for article in articles]
    json.dump(serialized, out_file)

In [None]:
# Fit (without transforming) a 3-component UMAP model on the sentence embeddings.
# NOTE(review): `umap` is not referenced by any other cell visible in this
# notebook — confirm whether this cell is still needed.
umap = UMAP(
    n_neighbors=10,
    n_components=3,
    min_dist=0,
    metric="cosine",
).fit(embeddings)

In [None]:
# Topic modelling with BERTopic over the article contents.
# `BERTopic` must be imported (`from bertopic import BERTopic`) in the import
# cell at the top of the notebook; without it this cell raises NameError on a
# fresh kernel.
contents = [article.content for article in articles]

# Dimensionality reduction applied by BERTopic before clustering; seeded so
# topic assignments are reproducible.
umap_model = UMAP(
    n_neighbors=15,
    n_components=15,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
)

# Clustering step of the pipeline.
hdbscan_model = HDBSCAN(
    min_cluster_size=10,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)

# Bag-of-words model for the topic representations: unigrams and bigrams,
# English stop words removed, terms must appear in at least 2 documents.
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=2,
    ngram_range=(1, 2),
)

topic_model = BERTopic(
    # Pipeline models
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    # Hyperparameters
    top_n_words=10,
    verbose=True,
)

# The embeddings computed earlier are passed in so BERTopic does not have to
# re-encode every document; the second return value is discarded.
topic_numbers, _ = topic_model.fit_transform(contents, embeddings=embeddings)

In [None]:
# Build a 2-D layout of the embeddings and plot the documents by topic,
# labelled with the article titles.
# Note: this overwrites the `reduced_embeddings` computed earlier in the
# notebook (that one used n_neighbors=7).
layout_model = UMAP(metric="cosine", n_neighbors=10, n_components=2, min_dist=0.0)
reduced_embeddings = layout_model.fit_transform(embeddings)
article_titles = [article.title for article in articles]
topic_model.visualize_documents(
    article_titles,
    reduced_embeddings=reduced_embeddings,
    hide_annotations=True,
)

In [None]:
# Re-fit a 2-D UMAP layout and draw the per-topic document plot again.
# NOTE(review): this cell has the same code as the one directly before it;
# because UMAP is unseeded here the layout will differ between the two —
# confirm whether the repetition is intentional.
titles_for_plot = [article.title for article in articles]
reduced_embeddings = UMAP(
    n_neighbors=10,
    n_components=2,
    min_dist=0.0,
    metric="cosine",
).fit_transform(embeddings)
topic_model.visualize_documents(
    titles_for_plot,
    reduced_embeddings=reduced_embeddings,
    hide_annotations=True,
)