In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Loading

In [None]:

from json import loads

# JSON Lines export of the articles: one JSON object per line.
DATA_FILE = "newspapers_filtered_2024-04-30_18-17-52.jsonl"

# Parse the file line by line. The context manager closes the handle as soon
# as parsing is done, and iterating the handle directly avoids holding a second
# full copy of the file in memory. Whitespace-only lines (e.g. a trailing
# newline) are skipped because `loads` would raise on them.
with open(DATA_FILE, "r", encoding="utf-8") as data_file:
    data = [loads(line) for line in data_file if line.strip()]

print(f"Loaded {len(data)} records")

# One DataFrame row per parsed record.
df = pd.DataFrame(data)

# Convert the "date" column to pandas datetimes so it can be sorted, compared
# and turned into a numeric feature later on.
df["date"] = pd.to_datetime(df["date"])


In [None]:
# The embedding matrix lives next to the data file, under the same name with
# an "_embeddings.npy" suffix.
EMBEDDINGS_FILE = f"{DATA_FILE}_embeddings.npy"
embeddings = np.load(EMBEDDINGS_FILE)

# Store one entry of the loaded array per DataFrame row, by position.
# NOTE(review): presumably entry i is the embedding of record i in DATA_FILE;
# confirm against the script that produced the .npy file.
df["embedding"] = list(embeddings)

print(f"Loaded {len(embeddings)} embeddings")

In [None]:
# Sanity check: display the first record (all columns, including the embedding).
df.iloc[0]

# HDBSCAN

In [None]:
import hdbscan

# Passed to HDBSCAN as `min_cluster_size`; the UMAP cell below also reuses it
# as the number of reduced dimensions (K).
MIN_CLUSTER_SIZE = 10

## Dimension reduction using UMAP

In [None]:
from umap import UMAP

# Number of dimensions kept after reduction (tied to MIN_CLUSTER_SIZE).
K = MIN_CLUSTER_SIZE

# Reducer with a fixed random_state.
umap_reducer = UMAP(n_components=K, random_state=42)

# Fit the reducer on all article embeddings and project them in one step.
embedding_rows = df['embedding'].tolist()
umap_embeddings = umap_reducer.fit_transform(embedding_rows)

# Keep each reduced vector as a plain Python list, in a column named after K.
reduced_column = f'umap_embedding{K}'
df[reduced_column] = umap_embeddings.tolist()

## Adding the date as a clustering feature

In [None]:
# Order the articles chronologically; the benchmark further down reads the
# first row's date as the start of its time windows.
df = df.sort_values("date")

In [None]:
# Turn each timestamp into a float so it can sit next to the UMAP coordinates.
# NOTE(review): assumes the column is datetime64[ns], i.e. the integer view is
# nanoseconds since the Unix epoch - confirm for the pandas version in use.
DATE_SCALE = 10**18  # Adjust the scaling factor as needed
df['date_feature'] = df['date'].astype('int64') / DATE_SCALE

### Set features column

In [None]:
# Combine the UMAP embedding and date feature: each row gets its reduced
# vector (a list) with the scalar date feature appended as one extra dimension.
# A zip-based comprehension pairs the two columns directly; row-wise
# DataFrame.apply(axis=1) would build a full Series per row just to read two
# cells, which is needlessly slow on a frame of this size.
df['combined_features'] = [
    reduced + [date_value]
    for reduced, date_value in zip(df[f'umap_embedding{K}'], df['date_feature'])
]

### Perform HDBSCAN clustering

In [None]:
# Cluster the combined (UMAP + date) vectors and store one label per article.
# The analysis cells below treat label -1 as the noise cluster.
feature_rows = df['combined_features'].tolist()
model = hdbscan.HDBSCAN(
    min_cluster_size=MIN_CLUSTER_SIZE,
    metric='euclidean',
    cluster_selection_method='eom',
)
labels = model.fit_predict(feature_rows)
df['cluster'] = labels

# Analyse clusters

In [22]:
# Display the date and title of every article in the largest cluster,
# ignoring label -1 (noise).

non_noise = df[df['cluster'] != -1]
biggest_cluster = non_noise.value_counts('cluster').idxmax()

in_biggest = df['cluster'] == biggest_cluster
df.loc[in_biggest, ['date', 'title']].sort_values('date')

Unnamed: 0,date,title
4547,2022-02-11,Billie Holiday et les grandes voix du jazz
66565,2022-02-12,Monbazillac : du théâtre pour prévenir le sexi...
69953,2022-02-13,La danseuse soufie Rana Gorgani a fait tourner...
57778,2022-02-14,Saint-Symphorien : Rag Mama Rag en concert au ...
39864,2022-02-15,Bordeaux : la première création d’un duo du Co...
...,...,...
68261,2024-02-06,« Je savoure comme je n’ai jamais savouré » : ...
65550,2024-02-07,"Vidéos. Chanson : Hildebrandt, Emilie Marsh et..."
39587,2024-02-11,Le Taillan-Médoc : Pascal Viau et la chorale E...
10292,2024-02-11,Opéra de Bordeaux : « situation sociale alarma...


In [23]:
# For the first 100 cluster labels (in order of first appearance in df), print
# the number of articles, the covered date range and the 5 earliest titles.

for cluster in df['cluster'].unique()[:100]:
    # Filter once per cluster and reuse the result for every statistic below.
    members = df[df['cluster'] == cluster]
    min_date = members['date'].min()
    max_date = members['date'].max()
    duration = max_date - min_date

    print(f"Cluster {cluster} ({len(members)} articles, from {min_date} to {max_date}, duration: {duration})")
    for title in members.sort_values("date")["title"].head():
        print(f"  {title}")
    print()

Cluster -1 (43905 articles, from 2022-02-11 00:00:00 to 2024-02-13 00:00:00, duration: 732 days 00:00:00)
  Carignan-de-Bordeaux : schéma directeur vélo et équipements sportifs étaient au menu du conseil municipal
  Esnandes : les alchimies oniriques de Patricia Gorbaty
  Aveyron : des dizaines d'animaux morts retrouvés morts dans une ferme
  Centrès : pour l’association du château de Taurines, une nouvelle saison s’éveille
  Castelbaladins : les spectacles continuent malgré la crise

Cluster 459 (61 articles, from 2022-02-11 00:00:00 to 2024-02-08 00:00:00, duration: 727 days 00:00:00)
  Carcassonne - Richard Anconina au théâtre Jean-Alary : "Je me voyais mal parler fort sur scène devant des gens"
  Carcassonne : des poids lourds de l'équipement viticole à Bezons
  Carcassonne : quand la campagne de don de sang se drape de gastronomie et de gourmandise
  Carcassonne : Nicolas Dupont-Aignan en visite ce vendredi sur le thème du tourisme
  Carcassonne : les commerçants des halles associ

# Analyse HDBSCAN complexity

In [None]:
import time

# Minimum cluster size used by the benchmark runs below.
MIN_CLUSTER_SIZE = 10

# Ten window lengths (in days), log-spaced between 10**1 and 10**1.6 and
# truncated to integers.
time_periods = np.logspace(1, 1.6, num=10).astype(int)

# Elapsed seconds per window; filled in by the benchmarking loop.
time_taken = list()

print(f"Time periods: {time_periods}")

In [None]:
# Perform clustering for each time period: run HDBSCAN on the articles of the
# first `time_period` days and record how long the clustering itself takes.

# Start from an empty list so that re-running this cell does not append to the
# measurements of a previous run (which would break the plot against
# `time_periods`).
time_taken = []

# Earliest article date; using min() keeps this correct even if df is not
# sorted by date at this point.
first_date = df['date'].min()

for time_period in time_periods:
    window_end = first_date + pd.DateOffset(days=int(time_period))

    # .copy() makes the window an independent frame, so writing the labels
    # below neither warns (SettingWithCopyWarning) nor risks touching df.
    df_restricted = df[df['date'] < window_end].copy()
    features = df_restricted['combined_features'].tolist()

    model = hdbscan.HDBSCAN(min_cluster_size=MIN_CLUSTER_SIZE, metric='euclidean', cluster_selection_method='eom')

    # Time only the clustering call, with a monotonic high-resolution clock,
    # so DataFrame filtering overhead does not pollute the measurement.
    start_time = time.perf_counter()
    labels = model.fit_predict(features)
    time_taken.append(time.perf_counter() - start_time)

    df_restricted['cluster'] = labels
    # The cluster count includes the noise label (-1) when it is present.
    print(f"Time period: {time_period} days, time taken: {time_taken[-1]:.2f} s, clusters: {len(df_restricted['cluster'].unique())}")


In [None]:
# Plot the measured clustering time against the length of the time window.

fig, ax = plt.subplots()
ax.plot(time_periods, time_taken)
ax.set_xlabel('Time period (days)')
ax.set_ylabel('Time taken (s)')
ax.set_title('Time taken for clustering')
plt.show()


In [None]:
# Display the window lengths (in days) that were benchmarked above.
time_periods