In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Loading

## Load cleaned data

In [2]:

# Path of the cleaned articles file; it is read as one JSON document per line.
DATA_FILE = "newspapers_filtered_2024-04-30_18-17-52.jsonl"

from json import loads

# Read inside a context manager so the file handle is closed deterministically,
# and iterate the handle directly instead of materialising every line first.
# Blank lines (e.g. a trailing newline) are skipped because `loads` would
# raise on them.
with open(DATA_FILE, "r", encoding="utf-8") as data_file:
    data = [loads(line) for line in data_file if line.strip()]

print(f"Loaded {len(data)} records")

df = pd.DataFrame(data)

# Convert the "date" column to datetimes so the `.dt` accessor works later on.
df["date"] = pd.to_datetime(df["date"])


Loaded 84593 records


## Load embeddings

In [3]:
# The embeddings file name is the data file name with "_embeddings.npy" appended.
EMBEDDINGS_FILE = DATA_FILE + "_embeddings.npy"

embeddings = np.load(EMBEDDINGS_FILE)

# Report how many entries were read along the first axis.
n_embeddings = len(embeddings)
print(f"Loaded {n_embeddings} embeddings")

Loaded 84593 embeddings


Add embeddings to dataframe

In [4]:
# Store one entry of `embeddings` (taken along its first axis) per DataFrame row.
df["embedding"] = list(embeddings)

In [5]:
# Display the first row of `df` as a quick visual check of its columns.
df.iloc[0]

title             International de Sète : la pétanque, une affai...
text              L'international de pétanque de Sète, avait lie...
date                                            2022-03-07 00:00:00
article_id                                                  2047761
article_url       https://france3-regions.francetvinfo.fr/occitanie
article_domain                      france3-regions.francetvinfo.fr
embedding         [-0.02630615234375, 0.0106658935546875, 0.0487...
Name: 0, dtype: object

# Clustering

In [None]:
from sklearn.cluster import KMeans

# Fixed seed: KMeans starts from randomly chosen centroids, so without it the
# cluster ids change on every run and a hard-coded cluster number inspected
# afterwards would point at a different group each time.
RANDOM_STATE = 42

# Partition all articles into 50 clusters based on their embedding vectors.
model = KMeans(n_clusters=50, max_iter=1000, random_state=RANDOM_STATE)
model.fit(df['embedding'].to_list())

# One label per row, in the same order as the rows that were fitted.
df["cluster"] = model.labels_

Let's show one random cluster

In [None]:
# Print up to 100 titles belonging to cluster 23, separated by blank lines.
cluster_23_titles = df.loc[df["cluster"] == 23, "title"].head(100)
print(*cluster_23_titles, sep='\n\n')

We can clearly see the subjects of each cluster. One can be about violence and crime, another about politics, another about sports, etc.

We found a cluster that is about rugby only. What is remarkable about this embedding approach is that it groups related articles even when they do not share the same words.
Example: `Pro D2. FCG - Nevers : « On a toutes les cartes pour y répondre comme il faut », l'Ardéchois Luka Plataret veut réussir un coup au Stade des Alpes` does not contain the word `rugby` but is still in the rugby cluster.

# Grouping by week

The goal of this section is to group articles by week and find the most important topics of each week.

For this, we will use the following approach :
- For each week, we will find the most important clusters
- We will then try to find the subject of each cluster

### Adding a year-week column to the dataframe.

In [None]:

# Label every article with its ISO-8601 year and week number ("%G-%V").
# The previous "%Y-%U" format restarts the week counter on 1 January, so the
# week straddling New Year was split into a short "-00" group and a short
# "-52"/"-53" group. ISO weeks always span a full Monday-Sunday range, and the
# "YYYY-WW" label format is unchanged.
df["year_week"] = df["date"].dt.strftime("%G-%V")

# A row with a missing date gets a missing label and would silently drop out
# of the per-week groupby, so fail loudly instead.
assert df["year_week"].notna().all(), "some articles have no year_week label"

### Plot the number of articles per week

In [None]:
# Bar chart of the number of articles in each week. Bars follow the order of
# `value_counts()`, i.e. from the week with the most articles to the fewest.
fig, ax = plt.subplots(figsize=(30, 10))
week_order = df["year_week"].value_counts().index
sns.countplot(data=df, x="year_week", order=week_order, ax=ax)
ax.tick_params(axis="x", labelrotation=45)
ax.set_title("Number of articles per week")
plt.show()

### Clustering

In [None]:
grouped_data = df.groupby("year_week")

# Cluster each week's articles independently with KMeans.
for week, group in grouped_data:
    print(f"Processing week {week}")

    # Aim for roughly 10 articles per cluster, but never request 0 clusters:
    # a week with fewer than 10 articles would otherwise make KMeans raise.
    N_CLUSTERS = max(1, len(group) // 10)

    # Seeded so the per-week labels are reproducible between runs.
    model = KMeans(n_clusters=N_CLUSTERS, max_iter=1000, random_state=42)
    model.fit(group['embedding'].to_list())

    # `group.index` addresses exactly the rows that were fitted, in label
    # order, without rebuilding a full-length boolean mask for every week.
    # NOTE(review): relies on `df` having a unique index — confirm.
    df.loc[group.index, "cluster"] = model.labels_

### Check one cluster

In [None]:
# Keep only the articles labelled with week '2022-11'.
week = df.loc[df["year_week"] == '2022-11']

# The cluster label that occurs most often within that week.
cluster_counts = week["cluster"].value_counts()
biggest_cluster = cluster_counts.idxmax()

# Show up to 100 titles from that cluster.
week.loc[week["cluster"] == biggest_cluster, "title"].head(100)


In [None]:
# For every week, print the titles of its two most populated clusters.
for week, group in grouped_data:
    print(f"Processing week {week}")
    top_two = group["cluster"].value_counts().head(2).index
    for cluster in top_two:
        in_cluster = group["cluster"] == cluster
        print(f"Cluster {cluster}")
        print(*group.loc[in_cluster, "title"].head(100), sep='\n\n')
        print("\n\n")

The biggest clusters created by KMeans do not always represent the most important topics of the week.
We need an algorithm that is indifferent to the size of the clusters.

We should use DBSCAN or hierarchical clustering.

# Hierarchical clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering


grouped_data = df.groupby("year_week")

N_CLUSTERS = 50
# Linkage criterion handed to AgglomerativeClustering. It is kept at 'ward',
# the value the model was actually built with before: the constant used to be
# set to 'single' but was never passed to the model.
# Alternatives to try: 'single', 'complete', 'average'.
LINKAGE = 'ward'


# Cluster each week's articles independently. The model is created and fitted
# in the same iteration; previously a first loop only built throwaway models
# and printed every week a second time.
for week, group in grouped_data:
    print(f"Processing week {week}")

    if len(group) < 2:
        # Nothing to merge with a single article: give it its own cluster.
        df.loc[group.index, "cluster"] = 0
        continue

    # A week cannot be split into more clusters than it has articles.
    n_clusters = min(N_CLUSTERS, len(group))
    model = AgglomerativeClustering(n_clusters=n_clusters, linkage=LINKAGE)
    labels = model.fit_predict(group['embedding'].to_list())

    # NOTE(review): relies on `df` having a unique index — confirm.
    df.loc[group.index, "cluster"] = labels


In [None]:

# Per-week report: titles of the four most populated clusters.
for week, group in grouped_data:
    print(f"Processing week {week}")

    # value_counts() lists clusters from most to fewest articles.
    counts_by_cluster = group["cluster"].value_counts()
    biggest_clusters = counts_by_cluster.head(4).index

    for cluster in biggest_clusters:
        in_cluster = group["cluster"] == cluster
        print(f"Cluster {cluster}")
        print(*group.loc[in_cluster, "title"].head(100), sep='\n\n')
        print("\n\n --- \n\n")


# Using DBSCAN

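Since KMeans and hierarchical clustering both require choosing the number of clusters in advance, we now try a density-based method. The cells below first reduce the embeddings with PCA and then cluster each week with OPTICS, a generalization of DBSCAN.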



### Dimension reduction



In [None]:
from sklearn.decomposition import PCA

# Reduce dimensionality using PCA
N_COMPONENTS = 25  # Choose the number of components based on your data

# Seeded for reproducibility: depending on the scikit-learn version and the
# data shape, the default solver selection can pick a randomized SVD, which
# would make the projection vary slightly between runs.
pca = PCA(n_components=N_COMPONENTS, random_state=42)
reduced_embeddings = pca.fit_transform(df['embedding'].tolist())

# Update the DataFrame with the reduced embeddings
# (one list of N_COMPONENTS values per row).
df['reduced_embedding'] = reduced_embeddings.tolist()


In [None]:
from sklearn.cluster import OPTICS


MIN_SAMPLES = 10
MAX_EPS = 10

# Regroup here so the groups are built from the current `df`, including the
# `reduced_embedding` column, rather than relying on a groupby object created
# in an earlier cell.
grouped_data = df.groupby("year_week")

# Cluster each week's reduced embeddings independently with OPTICS.
for week, group in grouped_data:
    print(f"Processing week {week}")

    if len(group) < MIN_SAMPLES:
        # OPTICS rejects a min_samples larger than the number of samples.
        # Such a week cannot contain a cluster, so mark all of it as noise.
        df.loc[group.index, "cluster"] = -1
        continue

    model = OPTICS(min_samples=MIN_SAMPLES, max_eps=MAX_EPS, metric='cosine')
    labels = model.fit_predict(group['reduced_embedding'].tolist())

    # NOTE(review): relies on `df` having a unique index — confirm.
    df.loc[group.index, "cluster"] = labels


In [None]:
# Show the number of clusters for each week.
# The previous version drew a countplot with hue="cluster", i.e. one bar per
# (week, cluster) pair, under an "articles per week" title. That did not show
# the number of clusters. Here the distinct labels are counted per week.
# Noise points (label -1) are masked out so they do not count as a cluster;
# a week made only of noise shows up as 0.
clusters_per_week = (
    df["cluster"]
    .where(df["cluster"] != -1)
    .groupby(df["year_week"])
    .nunique()
    .reset_index()
)

plt.figure(figsize=(30, 10))
sns.barplot(data=clusters_per_week, x="year_week", y="cluster")
plt.xticks(rotation=45)
plt.title("Number of clusters per week")
plt.show()

In [None]:
# Analyze the clusters for each week
for week, group in grouped_data:
    print(f"Processing week {week}")

    # Count the articles of every real cluster, leaving out noise (label -1).
    is_clustered = group['cluster'] != -1
    cluster_sizes = group.loc[is_clustered, 'cluster'].value_counts()

    # value_counts() already orders clusters from largest to smallest, so its
    # index is the size-sorted list of cluster ids. Re-indexing it with the
    # labels in order of first appearance (as was done before) discarded that
    # ordering, and the "top 2" were simply the first two clusters met.
    sorted_clusters = cluster_sizes.index

    # Print the titles of the two largest clusters
    for cluster in sorted_clusters[:2]:
        print(f"Cluster {cluster}")
        titles = group[group.cluster == cluster].title.tolist()
        print(*titles[:100], sep='\n\n')
        print("\n\n --- \n\n")

# HDBSCAN

In this section, following the paper [Using LLM for Improving Key Event Discovery: Temporal-Guided News Stream Clustering with Event Summaries](https://openreview.net/pdf?id=lojtRAQOls), we will use HDBSCAN to find the most important topics of each week.

This algorithm is very useful because it is indifferent to the size, shape and density of the clusters, while requiring only one parameter, the MIN_CLUSTER_SIZE.


In [6]:
import hdbscan



In [None]:
MIN_CLUSTER_SIZE = 3

# Total number of weekly groups, used for the progress display.
n_weeks = grouped_data.ngroups

# Cluster each week's embeddings independently with HDBSCAN.
for position, (week, group) in enumerate(grouped_data, start=1):
    # Show the progression (e.g. "34/123") next to the week label.
    print(f"Processing week {week} ({position}/{n_weeks})")

    model = hdbscan.HDBSCAN(min_cluster_size=MIN_CLUSTER_SIZE, metric='euclidean', cluster_selection_method='eom')
    labels = model.fit_predict(group['embedding'].tolist())

    # `group.index` addresses exactly the rows that were fitted, in label
    # order, without rebuilding a full-length boolean mask for every week.
    # NOTE(review): relies on `df` having a unique index — confirm.
    df.loc[group.index, "cluster"] = labels


In [None]:
# Create a dataframe with one row per week and the number of clusters found
# in that week. Noise points (label -1) are masked out before counting so the
# noise label is not reported as an extra cluster; a week made only of noise
# counts as 0.
df_clusters = (
    df["cluster"]
    .where(df["cluster"] != -1)
    .groupby(df["year_week"])
    .nunique()
    .reset_index()
)

# Plot the number of clusters for each week
plt.figure(figsize=(30, 10))
sns.barplot(data=df_clusters, x="year_week", y="cluster")
plt.xticks(rotation=45)
plt.title("Number of clusters per week")
plt.show()

In [None]:
# Analyze the clusters for each week
for week, group in grouped_data:
    print(f"Processing week {week}")

    # Count the articles of every real cluster, leaving out noise (label -1).
    is_clustered = group['cluster'] != -1
    cluster_sizes = group.loc[is_clustered, 'cluster'].value_counts()

    # value_counts() already orders clusters from largest to smallest, so its
    # index is the size-sorted list of cluster ids. Re-indexing it with the
    # labels in order of first appearance (as was done before) discarded that
    # ordering, and the "top 2" were simply the first two clusters met.
    sorted_clusters = cluster_sizes.index

    # Print the titles of the two largest clusters
    for cluster in sorted_clusters[:2]:
        print(f"Cluster {cluster}")
        titles = group[group.cluster == cluster].title.tolist()
        print(*titles[:100], sep='\n\n')
        print("\n\n --- \n\n")

## Adding the date as a clustering feature

In [9]:
# Convert date to a numerical feature: the integer representation of each
# timestamp divided by 10**18.
# NOTE(review): presumably nanoseconds since the epoch, which depends on the
# datetime resolution of the column — confirm.
df['date_feature'] = df['date'].astype('int64') / 10**18  # Adjust the scaling factor as needed

# Combine the date feature with the embeddings.
# Build the whole matrix in one vectorised step (embedding columns followed by
# the date column) instead of appending row by row through `df.apply`; each
# row of the matrix becomes that article's feature vector.
embedding_matrix = np.stack(df['embedding'].to_list())
combined_matrix = np.column_stack([embedding_matrix, df['date_feature'].to_numpy()])
df['combined_features'] = list(combined_matrix)


### Perform HDBSCAN clustering

In [11]:
MIN_CLUSTER_SIZE = 40

# take only first 2 months
# (the assignment was previously left unfinished, which is a syntax error)
restriction_end = df['date'].min() + pd.DateOffset(months=2)
df_restricted = df[df['date'] < restriction_end]

# Cluster the restricted articles on embedding + date features.
model = hdbscan.HDBSCAN(min_cluster_size=MIN_CLUSTER_SIZE, metric='euclidean', cluster_selection_method='eom')
labels = model.fit_predict(df_restricted['combined_features'].tolist())

# Articles outside the two-month window were not clustered: they get NaN
# rather than keeping a stale label from an earlier experiment.
df['cluster'] = pd.Series(labels, index=df_restricted.index).reindex(df.index)

### Plot the number of clusters for each date

In [None]:
# Create a dataframe with one row per calendar date and the number of clusters
# that have at least one article on that date. Noise points (label -1) are
# masked out before counting so the noise label is not reported as a cluster.
df_clusters = (
    df['cluster']
    .where(df['cluster'] != -1)
    .groupby(df['date'].dt.date)
    .nunique()
    .reset_index()
)

plt.figure(figsize=(30, 10))
sns.barplot(data=df_clusters, x='date', y='cluster')
plt.xticks(rotation=45)
plt.title("Number of clusters per date")
plt.show()