# Méthodes pour le clustering pré-classifié

**(1) Préparation du Dataset**

Importer Packages et Data

In [1]:
# Importer les packages
import pandas as pd
import numpy as np
from collections import defaultdict
import random
import os
import nltk
from nltk.tokenize import word_tokenize  # Using NLTK to tokenize text

# Tokenizer models needed by nltk.word_tokenize. Older NLTK releases load the
# 'punkt' resource; newer releases look up 'punkt_tab' instead, so both are
# fetched to keep the notebook runnable whatever NLTK version is installed.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jeanv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Importation et Pre-Processing

In [2]:
# Load the pre-classified samples workbook. The location can be overridden
# with the PRECLASS_DATA_PATH environment variable so the notebook is not tied
# to a single machine; the default is the original local path.
PRECLASS_DATA_PATH = os.environ.get(
    'PRECLASS_DATA_PATH',
    r'C:\Users\jeanv\Documents\GitHub\jun24_bc_llm\data\processed\df_for_preclass_samples.xlsx',
)
us_pol = pd.read_excel(PRECLASS_DATA_PATH)

# Drop the columns that are not needed for clustering
us_pol = us_pol.drop(['top_keywords', 'top_keywords_list', 'record', 'sign_count'], axis = 1)

# Rename the label column to 'topic'
us_pol = us_pol.rename(columns={'km_labels100': 'topic'})

# Split into the 'short' and 'medium' subsets that are clustered separately
us_pol_short = us_pol[us_pol['short_medium'] == 'short (50-280)']
us_pol_medium = us_pol[us_pol['short_medium'] == 'medium (281-3000)']

**(2) Clustering Semi-Supervisé**

Mini-fonction pour tokenizer

In [3]:
# Count how many NLTK word tokens a text contains
def count_tokens(text):
    """Return the number of tokens ``nltk.word_tokenize`` produces for ``text``."""
    tokens = nltk.word_tokenize(text)
    return len(tokens)

Fonction pour le clustering de textes pré-classifiés (100 clusters, 1900 tokens)

In [4]:
def create_topic_clusters(df, max_tokens_per_cluster=1900, random_state=None, token_counter=None):
    """Build one text cluster per topic, capped at a token budget.

    For every distinct value of ``df['topic']`` the matching rows are shuffled
    and their texts are appended as bullet lines (``"- <text>"`` followed by a
    newline) until the next text would push the cluster past
    ``max_tokens_per_cluster``. Filling stops at the first text that does not
    fit, so a topic whose first shuffled text is already over budget yields an
    empty cluster with ``token_number == 0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'topic'``, ``'text'`` and ``'id_50_3000'``.
    max_tokens_per_cluster : int, default 1900
        Maximum number of tokens allowed in a single cluster.
    random_state : optional
        Forwarded to ``DataFrame.sample`` for the per-topic shuffle. Pass a
        fixed value to make the clusters reproducible; ``None`` keeps the
        original unseeded behaviour.
    token_counter : callable, optional
        Function mapping a text to its token count. Defaults to
        ``count_tokens``.

    Returns
    -------
    pandas.DataFrame
        One row per topic with the columns ``'cluster_id'`` (1-based, in order
        of first appearance of the topic), ``'clustered_text'``,
        ``'id_50_3000'`` (the used ids joined with ``', '``),
        ``'token_number'`` and ``'topic'``.
    """
    if token_counter is None:
        token_counter = count_tokens

    columns = ['cluster_id', 'clustered_text', 'id_50_3000', 'token_number', 'topic']

    # Collect plain records and build the frame once at the end; growing a
    # DataFrame with pd.concat inside the loop is quadratic and turns every
    # column into object dtype.
    records = []

    for cluster_id, topic in enumerate(df['topic'].unique(), start=1):
        # Keep only the rows of the current topic, in random order
        topic_df = df[df['topic'] == topic].sample(frac=1, random_state=random_state)

        cluster_parts = []
        cluster_tokens = 0
        clustered_ids = []

        for raw_text, text_id in zip(topic_df['text'], topic_df['id_50_3000']):
            # Each text becomes one bullet line inside the cluster
            text = f"- {raw_text}\n"
            text_tokens = token_counter(text)

            # Stop as soon as the next text would exceed the token budget
            if cluster_tokens + text_tokens > max_tokens_per_cluster:
                break

            cluster_parts.append(text)
            cluster_tokens += text_tokens
            clustered_ids.append(str(text_id))

        records.append({
            'cluster_id': cluster_id,
            'clustered_text': ''.join(cluster_parts),
            'id_50_3000': ', '.join(clustered_ids),
            'token_number': cluster_tokens,
            'topic': topic,
        })

    # Passing `columns` keeps the expected schema even when `df` has no rows
    return pd.DataFrame(records, columns=columns)

Appliquer la fonction aux posts short et medium

In [5]:
# Seed NumPy's global random state before clustering: create_topic_clusters
# shuffles rows with DataFrame.sample(), which draws from that global state
# when no random_state is given. Seeding here makes a full re-run of the
# notebook produce the same clusters.
SEED = 42
np.random.seed(SEED)

# First-pass clusters for the short posts
clusters_short = create_topic_clusters(us_pol_short)

# First-pass clusters for the medium posts
clusters_medium = create_topic_clusters(us_pol_medium)

Éliminer les mauvais topics et les id_50_3000 déjà utilisés du df original - *Pour les posts short*

In [6]:
# Keep the first-pass clusters that reached at least 1000 tokens
filtered_posts_short = clusters_short.loc[clusters_short['token_number'] >= 1000]

# Topics whose cluster stayed below 1000 tokens are dropped from the source posts
bad_topics_short = clusters_short.loc[clusters_short['token_number'] < 1000, 'topic']
us_pol_short_prefiltered = us_pol_short.loc[~us_pol_short['topic'].isin(bad_topics_short)]

# Collect the ids already used by the kept clusters (stored as ', '-joined
# strings) and remove those posts as well
ids_to_remove = [
    int(post_id)
    for joined_ids in filtered_posts_short['id_50_3000']
    for post_id in joined_ids.split(', ')
]
us_pol_short_filtered = us_pol_short_prefiltered.loc[
    ~us_pol_short_prefiltered['id_50_3000'].isin(ids_to_remove)
]

Éliminer les mauvais topics et les id_50_3000 déjà utilisés du df original - *Pour les posts medium*

In [7]:
# Keep the first-pass clusters that reached at least 1000 tokens
filtered_posts_medium = clusters_medium.loc[clusters_medium['token_number'] >= 1000]

# Topics whose cluster stayed below 1000 tokens are dropped from the source posts
bad_topics_medium = clusters_medium.loc[clusters_medium['token_number'] < 1000, 'topic']
us_pol_medium_prefiltered = us_pol_medium.loc[~us_pol_medium['topic'].isin(bad_topics_medium)]

# Collect the ids already used by the kept clusters (stored as ', '-joined
# strings) and remove those posts as well
ids_to_remove = [
    int(post_id)
    for joined_ids in filtered_posts_medium['id_50_3000']
    for post_id in joined_ids.split(', ')
]
us_pol_medium_filtered = us_pol_medium_prefiltered.loc[
    ~us_pol_medium_prefiltered['id_50_3000'].isin(ids_to_remove)
]

Appliquer la fonction aux nouveaux dataframes et ne garder que les clusters nécessaires pour atteindre le nombre total visé - *Pour les posts short*

In [8]:
# Second pass: cluster the still-unused posts of the kept topics
fill_clusters_short_pre = create_topic_clusters(us_pol_short_filtered)

# Number of first-pass clusters that were discarded and must be replaced
# (despite its name, this counts clusters, not tokens)
tokens_deleted_short = len(clusters_short) - len(filtered_posts_short)

# Keep only as many replacement clusters as are needed
fill_clusters_short = fill_clusters_short_pre.head(tokens_deleted_short)

# Combine kept and replacement clusters, cap the result at 100 rows, then tag
# every row as 'short' and renumber the cluster ids as PC_short_1, PC_short_2, ...
# .assign() returns a new frame, so the columns are no longer written onto the
# slice returned by .head(), which raised SettingWithCopyWarning.
final_clusters_short = (
    pd.concat([filtered_posts_short, fill_clusters_short], ignore_index=True)
    .head(100)
    .assign(
        type='short',
        cluster_id=lambda frame: ['PC_short_' + str(i) for i in range(1, len(frame) + 1)],
    )
)

*Pour les posts medium*

In [9]:
# Second pass: cluster the still-unused posts of the kept topics
fill_clusters_medium = create_topic_clusters(us_pol_medium_filtered)

# Number of first-pass clusters that were discarded
# (despite its name, this counts clusters, not tokens; kept for reference)
tokens_deleted_medium = len(clusters_medium) - len(filtered_posts_medium)

# Keep only as many replacement clusters as are needed to reach 100 in total.
# This replaces the former `tokens_deleted_medium + 1`, which could produce
# 101 clusters when the first pass already returned 100.
fill_clusters_medium = fill_clusters_medium.head(max(100 - len(filtered_posts_medium), 0))

# Combine kept and replacement clusters, cap the result at 100 rows (as done
# for the short posts), then tag every row as 'medium' and renumber the
# cluster ids as PC_medium_1, PC_medium_2, ...
final_clusters_medium = (
    pd.concat([filtered_posts_medium, fill_clusters_medium], ignore_index=True)
    .head(100)
    .assign(
        type='medium',
        cluster_id=lambda frame: ['PC_medium_' + str(i) for i in range(1, len(frame) + 1)],
    )
)

Post-Processing

In [10]:
# Stack the medium clusters on top of the short ones, then prefix each 'type'
# label with 'PC_'
preclassified_clusters = pd.concat(
    [final_clusters_medium, final_clusters_short],
    axis=0,
    ignore_index=True,
).assign(type=lambda frame: 'PC_' + frame['type'])

In [11]:
# Write the combined clusters to an Excel workbook, without the row index
preclassified_clusters.to_excel(excel_writer='preclassified_clusters_V2.xlsx', index=False)

# Preview the first 20 medium clusters
final_clusters_medium.head(n=20)

Unnamed: 0,cluster_id,clustered_text,id_50_3000,token_number,topic,type
0,PC_medium_1,- If you ve been wondering how long it would b...,"24224, 62918, 9166, 43918",1795,50,medium
1,PC_medium_2,"- No Obama won t run again, but he ll use ever...","8818, 63900, 7791, 46741, 19417, 43478",1623,71,medium
2,PC_medium_3,- The former CEO of a local cybersecurity firm...,"23183, 38734, 51075, 33984, 32659",1606,1,medium
3,PC_medium_4,- While most people have pretty much forgotten...,"49550, 60236, 22597, 12450, 27174",1716,7,medium
4,PC_medium_5,- Donald Trump attacked the CEOs who are leavi...,"4168, 58496, 56837, 4543",1741,81,medium
5,PC_medium_6,- LONDON (Reuters) - British Prime Minister Th...,"20855, 27676, 49938, 49393, 37820",1784,37,medium
6,PC_medium_7,- WASHINGTON (Reuters) - U.S. President Donald...,"13654, 7576, 63091, 13529, 803",1854,57,medium
7,PC_medium_8,- GENEVA (Reuters) - A European and African de...,"57964, 35796, 31868, 57540, 3129",1629,68,medium
8,PC_medium_9,- We re still waiting for the NFL to make a st...,"15053, 42328, 43707, 57381",1836,60,medium
9,PC_medium_10,- FRANKFURT (Reuters) - A suspicious package t...,"7021, 30333, 1299, 55277, 39271, 10429",1637,32,medium
