---------------------------------------------------------------
## VI. CRÉATION DE CLUSTERS ALÉATOIRES
--------------------------------------------------------------

Nous entreprenons d'abord de créer des clusters aléatoires afin de faire tourner les modèles sur des données volontairement hétérogènes.

In [11]:
import pandas as pd
import numpy as np
import random
import nltk
import os
from nltk.tokenize import word_tokenize  # Using NLTK to tokenize text

# Fetch the NLTK 'punkt' tokenizer data (needed for word tokenization);
# the call reports "already up-to-date" when the package is present locally.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/thevault/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:
# Load the processed dataset from CSV
df = pd.read_csv('../data/processed/processed_data.csv')

---
### 1. Traitement préliminaire des données
---

In [22]:
# Drop the columns that are not needed for the clustering step
unused_columns = ['label', 'text_prepr', 'has_similarity', 'similar_with', 'similarity_group', 'similarity_score']
df = df.drop(columns=unused_columns)


# Label a text from its sign_count: 'short' for 50..280 (inclusive),
# 'medium' above 280, and None for anything else.
def _length_type(sign_count):
    if 50 <= sign_count <= 280:
        return 'short'
    if sign_count > 280:
        return 'medium'
    return None


# Store the label in a new 'type' column
df['type'] = df['sign_count'].apply(_length_type)

# Keep one dataframe per length type; these are the inputs of the clustering
df_short = df.loc[df['type'] == 'short']
df_medium = df.loc[df['type'] == 'medium']

In [25]:
# Preview the first rows of the short posts
df_short.head()

Unnamed: 0,id_50_3000,id_origin,text,sign_count,type
0,0,55929,Dank is it a tornado n Raleigh car blowincg n ...,50,short
1,1,51817,@smoak_queen 'I'm going to be in so much troub...,50,short
2,2,51709,@CSAresu American Tragedy http://t.co/SDmrzG...,50,short
3,3,47897,How to Survive a Dust Storm http://t.co/0yL3yT...,50,short
4,4,50836,I SCREAMED 'WHATS A CHONCe' http://t.co/GXYivs...,50,short


In [24]:
# Preview the first rows of the medium posts
df_medium.head()

Unnamed: 0,id_50_3000,id_origin,text,sign_count,type
35953,35953,8941,WASHINGTON (Reuters) - Senate Democratic leade...,281,medium
35954,35954,85520,Maybe partly logistics. Hurricanes give far mo...,281,medium
35955,35955,100415,"The Federalist, a right-wing group with secret...",281,medium
35956,35956,88820,Volunteers with search dogs continue to scour ...,281,medium
35957,35957,79189,Main takeaways - If this country has a nationa...,281,medium


---
### 2. Regroupement par clusterisation
---

In [26]:
# Count the word tokens of a text
def count_tokens(text):
    """Return how many tokens NLTK's word tokenizer finds in ``text``."""
    tokens = nltk.word_tokenize(text)
    return len(tokens)

In [27]:
# Build random clusters of texts (defaults: at most 100 clusters of at most 1900 tokens each)
def create_clusters(df, max_clusters=100, max_tokens_per_cluster=1900, *,
                    random_state=None, token_counter=None):
    """Group randomly shuffled texts into clusters bounded by a token budget.

    The rows of ``df`` are shuffled, then each text is formatted as a
    ``- <text>`` bullet line and appended to the current cluster. When adding
    a text would push the cluster above ``max_tokens_per_cluster``, the
    cluster is stored and a new one is started with that text. Processing
    stops as soon as ``max_clusters`` clusters have been stored; remaining
    texts are ignored.

    A single text that is larger than the budget on its own is kept as a
    one-text cluster (it is not split), so ``token_number`` can exceed
    ``max_tokens_per_cluster`` for such a cluster.

    Args:
        df: DataFrame with a ``'text'`` column and an ``'id_50_3000'`` column.
        max_clusters: Maximum number of clusters to return.
        max_tokens_per_cluster: Token budget of one cluster.
        random_state: Optional seed forwarded to ``DataFrame.sample`` to make
            the shuffle reproducible. ``None`` keeps the shuffle random.
        token_counter: Optional callable ``str -> int`` used to count the
            tokens of a formatted text. Defaults to ``count_tokens``.

    Returns:
        A DataFrame with one row per cluster and the columns ``cluster_id``
        (1-based integer), ``clustered_text`` (concatenated bullet lines),
        ``id_50_3000`` (comma-separated ids of the clustered texts) and
        ``token_number`` (sum of the token counts of the texts).
    """
    columns = ['cluster_id', 'clustered_text', 'id_50_3000', 'token_number']

    if token_counter is None:
        token_counter = count_tokens

    # Nothing to build: return an empty frame that still has the expected columns.
    if max_clusters < 1 or len(df) == 0:
        return pd.DataFrame(columns=columns)

    def _record(cluster_id, parts, ids, token_number):
        # Every cluster, including the last one, goes through this single
        # constructor so that all rows share exactly the same keys.
        return {
            'cluster_id': cluster_id,
            'clustered_text': ''.join(parts),
            'id_50_3000': ', '.join(ids),
            'token_number': token_number,
        }

    # Shuffle the rows; the original index is not used afterwards.
    shuffled = df.sample(frac=1, random_state=random_state)

    records = []          # finished clusters, turned into a DataFrame once at the end
    parts = []            # formatted texts of the cluster being filled
    ids = []              # ids of the texts in the cluster being filled
    cluster_tokens = 0    # token count of the cluster being filled

    for raw_text, text_id in zip(shuffled['text'], shuffled['id_50_3000']):
        text = f"- {raw_text}\n"
        text_tokens = token_counter(text)

        # Close the current cluster when this text does not fit anymore.
        # The `parts` guard avoids emitting an empty cluster when a single
        # text is larger than the whole budget.
        if parts and cluster_tokens + text_tokens > max_tokens_per_cluster:
            records.append(_record(len(records) + 1, parts, ids, cluster_tokens))
            parts, ids, cluster_tokens = [], [], 0

            # Stop once the requested number of clusters has been reached.
            if len(records) >= max_clusters:
                break

        parts.append(text)
        ids.append(str(text_id))
        cluster_tokens += text_tokens

    # Store the last, partially filled cluster (empty after a `break`).
    if parts:
        records.append(_record(len(records) + 1, parts, ids, cluster_tokens))

    return pd.DataFrame(records, columns=columns)

In [28]:
# Build the random clusters for each of the two datasets
# Clusters made of short posts
final_clusters_short = create_clusters(df_short)

# Clusters made of medium posts
final_clusters_medium = create_clusters(df_medium)

In [29]:
# Tag each cluster table with its length type and turn the numeric cluster
# id into a 'R_<type>_<n>' identifier.
for frame, label in ((final_clusters_short, 'short'), (final_clusters_medium, 'medium')):
    frame['type'] = label
    frame['cluster_id'] = 'R_' + label + '_' + frame['cluster_id'].astype(str)

# Stack both tables (medium clusters first, then short ones) with a fresh index
randomized_clusters = pd.concat(
    [final_clusters_medium, final_clusters_short],
    ignore_index=True,
)

# Prefix the type with 'R_' as well (e.g. 'R_medium')
randomized_clusters['type'] = 'R_' + randomized_clusters['type']

In [30]:
# Preview the first rows of the combined clusters
randomized_clusters.head()

Unnamed: 0,cluster_id,clustered_text,id_50_3000,token_number,type
0,R_medium_1,- This is the beauty of humanity. At 6 years o...,"37768, 47185, 47979, 47743, 63102, 54176",1826,R_medium
1,R_medium_2,- LONDON (Reuters) - Prime Minister Theresa Ma...,"63269, 45411, 39134, 63843, 55247, 42981",1879,R_medium
2,R_medium_3,- BRUSSELS (Reuters) - U.S. President Donald T...,"47680, 48968, 63021, 58186",1550,R_medium
3,R_medium_4,- CAIRO (Reuters) - Egyptian President Abdel F...,"54857, 39579, 61043, 48612, 57687",1669,R_medium
4,R_medium_5,- NEW DELHI (Reuters) - India s top court has ...,"62639, 65075, 64421",1573,R_medium


In [31]:
# Save the clusters to CSV (the dataframe index is not written)
randomized_clusters.to_csv('../data/processed/randomized_clusters.csv', index=False)