In [1]:
import pandas as pd
import numpy as np

# Number of samples per class (1 = failure, 0 = healthy).
n_positive = 14000
n_negative = 6000
total_samples = n_positive + n_negative

# Timestamp of the first sample; consecutive samples are one minute apart.
start_time = pd.to_datetime("2025-02-19 10:00:00")

# Failure scenarios a positive sample can be drawn from.
scenarios = ["Surcharge CPU", "Surcharge RAM", "Surchauffe matérielle", "Défaillance disque",
             "Problème réseau", "Erreurs système critiques", "Combinaison de plusieurs pannes"]

# Column order of the generated dataset.
_COLUMNS = [
    "timestamp",
    "cpu_usage",
    "ram_usage",
    "disk_usage",
    "network_sent",
    "network_received",
    "temperature",
    "reallocated_sectors",
    "event_id",
    "message",
    "level",
    "label",
]


def _draw_failure_overrides(rng, scenario):
    """Return the metric values that ``scenario`` overrides on a healthy row.

    Args:
        rng: ``numpy.random.Generator`` used for every draw.
        scenario: One of the entries of ``scenarios``.

    Returns:
        A dict mapping column names to the overriding values. Columns that are
        absent keep the healthy defaults drawn by the caller.
    """
    if scenario == "Surcharge CPU":
        return {
            "cpu_usage": rng.uniform(90, 100),
            "ram_usage": rng.uniform(70, 90),
            "temperature": rng.uniform(55, 70),
        }
    if scenario == "Surcharge RAM":
        return {
            "cpu_usage": rng.uniform(50, 80),
            "ram_usage": rng.uniform(95, 100),
            "temperature": rng.uniform(50, 65),
        }
    if scenario == "Surchauffe matérielle":
        return {
            "cpu_usage": rng.uniform(60, 90),
            "ram_usage": rng.uniform(50, 80),
            "temperature": rng.uniform(70, 100),
        }
    if scenario == "Défaillance disque":
        return {"reallocated_sectors": int(rng.integers(6, 50))}
    if scenario == "Problème réseau":
        return {
            "network_sent": int(rng.integers(10000, 50000)),
            "network_received": int(rng.integers(0, 500)),
        }
    if scenario == "Combinaison de plusieurs pannes":
        return {
            "cpu_usage": rng.uniform(85, 100),
            "ram_usage": rng.uniform(90, 100),
            "temperature": rng.uniform(75, 100),
            "reallocated_sectors": int(rng.integers(10, 50)),
            "network_sent": int(rng.integers(20000, 60000)),
            "network_received": int(rng.integers(0, 200)),
        }
    # NOTE(review): "Erreurs système critiques" overrides no metric, so those
    # rows differ from healthy ones only through `message`/`level`/`label`.
    # Presumably intentional (log-only failure) -- confirm with the data owner.
    return {}


def generate_dataset(n_positive: int, n_negative: int, start_time, seed=None) -> pd.DataFrame:
    """Build the synthetic monitoring dataset.

    The first ``n_positive`` rows are failures (label 1), each drawn from a
    random entry of ``scenarios``; the remaining ``n_negative`` rows are
    healthy (label 0). Rows are spaced one minute apart from ``start_time``.

    Args:
        n_positive: Number of failure rows.
        n_negative: Number of healthy rows.
        start_time: Timestamp of the first row (anything ``pd.date_range``
            accepts as ``start``).
        seed: Optional seed for ``numpy.random.default_rng``; pass an int to
            get a reproducible dataset, ``None`` for fresh entropy.

    Returns:
        A DataFrame with one row per sample and the columns listed in
        ``_COLUMNS``.
    """
    rng = np.random.default_rng(seed)
    timestamps = pd.date_range(
        start=start_time, periods=n_positive + n_negative, freq="min"
    )

    rows = []
    for position, timestamp in enumerate(timestamps):
        label = 1 if position < n_positive else 0

        # Healthy defaults; a failure scenario overrides some of them below.
        # Upper bounds of `integers` are exclusive, like `np.random.randint`.
        row = {
            "timestamp": timestamp,
            "cpu_usage": rng.uniform(10, 70),
            "ram_usage": rng.uniform(10, 70),
            "disk_usage": rng.uniform(40, 90),  # drawn once and actually used
            "network_sent": int(rng.integers(0, 1000)),
            "network_received": int(rng.integers(0, 1000)),
            "temperature": rng.uniform(30, 55),
            "reallocated_sectors": 0,
            "event_id": int(rng.integers(1, 10)),
        }

        if label == 1:
            scenario = scenarios[int(rng.integers(len(scenarios)))]
            row.update(_draw_failure_overrides(rng, scenario))
            row["message"] = f"Erreur détectée : {scenario}"
            row["level"] = "Error, Warning"
        else:
            row["message"] = "Journal système OK"
            row["level"] = "Information"

        row["label"] = label
        rows.append(row)

    # Passing `columns` fixes the column order and keeps the schema even when
    # no rows were requested.
    return pd.DataFrame(rows, columns=_COLUMNS)


# Guarded so that importing this code does not write a file; a notebook cell
# or a script run still executes it (``__name__`` is "__main__" there).
if __name__ == "__main__":
    df_large = generate_dataset(n_positive, n_negative, start_time)

    # Save to Excel
    output_file = "final_dataset_200_corrected.xlsx"
    df_large.to_excel(output_file, index=False)

    print(f"✅ Dataset bien généré et enregistré sous {output_file}")


✅ Dataset bien généré et enregistré sous final_dataset_200_corrected.xlsx
