In [48]:
import pandas as pd
import numpy as np

In [49]:
# Nombre d'exemples
n_positive = 14000  # Label = 1 (Panne probable)
n_negative = 6000   # Label = 0 (Système normal)
total_samples = n_positive + n_negative

In [50]:
# Générer les timestamps en partant du dernier timestamp existant
start_time = pd.to_datetime("2025-02-19 10:00:00")
timestamps = [start_time + pd.Timedelta(minutes=i) for i in range(total_samples)]

In [51]:
# Initialisation des listes de stockage
cpu_usage, ram_usage, disk_usage = [], [], []
network_sent, network_received = [], []
temperature, reallocated_sectors, event_id = [], [], []
labels, messages, levels = [], [], []

In [52]:
# Définition des scénarios de panne
scenarios = [
    "Surcharge CPU", "Surcharge RAM", "Surchauffe matérielle",
    "Défaillance disque", "Problème réseau", "Erreurs système critiques",
    "Combinaison de plusieurs pannes"
]

In [53]:
# Génération des données
for i in range(total_samples):
    label = 1 if i < n_positive else 0  # Attribution des labels

    if label == 1:  # Cas panne
        scenario = np.random.choice(scenarios, p=[0.2, 0.2, 0.15, 0.15, 0.15, 0.1, 0.05])

        if scenario == "Surcharge CPU":
            cpu = np.random.uniform(90, 100)
            ram = np.random.uniform(70, 90)
            temperature_val = np.random.uniform(55, 70)

        elif scenario == "Surcharge RAM":
            cpu = np.random.uniform(50, 80)
            ram = np.random.uniform(95, 100)
            temperature_val = np.random.uniform(50, 65)

        elif scenario == "Surchauffe matérielle":
            cpu = np.random.uniform(60, 90)
            ram = np.random.uniform(50, 80)
            temperature_val = np.random.uniform(70, 100)

        elif scenario == "Défaillance disque":
            cpu = np.random.uniform(30, 70)
            ram = np.random.uniform(40, 80)
            temperature_val = np.random.uniform(40, 60)
            reallocated = np.random.randint(6, 50)

        elif scenario == "Problème réseau":
            cpu = np.random.uniform(40, 70)
            ram = np.random.uniform(50, 80)
            temperature_val = np.random.uniform(40, 60)
            network_sent_val = np.random.randint(10000, 50000)
            network_received_val = np.random.randint(0, 500)

        elif scenario == "Erreurs système critiques":
            cpu = np.random.uniform(50, 80)
            ram = np.random.uniform(50, 80)
            temperature_val = np.random.uniform(40, 60)

        elif scenario == "Combinaison de plusieurs pannes":
            cpu = np.random.uniform(85, 100)
            ram = np.random.uniform(90, 100)
            temperature_val = np.random.uniform(75, 100)
            reallocated = np.random.randint(10, 50)
            network_sent_val = np.random.randint(20000, 60000)
            network_received_val = np.random.randint(0, 200)

    else:  # Cas normal (label = 0)
        cpu = np.random.uniform(10, 70)
        ram = np.random.uniform(10, 70)
        temperature_val = np.random.uniform(30, 55)
        reallocated = np.random.randint(0, 3)
        network_sent_val = np.random.randint(0, 1000)
        network_received_val = np.random.randint(0, 1000)



In [54]:
 # Stockage des valeurs
cpu_usage.append(cpu)
ram_usage.append(ram)
disk_usage.append(np.random.uniform(40, 90))
network_sent.append(network_sent_val)
network_received.append(network_received_val)
temperature.append(temperature_val)
reallocated_sectors.append(reallocated)
event_id.append(np.random.randint(1, 10))
labels.append(label)

In [55]:
# Messages et niveaux de logs
if label == 1:
    messages.append(f"Erreur détectée : {scenario}")
    levels.append("Error, Warning")
else:
    messages.append("Journal système OK")
    levels.append("Information")

In [56]:
# Génération des descriptions
def generate_description(row):
    issues = []
    if row["cpu_usage"] > 85 and row["ram_usage"] > 85:
        issues.append("Surcharge critique CPU et RAM.")
    if row["cpu_usage"] > 90:
        issues.append("Utilisation CPU anormalement élevée.")
    if row["ram_usage"] > 95:
        issues.append("Mémoire vive saturée.")
    if row["temperature"] > 70:
        issues.append("Surchauffe matérielle détectée.")
    if row["reallocated_sectors"] > 5:
        issues.append("Défaillance probable du disque (SMART).")
    if "Error" in row["level"]:
        issues.append("Erreurs système détectées.")
    return " | ".join(issues) if issues else "Système stable."


In [57]:
# Génération des recommandations
def generate_recommendation(row):
    actions = []
    if "Surcharge critique CPU" in row["description"]:
        actions.append("Optimisez les processus et fermez les applications inutiles.")
    if "Mémoire vive saturée" in row["description"]:
        actions.append("Ajoutez plus de RAM ou réduisez la charge mémoire.")
    if "Surchauffe matérielle" in row["description"]:
        actions.append("Vérifiez les ventilateurs et nettoyez le système de refroidissement.")
    if "Défaillance probable du disque" in row["description"]:
        actions.append("Effectuez un test SMART et sauvegardez les données.")
    if "Erreurs système détectées" in row["description"]:
        actions.append("Analysez les logs pour identifier la cause exacte.")
    return " | ".join(actions) if actions else "Aucune action requise."

In [58]:
# Création du dataset final
df_large = pd.DataFrame({
    "timestamp": timestamps,
    "cpu_usage": cpu_usage,
    "ram_usage": ram_usage,
    "disk_usage": disk_usage,
    "network_sent": network_sent,
    "network_received": network_received,
    "temperature": temperature,
    "reallocated_sectors": reallocated_sectors,
    "event_id": event_id,
    "message": messages,
    "level": levels,
    "label": labels
})


ValueError: All arrays must be of the same length

In [None]:
# Appliquer les descriptions et recommandations
df_large["description"] = df_large.apply(generate_description, axis=1)
df_large["recommendation"] = df_large.apply(generate_recommendation, axis=1)


In [47]:
# Sauvegarde en Excel
output_file = "final_dataset_20000_complete.xlsx"
df_large.to_excel(output_file, index=False)

print(f"✅ Dataset bien équilibré avec tous les cas possibles généré : {output_file}")

NameError: name 'df_large' is not defined