In [None]:

from google.colab import drive
import os

# --- Start from a clean state: detach Drive if it is currently mounted ---
drive.flush_and_unmount()

# --- Inspect the local mountpoint; leftovers are only reported, never deleted ---
drive_path = '/content/drive'
if os.path.exists(drive_path):
    if os.listdir(drive_path):
        print("Notice: /content/drive is not empty. Skipping automatic deletion for safety.")
        # To forcibly clear the local mountpoint (NOT your actual Drive), run manually:
        # !rm -rf /content/drive/*

# --- Attach Google Drive at the mountpoint ---
drive.mount(drive_path)

Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive


In [None]:
# Input directory read by the header-cleaning step.
PYSHARK_DIR = "/content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/STEP1_json_enriched_pyshark"
# Output directory of the header-cleaning step, and input of the flattening step.
CLEANED_DIR = "/content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/json_CLEANED"
# Output directory of the flattening step.
CLEANED_FLATTENED_DIR = "/content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/json_CLEANED_FLATTENED"
# Create both output directories up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(CLEANED_DIR, exist_ok=True)
os.makedirs(CLEANED_FLATTENED_DIR, exist_ok=True)

In [None]:
import os
import json

# Field names of the request headers that survive the cleaning step.
REQ_HEADER_KEEP = {
    "http.accept",
    "http.authorization",
    "http.connection",
    "http.content_length",
    "http.content_type",
    "http.cookie",
    "http.host",
    "http.referer",
    "http.user_agent",
}

# Field names of the response headers that survive the cleaning step.
RES_HEADER_KEEP = {
    "http.cache_control",
    "http.connection",
    "http.content_length",
    "http.content_length_header",
    "http.content_type",
    "http.date",
    "http.location",
    "http.response.code",  # the HTTP status code is kept alongside the headers
    "http.server",
    "http.set_cookie",
    "http.www_authenticate",
}

def clean_headers(headers, keep_keys):
    """Return a new dict holding only the entries of ``headers`` whose key is in ``keep_keys``.

    Args:
        headers: Mapping of header name to value. ``None`` (a JSON ``null``)
            or an empty mapping yields an empty dict instead of raising.
        keep_keys: Container of the header names to keep.

    Returns:
        A new dict; ``headers`` itself is never modified.
    """
    # A missing header block is treated as "no headers" rather than crashing
    # on ``None.items()``.
    if not headers:
        return {}
    return {k: v for k, v in headers.items() if k in keep_keys}

def clean_pyshark_json_file(src_filepath, dst_filepath):
    """Filter the header blocks of one JSON file and save the result elsewhere.

    The source file is loaded as JSON and iterated entry by entry. For every
    entry, ``req_headers`` is reduced to the keys in ``REQ_HEADER_KEEP`` and
    ``res_headers`` to the keys in ``RES_HEADER_KEEP``; all other fields are
    left untouched. The result is written to ``dst_filepath`` as indented
    UTF-8 JSON.

    Args:
        src_filepath: Path of the JSON file to read.
        dst_filepath: Path of the JSON file to write. Its parent directory is
            created when missing; a bare file name (no directory part) is
            written to the current working directory.
    """
    with open(src_filepath, "r", encoding="utf-8") as f:
        data = json.load(f)

    for entry in data:
        # A header block stored as JSON null is normalised to an empty dict
        # instead of crashing on ``None.items()``.
        if 'req_headers' in entry:
            req_headers = entry['req_headers']
            entry['req_headers'] = (
                {} if req_headers is None else clean_headers(req_headers, REQ_HEADER_KEEP)
            )
        if 'res_headers' in entry:
            res_headers = entry['res_headers']
            entry['res_headers'] = (
                {} if res_headers is None else clean_headers(res_headers, RES_HEADER_KEEP)
            )
        # (every other field is kept as-is)

    # os.path.dirname() returns "" for a bare file name and os.makedirs("")
    # raises FileNotFoundError, so only create the directory when there is one.
    dst_dir = os.path.dirname(dst_filepath)
    if dst_dir:
        os.makedirs(dst_dir, exist_ok=True)
    with open(dst_filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"Sauvé : {dst_filepath}")

def clean_all_pyshark_files(pyshark_dir, cleaned_dir):
    """Run ``clean_pyshark_json_file`` on every ``.json`` file of ``pyshark_dir``.

    Each cleaned file is written to ``cleaned_dir`` under the same file name.
    ``cleaned_dir`` is created if it does not exist yet.
    """
    os.makedirs(cleaned_dir, exist_ok=True)
    json_names = [name for name in os.listdir(pyshark_dir) if name.endswith(".json")]
    for name in json_names:
        clean_pyshark_json_file(
            os.path.join(pyshark_dir, name),
            os.path.join(cleaned_dir, name),
        )

# === Run the header-cleaning step ===
clean_all_pyshark_files(pyshark_dir=PYSHARK_DIR, cleaned_dir=CLEANED_DIR)


Sauvé : /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/json_CLEANED/xml_pyshark.json
Sauvé : /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/json_CLEANED/lfi_pyshark.json
Sauvé : /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/json_CLEANED/ssti_pyshark.json
Sauvé : /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/json_CLEANED/cmdinj_pyshark.json
Sauvé : /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/json_CLEANED/xss_pyshark.json
Sauvé : /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/json_CLEANED/sql_pyshark.json


In [None]:
import os
import json

def _flatten_header_name(name):
    """Turn a header field name into a column suffix: "http.user_agent" -> "User_Agent".

    Every "http." occurrence is removed, dots become underscores, and each
    underscore/dash separated word is capitalised. Dashes end up as
    underscores as well.
    """
    return (
        name.replace("http.", "")
        .replace(".", "_")
        .replace("_", "-")
        .title()
        .replace("-", "_")
    )


def flatten_entry(entry):
    """Flatten one entry into a single-level dict.

    The result always contains the main fields (``attack_tag``, ``src_ip``,
    ``dst_ip``, ``src_port``, ``dst_port``, ``req_method``, ``req_url``,
    ``req_body``, ``res_body``), set to ``None`` when absent from ``entry``.
    Each request header is added as ``req_header_<Name>`` and each response
    header as ``res_header_<Name>``, where ``<Name>`` comes from
    ``_flatten_header_name``.

    Args:
        entry: Dict that may hold ``req_headers`` / ``res_headers`` mappings.
            A missing key or a ``None`` value is treated as "no headers".

    Returns:
        A new flat dict; ``entry`` is not modified.
    """
    flat = {
        field: entry.get(field)
        for field in (
            "attack_tag", "src_ip", "dst_ip", "src_port", "dst_port",
            "req_method", "req_url", "req_body", "res_body",
        )
    }
    for prefix, source_key in (("req_header_", "req_headers"), ("res_header_", "res_headers")):
        # ``or {}`` also covers a key that is present but holds None (JSON null),
        # which ``entry.get(key, {})`` alone would pass through and crash on.
        headers = entry.get(source_key) or {}
        for name, value in headers.items():
            flat[prefix + _flatten_header_name(name)] = value
    return flat


def flatten_all_json_files(cleaned_dir, flattened_dir):
    """Flatten every ``.json`` file of ``cleaned_dir`` into ``flattened_dir``.

    Each source file is loaded as a JSON list, every element goes through
    ``flatten_entry``, and the resulting list is written under the same file
    name in ``flattened_dir`` (created if missing).
    """
    os.makedirs(flattened_dir, exist_ok=True)
    for name in os.listdir(cleaned_dir):
        if not name.endswith(".json"):
            continue
        target = os.path.join(flattened_dir, name)
        with open(os.path.join(cleaned_dir, name), "r", encoding="utf-8") as infile:
            entries = json.load(infile)
        flattened = [flatten_entry(item) for item in entries]
        with open(target, "w", encoding="utf-8") as outfile:
            json.dump(flattened, outfile, indent=2, ensure_ascii=False)
        print(f"Fichier aplati : {target}")

# === Run the flattening step ===
flatten_all_json_files(cleaned_dir=CLEANED_DIR, flattened_dir=CLEANED_FLATTENED_DIR)


Fichier aplati : /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/json_CLEANED_FLATTENED/xml_pyshark.json
Fichier aplati : /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/json_CLEANED_FLATTENED/lfi_pyshark.json
Fichier aplati : /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/json_CLEANED_FLATTENED/ssti_pyshark.json
Fichier aplati : /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/json_CLEANED_FLATTENED/cmdinj_pyshark.json
Fichier aplati : /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/json_CLEANED_FLATTENED/xss_pyshark.json
Fichier aplati : /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/json_CLEANED_FLATTENED/sql_pyshark.json


In [None]:
import os
import json

# Output directory of the deduplication step; created up front (exist_ok=True keeps re-runs harmless).
CLEANED_FLATTENED_DEDUP_DIR = "/content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/STEP2_json_CLEANED_FLATTENED_DEDUP"
os.makedirs(CLEANED_FLATTENED_DEDUP_DIR, exist_ok=True)

def _freeze_json_value(value):
    """Recursively convert a decoded JSON value into a hashable equivalent.

    Dicts and lists cannot be put in a set, so they are turned into tagged
    tuples. The tags keep a list of pairs from colliding with a dict. Scalars
    are returned unchanged, so keys built from scalar-only entries are the
    same as plain tuples of their values.
    """
    if isinstance(value, dict):
        return ("__dict__", tuple(sorted((k, _freeze_json_value(v)) for k, v in value.items())))
    if isinstance(value, list):
        return ("__list__", tuple(_freeze_json_value(item) for item in value))
    return value


def deduplicate_json_file(src_path, dst_path, key_fields=None):
    """Remove duplicate entries from a JSON file holding a list of dicts.

    The first occurrence of every distinct entry is kept, in the original order.

    Args:
        src_path: Path of the JSON file to read.
        dst_path: Path of the JSON file to write (indented UTF-8 JSON).
        key_fields: Names of the fields that identify a duplicate. A field
            missing from an entry counts as ``None``. When ``key_fields`` is
            ``None``, two entries are duplicates only if all their fields
            match, regardless of key order.

    Field values may be nested lists or dicts; they are frozen into hashable
    tuples before comparison instead of raising ``TypeError``.
    """
    with open(src_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    seen = set()
    unique_data = []
    for entry in data:
        if key_fields is None:
            # Sorting by key makes the signature independent of key order.
            signature = tuple(sorted((k, _freeze_json_value(v)) for k, v in entry.items()))
        else:
            signature = tuple(_freeze_json_value(entry.get(k)) for k in key_fields)
        if signature not in seen:
            seen.add(signature)
            unique_data.append(entry)

    with open(dst_path, "w", encoding="utf-8") as f:
        json.dump(unique_data, f, indent=2, ensure_ascii=False)
    print(f"Dédupliqué : {dst_path} ({len(data)} → {len(unique_data)} entrées)")

# === Run on every JSON file of the flattened directory ===
for filename in os.listdir(CLEANED_FLATTENED_DIR):
    if filename.endswith(".json"):
        src_path = os.path.join(CLEANED_FLATTENED_DIR, filename)
        dst_path = os.path.join(CLEANED_FLATTENED_DEDUP_DIR, filename)
        # Fields whose combined value identifies a duplicate; edit this list to change the criterion.
        key_fields = ["req_method", "req_url", "req_body"]
        deduplicate_json_file(src_path, dst_path, key_fields)  # pass None instead of key_fields to deduplicate on all fields


Dédupliqué : /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/STEP2_json_CLEANED_FLATTENED_DEDUP/xml_pyshark.json (10 → 10 entrées)
Dédupliqué : /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/STEP2_json_CLEANED_FLATTENED_DEDUP/lfi_pyshark.json (671 → 661 entrées)
Dédupliqué : /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/STEP2_json_CLEANED_FLATTENED_DEDUP/ssti_pyshark.json (206 → 167 entrées)
Dédupliqué : /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/STEP2_json_CLEANED_FLATTENED_DEDUP/cmdinj_pyshark.json (77 → 64 entrées)
Dédupliqué : /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/STEP2_json_CLEANED_FLATTENED_DEDUP/xss_pyshark.json (154 → 154 entrées)
Dédupliqué : /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/STEP2_json_CLEANED_FLATTENED_DEDUP/sql_pyshark.json (171 → 160 entrées)
