In [1]:
from google.colab import drive
import os

# --- Unmount any existing Drive mount (if needed) ---
drive.flush_and_unmount()

# --- (Optional) Clear local /content/drive mountpoint (disabled for safety) ---
drive_path = '/content/drive'
if os.path.exists(drive_path) and os.listdir(drive_path):
    print("Notice: /content/drive is not empty. Skipping automatic deletion for safety.")
    # Uncomment below to forcibly clear the mountpoint (NOT your actual Drive!)
    # !rm -rf /content/drive/*

# --- Mount Google Drive ---
drive.mount('/content/drive')

Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive


In [7]:
import os
import json

# --- Define input and output directories (adjust as needed) ---
PCAP_DIR = "/content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/CERIST_RAW_DATA/attack_classes_pcap"
ENRICHED_JSON_DIR = "/content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/STEP1_json_enriched_pyshark"

# --- Create output directory ---
os.makedirs(ENRICHED_JSON_DIR, exist_ok=True)

In [8]:
# --- Install system dependency: TShark (required by PyShark) ---
!apt-get update
!apt-get install -y tshark

# --- Allow TShark (dumpcap) to capture without root password ---
!chmod +x /usr/bin/dumpcap  # Be cautious: ensures executable permission only

# --- Install Python dependencies ---
!pip install pyshark
!pip install nest_asyncio

# --- Import libraries ---
import json
import pyshark
import nest_asyncio
from os import listdir
from os.path import join

# --- Enable nested event loop support (required in Colab) ---
nest_asyncio.apply()

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [3,295 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,553 kB]
Fetched 5,103 kB in 3s (1,687 kB/s)
Reading package lists... Done
W: S

In [9]:
pcap_file_path = "/content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/CERIST_RAW_DATA/attack_classes_pcap/wfuzz_lfi.pcap"
all_http_fields = set()

try:
    # Use PyShark to capture packets from the specific file
    cap = pyshark.FileCapture(pcap_file_path)
    # Iterate through each packet
    for pkt in cap:
        # Check if the packet has the HTTP layer and the _all_fields attribute
        if hasattr(pkt, 'http') and hasattr(pkt.http, '_all_fields'):
             all_http_fields.update(pkt.http._all_fields.keys())
    # Close the capture file
    cap.close()

except Exception as e:
    print(f"An error occurred while processing the pcap file: {e}")


print(all_http_fields)

An error occurred while processing the pcap file: [Errno 2] No such file or directory: /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/CERIST_RAW_DATA/attack_classes_pcap/wfuzz_xss.pcap
set()


In [10]:
import json
import pyshark

# Only these fields will be kept from _all_fields
SELECTED_HTTP_FIELDS = {
    '', 'http.time', 'http.request_number', '_ws.expert.severity', 'http.request.method',
    'http.prev_response_in', 'http.connection', 'http.request',
    '_ws.expert.message', 'http.prev_request_in', 'http.response', 'http.request.uri',
    '_ws.expert.group', 'http.accept', 'http.request_in', 'http.response.phrase',
    'http.file_data', 'http.server', 'http.chat', 'http.host', 'http.request.line',
    'http.response_number', 'http.date', 'http.response_for.uri', 'http.content_type',
    'http.user_agent', 'http.response.code.desc', 'http.content_length',
    'http.response.version', 'http.request.version', '_ws.expert', 'http.response.line',
    'http.response.code', 'http.request.full_uri'
}

def extract_http_sessions_selected_fields(pcap_path, attack_tag, output_path, max_errors=5):
    """
    Extracts HTTP request/response information using only the specified fields.
    """
    cap = pyshark.FileCapture(pcap_path, display_filter="http")
    current_streams = {}
    error_count = 0

    for pkt in cap:
        try:
            stream_id = int(pkt.tcp.stream)
            is_request = hasattr(pkt.http, "request_method")
            is_response = hasattr(pkt.http, "response_code")

            def filter_fields(all_fields):
                # Filter only the selected fields and return as dict
                return {k: v for k, v in all_fields.items() if k in SELECTED_HTTP_FIELDS}

            # Initialize or retrieve the HTTP stream record
            entry = current_streams.get(stream_id, {
                "attack_tag": attack_tag,
                "src_ip": pkt.ip.src,
                "dst_ip": pkt.ip.dst,
                "src_port": pkt.tcp.srcport,
                "dst_port": pkt.tcp.dstport,
                "req_method": None,
                "req_url": None,
                "req_headers": {},
                "req_body": None,
                "res_headers": {},
                "res_body": None
            })

            if is_request:
                entry["req_method"] = getattr(pkt.http, "request_method", None)
                entry["req_url"] = getattr(pkt.http, "request_full_uri", None)
                entry["req_headers"] = filter_fields(dict(pkt.http._all_fields))
                entry["req_body"] = getattr(pkt.http, "file_data", None)

            if is_response:
                entry["res_headers"] = filter_fields(dict(pkt.http._all_fields))
                entry["res_body"] = getattr(pkt.http, "file_data", None)

            current_streams[stream_id] = entry

        except Exception as e:
            if error_count < max_errors:
                print(f"[!] Skipping packet due to error: {e}")
                error_count += 1
            continue

    cap.close()

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(list(current_streams.values()), f, indent=2)


In [18]:
# --- Batch process all .pcap files using PyShark ---
for file in os.listdir(PCAP_DIR):
    if file.endswith(".pcap"):
        # Derive attack type from filename
        attack_tag = file.replace("wfuzz_", "").replace(".pcap", "")

        # Construct input and output paths
        pcap_path = os.path.join(PCAP_DIR, file)
        output_path = os.path.join(ENRICHED_JSON_DIR, f"{attack_tag}_pyshark.json")

        # Process the file
        print(f"\n[*] Processing file: {file}")
        extract_http_sessions_selected_fields(pcap_path, attack_tag, output_path)
        print(f"[✔] Enriched HTTP sessions saved to: {output_path}")


[*] Processing file: wfuzz_xml.pcap
[✔] Enriched HTTP sessions saved to: /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/STEP1_json_enriched_pyshark/xml_pyshark.json

[*] Processing file: wfuzz_lfi.pcap
[✔] Enriched HTTP sessions saved to: /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/STEP1_json_enriched_pyshark/lfi_pyshark.json

[*] Processing file: wfuzz_ssti.pcap
[✔] Enriched HTTP sessions saved to: /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/STEP1_json_enriched_pyshark/ssti_pyshark.json

[*] Processing file: wfuzz_cmdinj.pcap
[✔] Enriched HTTP sessions saved to: /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/STEP1_json_enriched_pyshark/cmdinj_pyshark.json

[*] Processing file: wfuzz_sql_2.pcap
[✔] Enriched HTTP sessions saved to: /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/STEP1_json_enriched_pyshark/sql_2_pyshark.json

[*] Processing file: wfuzz_sql_3.pcap
[✔] Enriched HTTP sessions saved to: /c

In [21]:
import os
import glob
import json

def combine_and_cleanup(directory, attack_prefix):
    """
    Combine tous les fichiers JSON du dossier 'directory' commençant par 'attack_prefix'
    dans un fichier '{attack_prefix}_pyshark.json', puis supprime les fichiers sources SAUF le fichier combiné.
    Tous les champs 'attack_tag' sont réécrits pour correspondre à 'attack_prefix'.
    """
    pattern = os.path.join(directory, f"{attack_prefix}*.json")
    file_list = glob.glob(pattern)
    combined_data = []

    for filename in file_list:
        with open(filename, "r", encoding="utf-8") as f:
            data = json.load(f)
            if isinstance(data, list):
                combined_data.extend(data)
            else:
                combined_data.append(data)

    # ---- Normalisation du tag ----
    for entry in combined_data:
        entry['attack_tag'] = attack_prefix

    # Création du fichier combiné
    output_file = os.path.join(directory, f"{attack_prefix}_pyshark.json")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(combined_data, f, indent=2, ensure_ascii=False)

    # Suppression des fichiers source, sauf le fichier combiné !
    for filename in file_list:
        # On compare les chemins absolus pour éviter tout piège
        if os.path.abspath(filename) == os.path.abspath(output_file):
            continue  # Ne pas supprimer le fichier combiné !
        try:
            os.remove(filename)
            print(f"Supprimé : {filename}")
        except Exception as e:
            print(f"Erreur lors de la suppression de {filename} : {e}")

    print(f"\nFichier créé : {output_file} ({len(combined_data)} lignes)\n")

# Exemple d'appel :
combine_and_cleanup('/content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/STEP1_json_enriched_pyshark/', 'xss')
combine_and_cleanup('/content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/STEP1_json_enriched_pyshark/', 'sql')


Fichier créé : /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/STEP1_json_enriched_pyshark/xss_pyshark.json (154 lignes)


Fichier créé : /content/drive/MyDrive/BINOME_WORK/STAGE_CERIST/DATA_FORMATING/STEP1_json_enriched_pyshark/sql_pyshark.json (171 lignes)

