**Importation des librairies**

In [None]:
#-----------------------IMPORTATION
from google.colab import drive
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report
import joblib
from tqdm import tqdm
import concurrent.futures
# ----------------------- DATASET LOADING
# Directory holding the CSV files. It is also made the working directory, so
# the relative paths used further down resolve inside it.
DATA_DIR = '/content/drive/MyDrive/Dataset/'
os.chdir(DATA_DIR)

# Column names, in file order, applied to the CSV files when they are read
# without a header row.
cols = [
    'srcip', 'sport', 'dstip', 'dsport', 'proto',
    'state', 'dur', 'sbytes', 'dbytes', 'sttl',
    'dttl', 'sloss', 'dloss', 'service', 'Sload',
    'Dload', 'Spkts', 'Dpkts', 'swin', 'dwin',
    'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth',
    'res_bdy_len', 'Sjit', 'Djit', 'Stime', 'Ltime',
    'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat',
    'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',
    'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
    'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
    'attack_cat', 'label',
]

# Dataset loading function
def load_dataset(file):
    """Load one CSV file and reduce it to the flow fields used for training.

    The file is read without a header row using the module-level ``cols``
    names, narrowed to the address, port, protocol, source byte count and
    label columns, renamed to the shorter names listed below, and cast to
    compact dtypes. Rows whose source or destination port cannot be parsed
    as an integer in 0..65535 are dropped.

    Args:
        file: Name of the CSV file; it is joined onto ``DATA_DIR``.

    Returns:
        A DataFrame with columns ``src_ip``, ``dst_ip``, ``src_port``,
        ``dst_port``, ``proto``, ``bytes`` and ``label``, or ``None`` when
        anything goes wrong while loading (the error is printed).
    """
    def parse_port(x):
        """Return x as an int port in 0..65535, or NaN when it is not one."""
        if isinstance(x, float):
            # A float value would stringify as "80.0", which int(..., 0)
            # rejects, so whole-number floats are converted directly.
            # NaN and inf are not whole numbers and end up as NaN below.
            port = int(x) if x.is_integer() else None
        else:
            try:
                # Base 0 accepts decimal text as well as prefixed literals
                # such as "0x0050".
                port = int(str(x), 0)
            except (ValueError, TypeError):
                port = None
        # Values outside this range cannot be represented by the uint16
        # cast applied further down.
        if port is None or not 0 <= port <= 65535:
            return np.nan
        return port

    try:
        # Read the dataset, supplying the column names as the header
        df = pd.read_csv(os.path.join(DATA_DIR, file), header=None, names=cols, low_memory=False)

        # Keep only the columns that have a counterpart in Zeek logs and
        # rename them. rename() returns a new frame, so the column writes
        # below act on an independent object rather than on a selection of
        # the raw data.
        df = df[[
            'srcip', 'dstip', 'sport', 'dsport', 'proto', 'sbytes', 'label'
        ]].rename(columns={
            'srcip': 'src_ip',
            'dstip': 'dst_ip',
            'sport': 'src_port',
            'dsport': 'dst_port',
            'sbytes': 'bytes'
        })

        df['src_port'] = df['src_port'].apply(parse_port)
        df['dst_port'] = df['dst_port'].apply(parse_port)

        # Discard rows whose ports could not be parsed
        df = df.dropna(subset=['src_port', 'dst_port'])

        # Type conversion
        return df.astype({
            'src_ip': 'category',
            'dst_ip': 'category',
            'proto': 'category',
            'src_port': 'uint16',
            'dst_port': 'uint16',
            'bytes': 'uint32',
            'label': 'uint8'
        })

    except Exception as e:
        # Best effort: one unreadable file is reported and skipped by the
        # caller instead of aborting the whole load.
        print(f"Erreur avec {file}: {str(e)}")
        return None

# CSV files of the dataset directory, capped at 20. os.listdir() returns names
# in arbitrary order, so they are sorted first to make the capped selection
# reproducible from one run to the next.
all_files = sorted(f for f in os.listdir(DATA_DIR) if f.endswith('.csv'))[:20]

# Load the files concurrently; executor.map yields results in input order.
with concurrent.futures.ThreadPoolExecutor() as executor:
    datasets = list(
        tqdm(executor.map(load_dataset, all_files), total=len(all_files)))

# load_dataset returns None for a file it could not load; leave those out.
loaded_datasets = [d for d in datasets if d is not None]
if not loaded_datasets:
    # pd.concat on an empty list raises a ValueError that does not say which
    # directory was searched; raise the same type with a clearer message.
    raise ValueError(f"Aucun fichier CSV exploitable dans {DATA_DIR}")

full_data = pd.concat(loaded_datasets, ignore_index=True)
print(f"Données totales chargées: {len(full_data):,} lignes")

# --------------- DATA PREPROCESSING
# Data preprocessing function
def preprocess_data(df):
    """Label-encode the categorical columns and select the feature columns.

    The work is done on a copy, so the caller's frame is left untouched and
    the function can be called again on the same frame without re-encoding
    values that were already encoded.

    Args:
        df: DataFrame with a ``label`` column plus any of the columns handled
            below. Categorical and feature columns that are absent are
            skipped.

    Returns:
        A ``(X, y, encoders)`` tuple: the feature DataFrame, the ``label``
        Series, and a dict mapping each encoded column name to its fitted
        ``LabelEncoder``.

    Raises:
        KeyError: If ``df`` has no ``label`` column.
    """
    # Copy first: the column assignments below would otherwise overwrite the
    # caller's data in place.
    df = df.copy()

    cat_cols = ['proto', 'src_ip', 'dst_ip']
    encoders = {}

    for col in cat_cols:
        if col in df.columns:
            le = LabelEncoder()
            # Values are stringified so each column is encoded as text.
            df[col] = le.fit_transform(df[col].astype(str))
            encoders[col] = le

    # Throughput feature, only derivable when a 'duration' column is supplied.
    # The small constant keeps zero durations from dividing by zero.
    if 'bytes' in df.columns and 'duration' in df.columns:
        df['bytes_per_sec'] = df['bytes'] / (df['duration'] + 1e-6)

    # Keep the candidate features that actually exist, in this order.
    features = ['src_port', 'dst_port', 'proto', 'bytes', 'bytes_per_sec']
    features = [f for f in features if f in df.columns]

    return df[features], df['label'], encoders

X, y, encoders = preprocess_data(full_data)

# Persist every fitted encoder, one file per encoded column.
os.makedirs('preprocessing', exist_ok=True)
for col, encoder in encoders.items():
    joblib.dump(encoder, f'preprocessing/encoder_{col}.joblib')

# --------------------- MODEL TRAINING
# Split features and labels into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Standardisation: the scaler is fitted on the training set only, applied to
# both sets, and saved next to the encoders.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
joblib.dump(scaler, 'preprocessing/scaler.joblib')

# RandomForest training
batch_size = 100000
trees_per_extra_batch = 10
model = RandomForestClassifier(
    n_estimators=100,
    warm_start=True,
    n_jobs=-1,
    verbose=1
)

# With warm_start=True each fit() call keeps the trees already grown and only
# grows the ones added since the previous call: the first batch trains the
# initial 100 trees and every later batch trains the 10 trees added for it.
for i in tqdm(range(0, len(X_train), batch_size), desc="Entraînement par lots"):
    batch_end = min(i + batch_size, len(X_train))
    if i > 0:
        # Grow the forest before fitting this batch (not after), so that
        # n_estimators equals the number of fitted trees once the loop ends.
        model.n_estimators += trees_per_extra_batch
    # y_train still carries the shuffled index from the split, so it is
    # sliced by position to line up with the rows of the scaled X_train.
    model.fit(X_train[i:batch_end], y_train.iloc[i:batch_end])

print("\nRapport de classification final:")
print(classification_report(y_test, model.predict(X_test)))

model_dir = '/content/drive/MyDrive/saved_models'
os.makedirs(model_dir, exist_ok=True)
joblib.dump(model, os.path.join(model_dir, 'network_forensics_model.joblib'))

100%|██████████| 3/3 [00:18<00:00,  6.24s/it]


Données totales chargées: 2,099,994 lignes


Entraînement par lots:   0%|          | 0/17 [00:00<?, ?it/s][Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   13.2s finished
Entraînement par lots:   6%|▌         | 1/17 [00:13<03:31, 13.23s/it][Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.4s finished
Entraînement par lots:  12%|█▏        | 2/17 [00:14<01:34,  6.30s/it][Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.2s finished
Entraînement par lots:  18%|█▊        | 3/17 [00:15<00:55,  3.98s/it][Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.1s finished
Entraînement par lots:  24%|██▎       | 4/17 [00:17<00:37,  2


Rapport de classification final:


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    1.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    5.0s
[Parallel(n_jobs=2)]: Done 260 out of 260 | elapsed:    8.6s finished


              precision    recall  f1-score   support

           0       0.99      1.00      0.99    373521
           1       0.96      0.95      0.95     46478

    accuracy                           0.99    419999
   macro avg       0.98      0.97      0.97    419999
weighted avg       0.99      0.99      0.99    419999



['/content/drive/MyDrive/saved_models/network_forensics_model.joblib']