In [57]:
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (accuracy_score, recall_score, confusion_matrix, balanced_accuracy_score, 
                             average_precision_score, f1_score, roc_auc_score, classification_report, precision_recall_curve)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, RobustScaler,MultiLabelBinarizer
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.impute import SimpleImputer
import joblib

## Lecture du fichier

In [58]:
# Read the column definitions shipped with the dataset.
with open("kddcup.names", "r", encoding="utf-8") as f:
    lines = f.readlines()

# Skip the first line (it holds the attack classes, not a column definition).
lines = lines[1:]

# Keep only the text before the first ":" of each line, stripped of whitespace.
# Blank lines are ignored: an empty entry would add a nameless column and shift
# "label" away from the last CSV field when the names are passed to read_csv.
columns = [line.split(":")[0].strip() for line in lines if line.strip()]

# Add the target column "label" (classification).
columns.append("label")

# Load the data file using the extracted column names.
data = pd.read_csv("kddcup.data.corrected", sep=",", header=None, names=columns)


In [None]:
# Keep only the first 489843 rows of the dataset.
# NOTE(review): this is a head slice, not a random sample — confirm that is intended.
data = data.head(489843)

# Separate the target column from the features.
Y = data['label']
X = data.drop(columns='label')

# One-hot encode the categorical columns.
X_encoded = pd.get_dummies(X)

# Show the first rows of the dataset after one-hot encoding.
print(X_encoded.head())

# Fit an Isolation Forest on the encoded data and keep its per-row predictions.
IF = IsolationForest(
    contamination=0.002,
    n_estimators=100,
    n_jobs=-1,
    random_state=1,
)
outliers_if = IF.fit_predict(X_encoded)


   duration  src_bytes  dst_bytes  land  wrong_fragment  urgent  hot  \
0         0        215      45076     0               0       0    0   
1         0        162       4528     0               0       0    0   
2         0        236       1228     0               0       0    0   
3         0        233       2032     0               0       0    0   
4         0        239        486     0               0       0    0   

   num_failed_logins  logged_in  num_compromised  ...  flag_REJ  flag_RSTO  \
0                  0          1                0  ...     False      False   
1                  0          1                0  ...     False      False   
2                  0          1                0  ...     False      False   
3                  0          1                0  ...     False      False   
4                  0          1                0  ...     False      False   

   flag_RSTOS0  flag_RSTR  flag_S0  flag_S1  flag_S2  flag_S3  flag_SF  \
0        False      Fals

In [60]:

# Rebuild the features and the target, then report their dimensions before encoding.
X, Y = data.drop(columns='label'), data['label']

print(f"Avant encodage - X.shape: {X.shape}")
print(f"Avant encodage - Y.shape: {Y.shape}")

# One-hot encode the categorical columns.
X_encoded = pd.get_dummies(X)

# Report the dimensions after encoding.
print(f"Apr√®s encodage - X_encoded.shape: {X_encoded.shape}")




Avant encodage - X.shape: (4898431, 41)
Avant encodage - Y.shape: (4898431,)
Apr√®s encodage - X_encoded.shape: (4898431, 122)


In [61]:

# 1. Turn every label into a list of labels (several labels may be comma-separated).
#    Values that are not strings are left untouched, so re-running this cell after
#    the column already holds lists no longer fails on `.split`.
data['label'] = data['label'].apply(lambda x: x.split(',') if isinstance(x, str) else x)

# 2. Encode the label lists as a binary indicator matrix.
mlb = MultiLabelBinarizer()
Y_encoded = mlb.fit_transform(data['label'])

# 3. Show the detected classes and a preview of the encoded rows.
print("Classes d√©tect√©es:", mlb.classes_)
print("Aper√ßu des donn√©es encod√©es:")
print(Y_encoded[:5])  # first 5 rows

# 4. Wrap the matrix in a DataFrame with one column per class.
#    Reusing data.index keeps the targets aligned with the features derived from `data`.
Y_encoded_df = pd.DataFrame(Y_encoded, columns=mlb.classes_, index=data.index)
print(Y_encoded_df.head())


Classes d√©tect√©es: ['back.' 'buffer_overflow.' 'ftp_write.' 'guess_passwd.' 'imap.'
 'ipsweep.' 'land.' 'loadmodule.' 'multihop.' 'neptune.' 'nmap.' 'normal.'
 'perl.' 'phf.' 'pod.' 'portsweep.' 'rootkit.' 'satan.' 'smurf.' 'spy.'
 'teardrop.' 'warezclient.' 'warezmaster.']
Aper√ßu des donn√©es encod√©es:
[[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]]
   back.  buffer_overflow.  ftp_write.  guess_passwd.  imap.  ipsweep.  land.  \
0      0                 0           0              0      0         0      0   
1      0                 0           0              0      0         0      0   
2      0                 0           0              0      0         0      0   
3      0                 0           0              0      0         0      0   
4      0                 0           0  

In [62]:
# Threshold: classes with fewer than `min_samples` occurrences are pooled together.
min_samples = 5

# Count occurrences per class. The pooled "autre" column is left out of the count so
# that re-running this cell never treats it as a class of its own.
class_counts = Y_encoded_df.drop(columns='autre', errors='ignore').sum()
rare_classes = class_counts[class_counts < min_samples].index

# Pool the rare classes into a single "autre" column, but only when there is something
# to pool: otherwise a re-run would reset "autre" to zeros, and a first run without
# rare classes would add an all-zero label column.
if len(rare_classes) > 0:
    pooled_columns = list(rare_classes)
    if 'autre' in Y_encoded_df.columns:
        # Keep whatever was pooled by a previous run of this cell.
        pooled_columns.append('autre')
    # 1 when the row carries at least one pooled label, 0 otherwise.
    autre = Y_encoded_df[pooled_columns].max(axis=1)
    # Drop the rare class columns and (re)write "autre" as the last column.
    Y_encoded_df = Y_encoded_df.drop(columns=rare_classes).assign(autre=autre)


<h1 style="color:blue;">Approches supervisées</h1>

In [63]:
# Candidate supervised models, keyed by a short display name.
models = dict(
    RF=RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=1),
)

In [64]:
def scoring(Ytest,Pred,Prob):
    """Print the confusion matrix and three metrics, and return the metrics.

    Parameters
    ----------
    Ytest : true labels.
    Pred : predicted labels, compared with ``Ytest``.
    Prob : scores passed to ``average_precision_score``.

    Returns
    -------
    tuple
        ``(balanced accuracy, F1 score, average precision)``.
    """
    # All metrics are computed before anything is printed.
    ba, f1, ap = (
        balanced_accuracy_score(Ytest, Pred),
        f1_score(Ytest, Pred),
        average_precision_score(Ytest, Prob),
    )

    print('Matrice de confusion') 
    print(confusion_matrix(Ytest, Pred))
    for template, value in (
        ('Balanced Accuracy : %.3f', ba),
        ('F1 Score : %.3f', f1),
        ('Average precision score : %.3f', ap),
    ):
        print(template % value)

    return ba, f1, ap
        

In [65]:
def original_approach(Xtrain,Ytrain,Xtest,Ytest,model):
    """Fit ``model`` and return the test predictions with the best F1, plus the scores.

    The model is fitted on the training data. Its default predictions are then compared,
    by F1 score on ``Ytest``, with predictions obtained by thresholding the second
    column of ``predict_proba`` at 1.00, 0.99, ..., 0.01; the best candidate is returned.

    NOTE(review): the threshold is selected using ``Ytest`` itself, so any metric later
    computed on the returned predictions is optimistic — confirm this is intended.

    Parameters
    ----------
    Xtrain, Ytrain : training features and labels passed to ``model.fit``.
    Xtest, Ytest : evaluation features and true labels.
    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``.

    Returns
    -------
    tuple
        ``(best_pred, Prob)``: the selected predictions and the second
        ``predict_proba`` column for ``Xtest``.
    """
    model.fit(Xtrain,Ytrain)
    Pred=model.predict(Xtest)
    Prob=model.predict_proba(Xtest)[:,1]

    best_f1 = f1_score(Ytest,Pred)
    best_pred = Pred

    # Integer steps divided by 100 give the closest float to each two-decimal threshold;
    # a float-step arange can land slightly above or below the intended value, which
    # matters for ``>=`` when a probability equals the threshold.
    for threshold in np.arange(100, 0, -1) / 100.0:
        # Cast to 0/1 integers so the function does not return a boolean mask in some
        # cases and the model's own predictions in others.
        candidate = (Prob >= threshold).astype(int)
        candidate_f1 = f1_score(Ytest, candidate)
        if candidate_f1 > best_f1:
            best_f1 = candidate_f1
            best_pred = candidate

    return best_pred, Prob

In [66]:
def Tomek(X,Y):
    """Resample ``(X, Y)`` with ``TomekLinks`` and return the resampled pair."""
    sampler = TomekLinks(n_jobs=-1)
    return sampler.fit_resample(X, Y)


In [67]:
def undersampling_approach(Xtrain,Ytrain,Xtest,Ytest,model):
    """Resample the training set with ``Tomek`` and run ``original_approach`` on it.

    Returns the ``(Pred, Prob)`` pair produced by ``original_approach``.
    """
    resampled_X, resampled_Y = Tomek(Xtrain, Ytrain)
    return original_approach(resampled_X, resampled_Y, Xtest, Ytest, model)

In [68]:
def Smote(X,Y):
    """Resample ``(X, Y)`` with ``SMOTE`` (5 neighbours, fixed seed) and return the pair."""
    sampler = SMOTE(random_state=1, k_neighbors=5)
    return sampler.fit_resample(X, Y)

In [69]:
def oversampling_approach(Xtrain,Ytrain,Xtest,Ytest,model):
    """Resample the training set with ``Smote`` and run ``original_approach`` on it.

    Returns the ``(Pred, Prob)`` pair produced by ``original_approach``.
    """
    resampled_X, resampled_Y = Smote(Xtrain, Ytrain)
    return original_approach(resampled_X, resampled_Y, Xtest, Ytest, model)

In [70]:
def balancing_approach(Xtrain,Ytrain,Xtest,Ytest,model):
    """Run ``original_approach`` on a copy of ``model`` using balanced class weights.

    The model is cloned so the caller's estimator is left untouched. When the clone
    exposes a ``class_weight`` parameter it is set to ``'balanced'``; otherwise a
    warning is emitted and the clone is used as is, instead of silently attaching an
    attribute the estimator never reads.

    Parameters
    ----------
    Xtrain, Ytrain : training features and labels.
    Xtest, Ytest : evaluation features and true labels.
    model : estimator to clone.

    Returns
    -------
    tuple
        The ``(Pred, Prob)`` pair produced by ``original_approach``.
    """
    cloned_model = clone(model)
    if 'class_weight' in cloned_model.get_params(deep=False):
        # set_params goes through the estimator's parameter API rather than setattr.
        cloned_model.set_params(class_weight='balanced')
    else:
        warnings.warn(
            f"{type(cloned_model).__name__} has no 'class_weight' parameter; "
            "balancing_approach is running it without class weighting."
        )
    return original_approach(Xtrain, Ytrain, Xtest, Ytest, cloned_model)

In [71]:
def isolation_forest(Xtrain,Ytrain,Xtest):
    """Fit an ``IsolationForest`` on ``Xtrain`` and score ``Xtest``.

    ``Ytrain`` is accepted but not used. The returned array is the negated
    ``decision_function`` of the fitted forest on ``Xtest``.
    """
    detector = IsolationForest(n_estimators=1000, n_jobs=-1, random_state=1)
    detector.fit(Xtrain)
    return -detector.decision_function(Xtest)

In [72]:
def LOF_novelty(Xtrain, Ytrain, Xtest):
    """Fit a novelty-mode ``LocalOutlierFactor`` on the label-0 training rows and score ``Xtest``.

    Parameters
    ----------
    Xtrain : DataFrame or ndarray of training features.
    Ytrain : training labels; a DataFrame, a Series, or a 1-D / 2-D array. When
        two-dimensional, only the first column is used to select rows.
    Xtest : features to score.

    Returns
    -------
    The negated ``decision_function`` of the fitted detector on ``Xtest``.

    NOTE(review): rows are selected where the (first) label column equals 0. With a
    one-hot label matrix the first column is just the first class, not necessarily a
    "normal" flag — confirm which column should drive the selection.
    """
    LOF = LocalOutlierFactor(n_neighbors=200, n_jobs=-1, novelty=True)

    # Work on a plain array so Series, DataFrame and ndarray labels are all handled,
    # including 1-D labels for which ``[:, 0]`` would raise.
    labels = np.asarray(Ytrain)
    if labels.ndim > 1:
        labels = labels[:, 0]

    # Positional boolean mask: valid for both DataFrame and ndarray features and
    # independent of any pandas index alignment.
    keep_rows = labels == 0
    Xtrain_subset = Xtrain[keep_rows]

    LOF.fit(Xtrain_subset)
    Prob = -LOF.decision_function(Xtest)
    return Prob


In [73]:

def _contains_nan(X):
    """Return True when ``X`` (DataFrame or array-like) holds at least one missing value."""
    # np.asarray makes the check work for plain arrays too, which have no ``.values``.
    return bool(np.asarray(pd.isnull(X)).any())


def comparaison(Xtrain_1, Xtest_1, Ytrain, Ytest, models, cv=3, n_jobs=-1, model_path='best_model.pkl'):
    """Tune, evaluate and plot every model of ``models`` on multi-label targets; save the best.

    Each estimator is placed in a ``RobustScaler`` + model pipeline
    (``LogisticRegression`` and ``RandomForestClassifier`` are first wrapped in
    ``OneVsRestClassifier``). Estimators with a parameter grid are tuned with
    ``GridSearchCV`` using ``f1_samples``; the others are fitted directly. For every
    model a classification report is printed and one precision-recall curve per label
    is plotted. The pipeline with the highest score is dumped with joblib.

    Parameters
    ----------
    Xtrain_1, Xtest_1 : training and test features (DataFrame or array).
    Ytrain, Ytest : two-dimensional label indicator data (DataFrame or array), one
        column per label. Column names of a DataFrame ``Ytrain`` are reused in the
        report and in the plot legend.
    models : dict mapping a display name to an estimator.
    cv : number of cross-validation folds given to ``GridSearchCV`` (default 3).
    n_jobs : parallel jobs given to ``GridSearchCV`` (default -1). Lower it when the
        search runs out of memory on large inputs.
    model_path : file the best pipeline is written to (default ``'best_model.pkl'``).

    Returns
    -------
    None
    """
    best_model = None
    best_score = 0

    # Impute missing values with the training median when either split contains NaN
    # (a NaN that only appears in the test split would otherwise reach the models).
    if _contains_nan(Xtrain_1) or _contains_nan(Xtest_1):
        print("üö® Attention : Des NaN d√©tect√©s dans Xtrain ! Remplacement par la m√©diane.")
        imputer = SimpleImputer(strategy='median')
        Xtrain_1 = imputer.fit_transform(Xtrain_1)
        Xtest_1 = imputer.transform(Xtest_1)

    # Step 1: LOF and Isolation Forest. Placeholders: only the banners are printed,
    # the detectors themselves are not run here.
    print("Local Outlier Factor : D√©tection de nouveaut√©s")
    print("Isolation Forest : D√©tection d'outliers")

    # Remember the label names before the targets are converted to plain arrays.
    label_names = None
    if isinstance(Ytrain, pd.DataFrame):
        label_names = [str(col) for col in Ytrain.columns]

    Ytrain = np.asarray(Ytrain)
    Ytest = np.asarray(Ytest)

    # Only use the names when they match the number of label columns being evaluated.
    if label_names is not None and len(label_names) != Ytest.shape[1]:
        label_names = None

    # No manual scaling per model name: every pipeline below starts with its own
    # RobustScaler, so pre-scaling would only create another full copy of the data.
    Xtrain, Xtest = Xtrain_1, Xtest_1

    # Step 2: evaluate each model.
    for name, base_model in models.items():
        print(f'***************** {name} *****************')

        # Wrap the supported estimators so one binary classifier is fitted per label.
        wrap_in_ovr = isinstance(base_model, (LogisticRegression, RandomForestClassifier))
        model = OneVsRestClassifier(base_model) if wrap_in_ovr else base_model

        pipe = Pipeline([('scaler', RobustScaler()), ('model', model)])

        # Hyper-parameter grid, depending on the estimator type.
        param_grid = {}
        if isinstance(base_model, LogisticRegression):
            param_grid = {
                'model__estimator__C': [0.1, 1, 10],
                'model__estimator__solver': ['liblinear', 'saga']
            }
        elif isinstance(base_model, RandomForestClassifier):
            param_grid = {
                'model__estimator__n_estimators': [50, 100, 200],
                'model__estimator__max_depth': [None, 10, 20],
                'model__estimator__min_samples_split': [2, 5, 10]
            }

        if param_grid:
            grid_search = GridSearchCV(pipe, param_grid, cv=cv, scoring='f1_samples', n_jobs=n_jobs)
            grid_search.fit(Xtrain, Ytrain)
            current_model = grid_search.best_estimator_
            current_score = grid_search.best_score_
        else:
            current_model = pipe.fit(Xtrain, Ytrain)
            # NOTE(review): this is the estimator's default test-set score, whereas the
            # branch above yields a cross-validated f1_samples; the two are compared
            # below as if they were the same metric — confirm this is acceptable.
            current_score = current_model.score(Xtest, Ytest)

        # Keep track of the best model seen so far.
        if current_score > best_score:
            best_score = current_score
            best_model = current_model

        print('****** Approche originale ******')
        if hasattr(current_model, 'predict_proba'):
            Prob = current_model.predict_proba(Xtest)
            # Probabilities: a label is predicted when its probability reaches 0.5.
            Pred = (Prob >= 0.5).astype(int)
        else:
            # decision_function values are not probabilities, so 0.5 is not a
            # meaningful cut-off; rely on the model's own decision rule instead.
            Prob = current_model.decision_function(Xtest)
            Pred = current_model.predict(Xtest)
        print(classification_report(Ytest, Pred, target_names=label_names))

        # One precision-recall curve per label.
        fig, ax = plt.subplots(figsize=(8, 6))
        for i in range(Ytest.shape[1]):
            precision, recall, _ = precision_recall_curve(Ytest[:, i], Prob[:, i])
            curve_label = label_names[i] if label_names is not None else f'Label {i}'
            ax.plot(recall, precision, lw=2, label=curve_label)

        ax.set_xlabel("Recall")
        ax.set_ylabel("Precision")
        ax.set_title(f"Courbes Pr√©cision-Rappel pour {name}")
        ax.legend()
        plt.show()

    # Persist the best model. An explicit None test is used because the truth value
    # of a Pipeline depends on its length, not on whether a model was selected.
    if best_model is not None:
        print(f"üíæ Enregistrement du meilleur mod√®le avec un score de {best_score}")
        joblib.dump(best_model, model_path)
    else:
        print("‚ùå Aucun mod√®le trouv√©.")


## test

In [74]:
# 50/50 split of the encoded features and label indicators, stratified on the labels.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X_encoded,
    Y_encoded_df,
    test_size=0.5,
    stratify=Y_encoded_df,
    random_state=1,
)

In [75]:
# Run the model comparison on the train/test split.
comparaison(
    Xtrain_1=Xtrain,
    Xtest_1=Xtest,
    Ytrain=Ytrain,
    Ytest=Ytest,
    models=models,
)

Local Outlier Factor : D√©tection de nouveaut√©s
Isolation Forest : D√©tection d'outliers
***************** RF *****************


MemoryError: Unable to allocate 187. MiB for an array with shape (15, 1632810) and data type float64