In [103]:
from collections import defaultdict, deque

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.metrics import (accuracy_score, classification_report,
                             make_scorer, recall_score)
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

## Manipulating the data
### Loading and cleaning the data with pandas

In [104]:
# Load the datasets with the specified file paths
no_attack_path = "./BATADAL_dataset03.csv"
with_attacks_path = "./BATADAL_dataset04.csv"
attacks_info_path = "./Attacks_TrainingDataset2.csv"

# Read the data into pandas DataFrames
no_attacks = pd.read_csv(no_attack_path)
with_attacks = pd.read_csv(with_attacks_path)
batadal_attacks_info = pd.read_csv(attacks_info_path)

# Strip any leading/trailing spaces in column names
no_attacks.columns = no_attacks.columns.str.strip()
with_attacks.columns = with_attacks.columns.str.strip()

#missing columns
# Afficher les colonnes manquantes ou différentes
set_1 = set(no_attacks.columns)
set_2 = set(with_attacks.columns)

# Colonnes présentes dans l'un mais pas l'autre
missing_in_2 = set_1 - set_2
missing_in_1 = set_2 - set_1

print("Colonnes présentes seulement dans pandas_dataset_1:", missing_in_2)
print("Colonnes présentes seulement dans pandas_dataset_2:", missing_in_1)

batadal = pd.concat([no_attacks,with_attacks], ignore_index=True)
batadal

Colonnes présentes seulement dans pandas_dataset_1: set()
Colonnes présentes seulement dans pandas_dataset_2: set()


Unnamed: 0,DATETIME,L_T1,L_T2,L_T3,L_T4,L_T5,L_T6,L_T7,F_PU1,S_PU1,...,P_J256,P_J289,P_J415,P_J302,P_J306,P_J307,P_J317,P_J14,P_J422,ATT_FLAG
0,06/01/14 00,0.509730,2.049003,3.191145,2.792634,2.656091,5.316831,1.562321,98.998444,1.0,...,87.605774,26.495605,84.206619,18.901676,81.983734,18.791777,67.125603,29.387470,28.487471,0
1,06/01/14 01,0.412580,2.009072,3.642565,2.831673,3.126387,5.494855,1.852043,99.095901,1.0,...,89.448341,26.487326,85.900085,18.849329,82.150589,18.739643,67.178696,29.354256,28.454256,0
2,06/01/14 02,0.320112,1.986093,4.140192,3.256733,3.574601,5.500000,2.246126,98.420959,1.0,...,91.056114,26.487364,86.582474,19.597170,83.988579,19.496712,72.425293,29.354538,28.454538,0
3,06/01/14 03,0.332879,2.009203,4.673478,3.744497,3.952379,5.500000,3.203573,97.575172,1.0,...,92.594353,26.575815,88.020546,26.028486,64.670486,25.922703,76.275040,29.449951,28.549952,0
4,06/01/14 04,0.483496,2.089049,5.237937,4.409456,3.504676,5.500000,4.439714,97.351059,1.0,...,94.473099,26.723457,90.422462,26.209970,64.746620,26.104692,76.703529,29.574265,28.674263,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12933,24/12/16 20,2.650000,2.370000,3.850000,3.040000,3.820000,4.940000,2.190000,120.080000,1.0,...,70.030000,27.380000,84.140000,18.450000,81.670000,18.340000,66.040000,29.880000,28.980000,-999
12934,24/12/16 21,2.240000,2.560000,3.420000,2.920000,3.690000,5.020000,1.970000,119.120000,1.0,...,68.600000,27.660000,83.460000,25.400000,60.850000,25.280000,66.890000,30.190000,29.290000,-999
12935,24/12/16 22,1.910000,2.760000,2.950000,2.490000,2.700000,5.140000,1.870000,120.710000,1.0,...,85.630000,26.840000,82.820000,24.460000,59.560000,24.340000,66.080000,29.680000,28.780000,-999
12936,24/12/16 23,1.520000,2.520000,3.330000,2.030000,1.690000,5.100000,1.390000,120.020000,1.0,...,86.150000,25.780000,103.630000,24.770000,59.010000,24.650000,66.420000,28.980000,28.080000,-999


We need to fix the ATT_FLAG column (and all columns for that matter... not taking any chances...)
Some columns have too complex names with special caracters.

In [105]:
# Rename columns to simplify access
batadal_attacks_info.rename(
    columns={
        "Starting time [dd/mm/YY HH]": "Start_Time",
        "Ending time [dd/mm/YY HH]": "End_Time",
    },
    inplace=True,
)

We see that Starting time [dd/mm/YY HH], Ending time [dd/mm/YY HH] and DATETIME columns should be dates instead of strings/objects. We can use pandas to convert this data.

In [106]:
# Convert 'DATETIME' columns to datetime format for easy comparison
batadal["DATETIME"] = pd.to_datetime(
    batadal["DATETIME"],
    format="%d/%m/%y %H",
    dayfirst=True,
    errors="coerce",
)
batadal.iloc[:, [0, 1, 20, 25, 44]].info()

batadal_attacks_info["Start_Time"] = pd.to_datetime(
    batadal_attacks_info["Start_Time"],
    format="%d/%m/%Y %H",
    dayfirst=True,
    errors="coerce",
)
batadal_attacks_info["End_Time"] = pd.to_datetime(
    batadal_attacks_info["End_Time"],
    format="%d/%m/%Y %H",
    dayfirst=True,
    errors="coerce",
)
print("\n")
batadal_attacks_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12938 entries, 0 to 12937
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   DATETIME  12938 non-null  datetime64[ns]
 1   L_T1      12938 non-null  float64       
 2   F_PU7     12938 non-null  float64       
 3   S_PU9     12938 non-null  float64       
 4   ATT_FLAG  12938 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(1)
memory usage: 505.5 KB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   ID                  7 non-null      int64         
 1   Start_Time          7 non-null      datetime64[ns]
 2   End_Time            7 non-null      datetime64[ns]
 3   Duration [hours]    7 non-null      int64         
 4   Attack description  7 non-null      object        
 5   

In [107]:
# Loop through each attack period and set ATT_FLAG and T1_ATT_FLAG to T7_ATT_FLAG
for index, row in batadal_attacks_info.iterrows():
    # Extract start and end of the attack period from the attacks dataset
    attack_start = pd.to_datetime(row["Start_Time"], format="%d/%m/%Y %H")
    attack_end = pd.to_datetime(row["End_Time"], format="%d/%m/%Y %H")

    # Update ATT_FLAG for known attack periods
    batadal.loc[
        (batadal["DATETIME"] >= attack_start)
        & (batadal["DATETIME"] <= attack_end)
        & (batadal["ATT_FLAG"] == -999),
        "ATT_FLAG",
    ] = 1

    # Set ATT_FLAG to 0 for non-attack periods (everything else remaining)
    batadal.loc[
        (batadal["ATT_FLAG"] == -999), "ATT_FLAG"
    ] = 0

# Calculate the percentage of rows labeled as attack (ATT_FLAG == 1)
final_attack_count = batadal[batadal["ATT_FLAG"] == 1]["ATT_FLAG"].count()
final_attack_percentage = (final_attack_count / len(batadal)* 100)
print(
    f"Percentage of attack data after cross-referencing: {final_attack_percentage:.2f}%\n"
)
print(
    f"Nb of attack data after cross-referencing: {final_attack_count}\n"
)

batadal

Percentage of attack data after cross-referencing: 1.75%

Nb of attack data after cross-referencing: 227



Unnamed: 0,DATETIME,L_T1,L_T2,L_T3,L_T4,L_T5,L_T6,L_T7,F_PU1,S_PU1,...,P_J256,P_J289,P_J415,P_J302,P_J306,P_J307,P_J317,P_J14,P_J422,ATT_FLAG
0,2014-01-06 00:00:00,0.509730,2.049003,3.191145,2.792634,2.656091,5.316831,1.562321,98.998444,1.0,...,87.605774,26.495605,84.206619,18.901676,81.983734,18.791777,67.125603,29.387470,28.487471,0
1,2014-01-06 01:00:00,0.412580,2.009072,3.642565,2.831673,3.126387,5.494855,1.852043,99.095901,1.0,...,89.448341,26.487326,85.900085,18.849329,82.150589,18.739643,67.178696,29.354256,28.454256,0
2,2014-01-06 02:00:00,0.320112,1.986093,4.140192,3.256733,3.574601,5.500000,2.246126,98.420959,1.0,...,91.056114,26.487364,86.582474,19.597170,83.988579,19.496712,72.425293,29.354538,28.454538,0
3,2014-01-06 03:00:00,0.332879,2.009203,4.673478,3.744497,3.952379,5.500000,3.203573,97.575172,1.0,...,92.594353,26.575815,88.020546,26.028486,64.670486,25.922703,76.275040,29.449951,28.549952,0
4,2014-01-06 04:00:00,0.483496,2.089049,5.237937,4.409456,3.504676,5.500000,4.439714,97.351059,1.0,...,94.473099,26.723457,90.422462,26.209970,64.746620,26.104692,76.703529,29.574265,28.674263,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12933,2016-12-24 20:00:00,2.650000,2.370000,3.850000,3.040000,3.820000,4.940000,2.190000,120.080000,1.0,...,70.030000,27.380000,84.140000,18.450000,81.670000,18.340000,66.040000,29.880000,28.980000,0
12934,2016-12-24 21:00:00,2.240000,2.560000,3.420000,2.920000,3.690000,5.020000,1.970000,119.120000,1.0,...,68.600000,27.660000,83.460000,25.400000,60.850000,25.280000,66.890000,30.190000,29.290000,0
12935,2016-12-24 22:00:00,1.910000,2.760000,2.950000,2.490000,2.700000,5.140000,1.870000,120.710000,1.0,...,85.630000,26.840000,82.820000,24.460000,59.560000,24.340000,66.080000,29.680000,28.780000,0
12936,2016-12-24 23:00:00,1.520000,2.520000,3.330000,2.030000,1.690000,5.100000,1.390000,120.020000,1.0,...,86.150000,25.780000,103.630000,24.770000,59.010000,24.650000,66.420000,28.980000,28.080000,0


In [108]:
# Prepare the data
columns_to_exclude = ["DATETIME", "ATT_FLAG"]

initial_batadal_colums = batadal.columns

for n in range(1,4):
        for col in [col for col in initial_batadal_colums if col not in columns_to_exclude]:
                # Create a new column representing the algebraic variation between current and previous row
                batadal[f"{col}_past_{n}"] = batadal[col].shift(n)

# Drop the first row with NaN values due to the shift operation
batadal = batadal.dropna()

batadal


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

Unnamed: 0,DATETIME,L_T1,L_T2,L_T3,L_T4,L_T5,L_T6,L_T7,F_PU1,S_PU1,...,P_J300_past_3,P_J256_past_3,P_J289_past_3,P_J415_past_3,P_J302_past_3,P_J306_past_3,P_J307_past_3,P_J317_past_3,P_J14_past_3,P_J422_past_3
3,2014-01-06 03:00:00,0.332879,2.009203,4.673478,3.744497,3.952379,5.500000,3.203573,97.575172,1.0,...,26.426495,87.605774,26.495605,84.206619,18.901676,81.983734,18.791777,67.125603,29.387470,28.487471
4,2014-01-06 04:00:00,0.483496,2.089049,5.237937,4.409456,3.504676,5.500000,4.439714,97.351059,1.0,...,26.422962,89.448341,26.487326,85.900085,18.849329,82.150589,18.739643,67.178696,29.354256,28.454256
5,2014-01-06 05:00:00,0.791114,2.773177,5.155802,3.937262,3.191528,5.322743,3.988906,94.135468,1.0,...,26.427771,91.056114,26.487364,86.582474,19.597170,83.988579,19.496712,72.425293,29.354538,28.454538
6,2014-01-06 06:00:00,1.186589,3.536068,4.983953,3.018011,2.859591,5.066728,2.977463,95.258003,1.0,...,26.519985,92.594353,26.575815,88.020546,26.028486,64.670486,25.922703,76.275040,29.449951,28.549952
7,2014-01-06 07:00:00,1.420449,3.872926,4.747458,3.581882,2.359944,5.152646,2.953742,96.947456,1.0,...,26.671642,94.473099,26.723457,90.422462,26.209970,64.746620,26.104692,76.703529,29.574265,28.674263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12933,2016-12-24 20:00:00,2.650000,2.370000,3.850000,3.040000,3.820000,4.940000,2.190000,120.080000,1.0,...,26.740000,73.700000,26.700000,85.620000,19.900000,81.560000,19.790000,71.530000,29.290000,28.390000
12934,2016-12-24 21:00:00,2.240000,2.560000,3.420000,2.920000,3.690000,5.020000,1.970000,119.120000,1.0,...,26.910000,71.720000,26.880000,86.670000,19.700000,82.830000,19.600000,71.470000,29.430000,28.530000
12935,2016-12-24 22:00:00,1.910000,2.760000,2.950000,2.490000,2.700000,5.140000,1.870000,120.710000,1.0,...,29.810000,71.110000,29.810000,61.980000,26.100000,84.330000,26.100000,55.890000,32.070000,31.170000
12936,2016-12-24 23:00:00,1.520000,2.520000,3.330000,2.030000,1.690000,5.100000,1.390000,120.020000,1.0,...,27.420000,70.030000,27.380000,84.140000,18.450000,81.670000,18.340000,66.040000,29.880000,28.980000


In [150]:
from sklearn.manifold import TSNE
import plotly.express as px

reduction = min(3,round(batadal.shape[1]/10))

# Séparer les caractéristiques (sauf ATT_FLAG) et la colonne cible
X = batadal.drop(columns=columns_to_exclude)
X0 = X[X['ATT_FLAG'] == 0].copy()
y = batadal['ATT_FLAG']
y = y[y == 0].copy()

# Normaliser les données avant d'utiliser t-SNE
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


KeyboardInterrupt: 

In [114]:
# t-SNE to reduce to 3 dimensions
tnse = TSNE(n_components=reduction, random_state=42)
X_tsne = tnse.fit_transform(X_scaled)
X_tsne

# Convertir le résultat en DataFrame pour faciliter le traçage
tsne_df = pd.DataFrame(X_tsne, columns=['Dim1', 'Dim2', 'Dim3'])
tsne_df['ATT_FLAG'] = y.values

# Tracer les points en 3D, avec couleurs selon ATT_FLAG
fig = px.scatter_3d(tsne_df, x='Dim1', y='Dim2', z='Dim3', color='ATT_FLAG', 
                    color_discrete_map={0: 'blue', 1: 'red'}, 
                    labels={'ATT_FLAG': 'Attack Flag'}, 
                    title="3D t-SNE Projection of Batadal Data")
fig.update_traces(marker=dict(size=3))  # Ajuster la taille des points
fig.update_layout(scene=dict(
                    xaxis_title='Dimension 1',
                    yaxis_title='Dimension 2',
                    zaxis_title='Dimension 3'))

# Afficher le graphique interactif
fig.show()

# Tracer les points, avec les couleurs rouge pour ATT_FLAG = 1 et bleu pour ATT_FLAG = 0
#plt.figure(figsize=(10, 6))
#plt.scatter(tsne_df[tsne_df['ATT_FLAG'] == 0]['Dim1'], tsne_df[tsne_df['ATT_FLAG'] == 0]['Dim2'], c='blue', label='Normal')
#plt.scatter(tsne_df[tsne_df['ATT_FLAG'] == 1]['Dim1'], tsne_df[tsne_df['ATT_FLAG'] == 1]['Dim2'], c='red', label='Attack')
#plt.xlabel('Dimension 1')
#plt.ylabel('Dimension 2')
#plt.title('t-SNE Projection of Batadal Data')
#plt.legend()
#plt.show()

In [149]:
from sklearn.metrics import f1_score

# Appliquer l'Isolation Forest
iso_forest = IsolationForest(contamination=0.0098, random_state=42)  # Ajuster contamination selon le taux d'anomalie attendu
y_pred = iso_forest.fit_predict(X_scaled)

# Convertir les résultats en DataFrame pour une meilleure lisibilité
results_df = pd.DataFrame({
    'Is_Outlier': y_pred,  # -1 = anomalie, 1 = normal
    'ATT_FLAG': y.values
})

# Calcul du F1-Score basé sur ATT_FLAG comme vérité terrain
y_pred_binary = (results_df['Is_Outlier'] == -1).astype(int)
f1 = f1_score(results_df['ATT_FLAG'], y_pred_binary)

# Calcul des métriques pour ATT_FLAG = 1 et ATT_FLAG = 0
outliers_att_1 = results_df[(results_df['Is_Outlier'] == -1) & (results_df['ATT_FLAG'] == 1)].shape[0]
inliers_att_0 = results_df[(results_df['Is_Outlier'] == 1) & (results_df['ATT_FLAG'] == 0)].shape[0]
inliers_att_1 = results_df[(results_df['Is_Outlier'] == 1) & (results_df['ATT_FLAG'] == 1)].shape[0]
outliers_att_0 = results_df[(results_df['Is_Outlier'] == -1) & (results_df['ATT_FLAG'] == 0)].shape[0]

# Affichage des résultats finaux
print(f"Nombre de points avec ATT_FLAG = 1 qui sont hors de la distribution: {outliers_att_1}")
print(f"Nombre de points avec ATT_FLAG = 1 qui sont dans la distribution: {inliers_att_1}")
print(f"Nombre de points avec ATT_FLAG = 0 qui sont dans la distribution: {inliers_att_0}")
print(f"Nombre de points avec ATT_FLAG = 0 qui sont hors de la distribution: {outliers_att_0}")
print(f"F1-Score: {f1:.4f}")

Nombre de points avec ATT_FLAG = 1 qui sont hors de la distribution: 66
Nombre de points avec ATT_FLAG = 1 qui sont dans la distribution: 161
Nombre de points avec ATT_FLAG = 0 qui sont dans la distribution: 12647
Nombre de points avec ATT_FLAG = 0 qui sont hors de la distribution: 61
F1-Score: 0.3729
