In [9]:
import pandas as pd 
import numpy as np 
import glob
import os 
import seaborn as sns 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,accuracy_score
from sklearn.model_selection import KFold,StratifiedKFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, f1_score

In [10]:
pd.set_option('display.max_columns', 50)

In [11]:

csv_files = glob.glob(os.path.join("*.csv"))
dataframes = []

# Loop through the list of CSV files and read each one
for file in csv_files:
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file)
    # Append the DataFrame to the list
    dataframes.append(df)

# Optionally, concatenate all DataFrames into one large DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df.columns = combined_df.columns.str.strip()

In [12]:
columns_to_use = ["Total Fwd Packets",
                    "Total Backward Packets",
                    "Total Length of Fwd Packets",
                    "Total Length of Bwd Packets",
                    "Fwd Packet Length Min",
                    "Fwd Packet Length Std",
                    "Bwd Packet Length Min",
                    "Bwd Packet Length Std",
                    "Flow Duration",
                    "Flow IAT Mean",
                    "Flow IAT Std",
                    "Flow IAT Max",
                    "Flow IAT Min",
                    "Fwd IAT Min",
                    "Fwd IAT Mean",
                    "Fwd IAT Std",
                    "Fwd IAT Max",
                    "Bwd IAT Mean",
                    "Bwd IAT Std",
                    "Bwd IAT Max",
                    "Bwd IAT Min",
                    "SYN Flag Count",
                    "ACK Flag Count",
                    "FIN Flag Count",
                    "RST Flag Count",
                    "Flow Bytes/s",
                    "Flow Packets/s",
                    "Fwd Packets/s",
                    "Bwd Packets/s",
                    "Subflow Fwd Packets",
                    "Subflow Bwd Packets",
                    "Subflow Fwd Bytes",
                    "Subflow Bwd Bytes",
                    "Init_Win_bytes_forward",
                    "Init_Win_bytes_backward",
                    "Label"]

In [13]:
df = combined_df[columns_to_use]

In [14]:
df['Label'].value_counts()

Label
BENIGN                        2273097
DoS Hulk                       231073
PortScan                       158930
DDoS                           128027
DoS GoldenEye                   10293
FTP-Patator                      7938
SSH-Patator                      5897
DoS slowloris                    5796
DoS Slowhttptest                 5499
Bot                              1966
Web Attack � Brute Force         1507
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: count, dtype: int64

In [15]:
df.dropna(inplace=True)

# Replace unwanted characters in the 'Label' column
df['Label'] = df['Label'].str.replace('�', '-')

# Replace infinite values with NaN and drop any rows with NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

# Clip only numeric columns
numeric_columns = df.select_dtypes(include=[np.number]).columns
df[numeric_columns] = df[numeric_columns].clip(lower=-1e+308, upper=1e+308)

# Display initial class distribution
print("Class distribution before resampling:")
print(df["Label"].value_counts())

# Prepare features and labels
X = df.drop(columns="Label")
y = df["Label"]



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Label'] = df['Label'].str.replace('�', '-')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.replace([np.inf, -np.inf], np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ver

Class distribution before resampling:
Label
BENIGN                        2271320
DoS Hulk                       230124
PortScan                       158804
DDoS                           128025
DoS GoldenEye                   10293
FTP-Patator                      7935
SSH-Patator                      5897
DoS slowloris                    5796
DoS Slowhttptest                 5499
Bot                              1956
Web Attack - Brute Force         1507
Web Attack - XSS                  652
Infiltration                       36
Web Attack - Sql Injection         21
Heartbleed                         11
Name: count, dtype: int64


In [18]:


# Przygotowanie danych (df oraz columns_to_use powinny być wcześniej zdefiniowane)
df = df[columns_to_use]

# Rozdzielenie danych na cechy i etykiety
X = df.drop(columns="Label")
y = df["Label"]

# 1. Podział danych na zbiór treningowy i testowy
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Skalowanie danych
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# 2. Under-sampling dla najliczniejszych klas
target_count = 7000
under_sampling_dict = {
    'BENIGN': target_count,
    'DoS Hulk': target_count,
    'PortScan': target_count,
    'DDoS': target_count,
    'DoS GoldenEye': target_count
}

under_sampler = RandomUnderSampler(sampling_strategy=under_sampling_dict, random_state=42)
X_train_under, y_train_under = under_sampler.fit_resample(X_train_scaled, y_train)
X_train_under = pd.DataFrame(X_train_under, columns=X_train.columns)

# 4. Walidacja krzyżowa
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold_metrics = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_under, y_train_under), 1):
    # Podział na fold treningowy i walidacyjny
    X_train_fold = X_train_under.iloc[train_idx]
    X_val_fold = X_train_under.iloc[val_idx]
    y_train_fold = y_train_under.iloc[train_idx]
    y_val_fold = y_train_under.iloc[val_idx]
    
    # 3. SMOTETEEN dla niedostatecznie reprezentowanych klas
    smoteenn = SMOTEENN(
        sampling_strategy={
            'FTP-Patator': target_count,
            'SSH-Patator': target_count,
            'DoS slowloris': target_count,
            'DoS Slowhttptest': target_count,
            'Bot': target_count,
            'Web Attack - Brute Force': target_count,
            'Web Attack - XSS': target_count,
            'Infiltration': target_count,
            'Web Attack - Sql Injection': target_count,
            'Heartbleed': target_count
        },
        random_state=42
    )
    
    X_train_resampled, y_train_resampled = smoteenn.fit_resample(X_train_fold, y_train_fold)
    X_train_resampled = pd.DataFrame(X_train_resampled, columns=X_train.columns)

    # Trening modelu
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train_resampled, y_train_resampled)
    
    # Predykcja i obliczenie metryk
    y_pred_fold = rf.predict(X_val_fold)
    cm = confusion_matrix(y_val_fold, y_pred_fold)
    print(f"Macierz pomyłek dla fold {fold}:\n", cm)
    
    # Obliczenie metryk
    metrics = {
        'fold': fold,
        'balanced_acc': balanced_accuracy_score(y_val_fold, y_pred_fold),
        'precision': precision_score(y_val_fold, y_pred_fold, average='weighted', zero_division=0),
        'recall': recall_score(y_val_fold, y_pred_fold, average='weighted', zero_division=0),
        'f1': f1_score(y_val_fold, y_pred_fold, average='weighted', zero_division=0)
    }
    fold_metrics.append(metrics)
    
    print(f"\nFold {fold}:")
    print(f" Balanced Accuracy: {metrics['balanced_acc']:.4f}")
    print(f" Precision: {metrics['precision']:.4f}")
    print(f" Recall: {metrics['recall']:.4f}")
    print(f" F1 Score: {metrics['f1']:.4f}\n")


df_metrics = pd.DataFrame(fold_metrics)
print("Średnie wyniki walidacji krzyżowej:")
print(df_metrics.mean())




Macierz pomyłek dla fold 1:
 [[679   8   1   2   2   0   0   0   1   0   2   3   0   2]
 [  1 135   0   0   0   0   0   0   0   0   0   0   0   0]
 [  1   0 699   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 700   0   0   0   0   0   0   0   0   0   0]
 [  4   0   0   0 696   0   0   0   0   0   0   0   0   0]
 [  3   0   0   1   0 375   3   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0 413   0   0   0   0   0   0   0]
 [  1   0   0   0   0   0   0 557   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   3   0   0   0   0   0]
 [  0   0   0   0   1   0   0   0   0 699   0   0   0   0]
 [  2   0   0   0   0   0   0   0   0   0 404   0   0   0]
 [  1   0   0   0   0   0   0   0   0   0   0  62   0  39]
 [  0   0   0   0   0   0   0   0   0   0   0   1   1   0]
 [  3   0   0   0   0   0   0   0   0   0   0  17   0  25]]

Fold 1:
 Balanced Accuracy: 0.8995
 Precision: 0.9837
 Recall: 0.9822
 F1 Score: 0.9826

Macierz pomyłek dla fold 2:
 [[677   6   0   3   5   1