In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import lightgbm as lgb
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [2]:
# Define file paths
file_paths = [
    r"/content/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
    r"/content/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    r"/content/Friday-WorkingHours-Morning.pcap_ISCX.csv",
    r"/content/Monday-WorkingHours.pcap_ISCX.csv",
    r"/content/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
    r"/content/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    r"/content/Tuesday-WorkingHours.pcap_ISCX.csv",
    r"/content/Wednesday-workingHours.pcap_ISCX.csv"
]

In [3]:
# Define attack type mapping based on file names
attack_type_mapping = {
    "DDos": "DDos",
    "PortScan": "PortScan",
    "Infilteration": "Infilteration",
    "WebAttacks": "WebAttacks",
    "Morning": "Benign",
    "Afternoon": "Benign",
    "Monday": "Benign",
    "Tuesday": "Benign",
    "Wednesday": "Benign",
    "Thursday": "Benign",
    "Friday": "Benign"
}

# Load and combine data
data_frames = []
for file_path in file_paths:
    df = pd.read_csv(file_path)
    # Assign attack type based on file name
    for key, value in attack_type_mapping.items():
        if key in file_path:
            df['attack_type'] = value
            break
    data_frames.append(df)

combined_df = pd.concat(data_frames, ignore_index=True)

In [4]:
# Define severity mapping
severity_mapping = {
    "Benign": 0,
    "PortScan": 1,
    "Probe": 1,
    "DDos": 2,
    "Infilteration": 3,
    "WebAttacks": 3
}

# Map attack_type to severity
combined_df['severity'] = combined_df['attack_type'].map(severity_mapping)

In [5]:
# Replace infinite values with NaN
combined_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Fill missing values only for numeric columns for better performance
numeric_cols = combined_df.select_dtypes(include=[np.number]).columns
combined_df[numeric_cols] = combined_df[numeric_cols].fillna(combined_df[numeric_cols].mean())

In [6]:
# Separate features and target
X = combined_df.drop(columns=['attack_type', 'severity'])
y = combined_df['severity']

In [7]:
# Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=[np.number]).columns

In [8]:
# Preprocess the data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [9]:
# Create a pipeline with LightGBM classifier
lgb_model = lgb.LGBMClassifier(n_estimators=100, random_state=42, n_jobs=-1)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', lgb_model)
])

In [10]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Train the model
pipeline.fit(X_train, y_train)



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.641407 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14627
[LightGBM] [Info] Number of data points in the train set: 2264594, number of used features: 84
[LightGBM] [Info] Start training from score -0.420720
[LightGBM] [Info] Start training from score -2.290791
[LightGBM] [Info] Start training from score -2.527339
[LightGBM] [Info] Start training from score -1.817886


In [12]:
# Evaluate the model
y_pred = pipeline.predict(X_val)
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))



              precision    recall  f1-score   support

           0       0.77      1.00      0.87    372690
           1       0.99      0.57      0.73     57321
           2       0.99      0.74      0.85     44869
           3       0.97      0.19      0.31     91269

    accuracy                           0.80    566149
   macro avg       0.93      0.62      0.69    566149
weighted avg       0.84      0.80      0.76    566149

[[372093     89     96    412]
 [ 24298  32892     39     92]
 [ 11476     70  33230     93]
 [ 73816    286    267  16900]]


In [13]:
# Save the pipeline and numerical scaler
joblib.dump(pipeline, 'model_cicids_lightgbm.pkl')
joblib.dump(preprocessor.named_transformers_['num'], 'scaler_cicids.pkl')

['scaler_cicids.pkl']