# **Load Data**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt
import requests, zipfile, io


# Download link for the dataset (zip archive of labelled-flow CSV files).
FILE_DOWNLOAD_URL = "http://cicresearch.ca/CICDataset/CIC-IDS-2017/Dataset/CIC-IDS-2017/CSVs/GeneratedLabelledFlows.zip"

# Number of rows read from each CSV; kept small just for testing.
# Set to None to read every row of every file.
ROWS_PER_FILE = 50000

# (connect, read) timeouts in seconds so a stalled server cannot hang the
# notebook forever; the read timeout applies between received chunks, not to
# the download as a whole.
response = requests.get(FILE_DOWNLOAD_URL, timeout=(10, 120))
# Fail here with a clear HTTP error rather than later with a confusing
# BadZipFile when the body is an error page instead of the archive.
response.raise_for_status()
dfs = []

# Unzip the downloaded archive in memory.
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    for file in z.namelist():
        # Only CSV members hold data; this skips directory entries (and any
        # other non-CSV member) without relying on their position in the
        # archive listing.
        if not file.lower().endswith('.csv'):
            continue
        print(f"Loading {file}...")
        with z.open(file) as f:
            dfs.append(pd.read_csv(f, encoding='cp1252', nrows=ROWS_PER_FILE))

# Combine all per-file frames into a single DataFrame with a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)
print("Data loaded:", df.shape)


Loading TrafficLabelling /Wednesday-workingHours.pcap_ISCX.csv...
Loading TrafficLabelling /Tuesday-WorkingHours.pcap_ISCX.csv...
Loading TrafficLabelling /Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv...
Loading TrafficLabelling /Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv...
Loading TrafficLabelling /Monday-WorkingHours.pcap_ISCX.csv...
Loading TrafficLabelling /Friday-WorkingHours-Morning.pcap_ISCX.csv...
Loading TrafficLabelling /Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv...
Loading TrafficLabelling /Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv...
Data loaded: (400000, 85)


# **Data Cleaning**

In [None]:
# Column names arrive with stray surrounding spaces; normalise them so the
# lookups below ('Label', 'Flow ID', ...) match.
df.columns = df.columns.str.strip()

# Rows without a label cannot be used for supervised training. Drop them
# BEFORE the blanket fill below: filling 'Label' with 0 would otherwise create
# a spurious integer class 0 mixed in with the string class names.
df.dropna(subset=['Label'], inplace=True)
df.reset_index(drop=True, inplace=True)

# Remaining gaps are in non-label columns; treat them as 0.
df.fillna(0, inplace=True)

# Features: everything except the target and the identifier/timestamp columns.
X = df.drop(columns=['Label', 'Flow ID', 'Source IP', 'Destination IP', 'Timestamp'])
y = df['Label']

# Infinite values are mapped to NaN and then to 0, same as other missing values.
X = X.replace([np.inf, -np.inf], np.nan).fillna(0)


In [None]:
# Train/Test Split
# 20% of the rows are held out for evaluation. The split is stratified on the
# labels and seeded so that re-running the cell yields the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:
# Handle Class Imbalance
# Resample the training partition only (X_test / y_test are not touched here);
# the sampler is seeded so the resampled set is repeatable.
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X=X_train, y=y_train)


In [None]:
# Feature Scaling
# Learn the scaling statistics from the resampled training features only,
# then apply that same fitted scaler to both the training and the test features.
scaler = StandardScaler().fit(X_res)
X_res_scaled = scaler.transform(X_res)
X_test_scaled = scaler.transform(X_test)


# **Hyperparameter Tuning**

In [None]:
# Candidate values for GaussianNB's var_smoothing, searched exhaustively.
smoothing_candidates = [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
param_grid = {'var_smoothing': smoothing_candidates}

# 5-fold cross-validated grid search, ranking candidates by weighted F1.
# NOTE(review): the folds are cut from data that was already oversampled and
# scaled as a whole, so copies of the same row can presumably end up in both
# the fitting and validation folds -- CV scores may be optimistic; confirm.
gnb = GaussianNB()
grid = GridSearchCV(
    estimator=gnb,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
)
grid.fit(X_res_scaled, y_res)

# Keep the winning value for the final model.
best_var_smoothing = grid.best_params_['var_smoothing']
print("Best var_smoothing:", best_var_smoothing)

Best var_smoothing: 1e-09


In [None]:
# Train Naive Bayes Model
# Final classifier: GaussianNB with the var_smoothing chosen by the grid
# search, fitted on the resampled + scaled training data.
nb = GaussianNB(var_smoothing=best_var_smoothing)
nb.fit(X=X_res_scaled, y=y_res)
