# BoTNeTIoT-L01 Review
____


## Importing modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.utils import resample
from sklearn.decomposition import PCA
from scipy.stats import skew
from scipy.fft import fft
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

## Loading dataset

In [4]:
# Path to the dataset CSV, relative to the notebook's working directory.
file_path = "./team_11_BotNeTIoT-L01_label_NoDuplicates.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Report the (rows, columns) size, then show the first five rows as the cell output.
print("Data shape:", df.shape)
df.head(n=5)

Data shape: (2426574, 25)


Unnamed: 0.1,Unnamed: 0,MI_dir_L0.1_weight,MI_dir_L0.1_mean,MI_dir_L0.1_variance,H_L0.1_weight,H_L0.1_mean,H_L0.1_variance,HH_L0.1_weight,HH_L0.1_mean,HH_L0.1_std,...,HH_jit_L0.1_mean,HH_jit_L0.1_variance,HpHp_L0.1_weight,HpHp_L0.1_mean,HpHp_L0.1_std,HpHp_L0.1_magnitude,HpHp_L0.1_radius,HpHp_L0.1_covariance,HpHp_L0.1_pcc,label
0,0,1.0,98.0,0.0,1.0,98.0,0.0,1.0,98.0,0.0,...,1505914000.0,0.0,1.0,98.0,0.0,98.0,0.0,0.0,0.0,0
1,1,1.93164,98.0,1.818989e-12,1.93164,98.0,1.818989e-12,1.93164,98.0,1.348699e-06,...,726310200.0,5.662344e+17,1.93164,98.0,1e-06,138.592929,1.818989e-12,0.0,0.0,0
2,2,2.904273,86.98175,231.1822,2.904273,86.98175,231.1822,1.0,66.0,0.0,...,1505914000.0,0.0,1.0,66.0,0.0,114.856432,0.0,0.0,0.0,0
3,3,3.902546,83.655268,204.0614,3.902546,83.655268,204.0614,1.0,74.0,0.0,...,1505914000.0,0.0,1.0,74.0,0.0,74.0,0.0,0.0,0.0,0
4,4,4.902545,81.685828,177.5746,4.902545,81.685828,177.5746,2.0,74.0,9.536743e-07,...,752957100.0,5.669445e+17,1.0,74.0,0.0,74.0,0.0,0.0,0.0,0


# Data Cleaning & Handling Imbalance

## Check missing values and drop if necessary

In [None]:
# 1. Data Cleaning & Handling Imbalance

# Report how many cells are missing across the whole frame (nothing is dropped here).
print(f"Total null values: {df.isnull().sum().sum()}\n")

# Remove every index-like 'Unnamed: ...' column left behind by earlier CSV exports
# (the preview shows both 'Unnamed: 0.1' and 'Unnamed: 0', which look like row
# counters). They are removed *before* de-duplication: while a per-row counter is
# present no two rows can compare equal, so drop_duplicates() would do nothing.
index_like_cols = [col for col in df.columns if str(col).startswith('Unnamed')]
df = df.drop(columns=index_like_cols)

# Drop duplicates if any, then round every numeric column to 5 decimals.
df = df.drop_duplicates()
df = df.round(5)

# Split by class using the encoding this notebook assumes: 0 = attack, 1 = benign.
df_attack = df[df['label'] == 0]   # attack
df_benign = df[df['label'] == 1]   # benign

# Randomly down-sample the attack rows (without replacement, fixed seed) to the
# number of benign rows. Requires len(df_attack) >= len(df_benign).
df_attack_downsampled = resample(
    df_attack,
    replace=False,
    n_samples=len(df_benign),
    random_state=42
)

# drop=True discards the old row labels. A plain reset_index() would insert them
# as a new leading 'index' column, which every later `iloc[:, :-1]` and
# `drop(columns=['label'])` would then treat as a feature.
df_balanced = pd.concat([df_attack_downsampled, df_benign])
df_balanced = df_balanced.reset_index(drop=True)
print(f"Total value count:\n\n{df_balanced['label'].value_counts()}")


## Statistical Features

In [None]:
# Create rolling window statistics
# Number of consecutive rows in each rolling window; read by the Mean, Variance
# and Skewness cells that follow.
window_size = 50


### Mean

In [None]:
# Rolling mean over `window_size` rows for every column except the last one
# (the label), rounded to 5 decimals. The first window_size - 1 rows are NaN.
(
    df_balanced
    .iloc[:, :-1]
    .rolling(window=window_size)
    .mean()
    .round(5)
)

### Variance

In [None]:
# Rolling variance over `window_size` rows for every column except the last one
# (the label), rounded to 5 decimals. The first window_size - 1 rows are NaN.
(
    df_balanced
    .iloc[:, :-1]
    .rolling(window=window_size)
    .var()
    .round(5)
)

### Skewness


In [None]:
# Rolling skewness over `window_size` rows for every column except the last one
# (the label), rounded to 5 decimals. The first window_size - 1 rows are NaN.
(
    df_balanced
    .iloc[:, :-1]
    .rolling(window=window_size)
    .skew()
    .round(5)
)

## Time-Series Features

In [None]:
# For each row: take the 10-row rolling mean of every column except the last
# one (the label), then average those rolling means across columns.
sliding_avg = (
    df_balanced
    .iloc[:, :-1]
    .rolling(window=10)
    .mean()
    .mean(axis=1)
)

# Display the series; reindex() is called without arguments, so no new index is requested.
sliding_avg.reindex()

In [None]:
# Line plot of `sliding_avg`, with a title and axis labels so the figure can be
# read on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.plot(sliding_avg)
ax.set_title("Sliding-window average across features")
ax.set_xlabel("Row index of df_balanced")
ax.set_ylabel("Mean of rolling feature means")
plt.show()

In [None]:
# Frequency domain: FFT magnitude of one traffic feature, used as an example.
# The column is selected by name rather than by position, so an index-like
# leading column (for example one inserted by reset_index()) can never be picked
# up as "the first feature".
fft_feature = 'MI_dir_L0.1_weight'
n_fft_samples = 512   # number of leading rows fed to the FFT
n_bins_shown = 50     # number of leading FFT bins drawn

sample_signal = df_balanced[fft_feature].values[:n_fft_samples]
fft_features = np.abs(fft(sample_signal))   # magnitude of each complex FFT bin

plt.plot(fft_features[:n_bins_shown])
plt.title("Frequency Domain Features")
plt.xlabel("FFT bin")
plt.ylabel("Magnitude")
plt.show()


## Normalization

In [None]:
# Target vector: the class label column.
y = df_balanced['label']

# Feature matrix: every other column, rescaled per column by MinMaxScaler
# (default feature range, i.e. [0, 1]). `scaler` keeps the fitted parameters.
scaler = MinMaxScaler()
X = scaler.fit_transform(df_balanced.drop(columns='label'))


## Feature Creation

In [None]:
# Share of benign records per device (only if a 'device' column exists).
# The label column is numeric in this notebook (0 = attack, 1 = benign), so the
# benign counts live under column 1 of the unstacked table; looking up a
# 'Benign' column would raise KeyError.
if 'device' in df_balanced.columns:
    device_profiles = df_balanced.groupby(['device', 'label']).size().unstack(fill_value=0)
    # Row totals are taken before the ratio column is added, so they count records only.
    records_per_device = device_profiles.sum(axis=1)
    # A balanced sample could in principle contain no benign rows at all.
    benign_per_device = device_profiles[1] if 1 in device_profiles.columns else 0
    # Column name kept for compatibility; the value is benign / (benign + attack).
    device_profiles['benign_attack_ratio'] = benign_per_device / records_per_device
    print(device_profiles.head())

## Dimensionality Reduction PCA

In [None]:
# Wrap the scaled features in a DataFrame that keeps the feature names.
# np.asarray() accepts either the raw scaler output or an already wrapped frame,
# so the cell can be re-run safely. Without explicit columns the features would
# be labelled 0..n-1, and `X_orig.columns` / `describe()` would be unreadable.
feature_names = df_balanced.columns.drop('label')
X_orig = pd.DataFrame(np.asarray(X), columns=feature_names)

# Reset indices to avoid mismatch when features and labels are joined by index.
X = X_orig.reset_index(drop=True)
y = pd.Series(y).reset_index(drop=True)

# Combine into one DataFrame so rows are dropped from features and label together.
df_xy = pd.concat([X, y.rename("label")], axis=1)

# Drop rows with NaN in any column (features or label).
df_xy = df_xy.dropna()

# Split back into X and y
X_clean = df_xy.drop(columns=['label'])
y_clean = df_xy['label']


# Build pipeline for PCA: mean-impute, standardise, project to 2 components.
pca_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),  # safety net; rows with NaN were already dropped above
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=2))
])

# Fit + transform
X_pca = pca_pipeline.fit_transform(X_clean)

# Plot the two principal components, coloured by class label.
plt.figure(figsize=(8,6))
sns.scatterplot(x=X_pca[:,0], y=X_pca[:,1], hue=y_clean, palette="Set1", alpha=0.6)
plt.title("PCA Scatterplot (with NaN handling and aligned labels)")
plt.xlabel("Principal component 1")
plt.ylabel("Principal component 2")
plt.show()


## Correlation Heatmap

In [None]:
# Show the column labels of X_orig, the DataFrame built from the scaled features in the PCA cell.
X_orig.columns

In [None]:
# Correlation matrix of the scaled features, labelled with the feature names.
# The names exclude 'label' because X does not contain it: labelling the axes
# with every df_balanced column would give one tick more than there are cells.
feature_names = df_balanced.columns.drop('label')
corr_matrix = pd.DataFrame(np.asarray(X), columns=feature_names).corr()

fig, ax = plt.subplots(figsize=(50, 50))
# xticklabels/yticklabels=True makes seaborn draw every column name, centred on
# its cell, instead of ticks placed by hand on the cell edges.
sns.heatmap(
    corr_matrix,
    cmap='coolwarm',
    center=0,
    xticklabels=True,
    yticklabels=True,
    ax=ax,
)
ax.set_title("Correlation Heatmap")
plt.show()


## Summary Before vs. After Features

In [None]:
# Compare the (rows, columns) size of `df` with that of the scaled feature matrix `X`.
print(f"Original Feature Shape: {df.shape}")
print(f"Engineered Feature Shape: {X.shape}")

### Before

In [None]:
# Summary statistics of `df`, shown through the notebook's rich table display
# instead of print(), which truncates and wraps wide frames as plain text.
df.describe()

### After

In [None]:
# Summary statistics of the scaled feature matrix `X`, shown through the
# notebook's rich table display instead of print().
pd.DataFrame(X).describe()

After feature engineering and normalization, the dataset contains 1,026,994 samples with 28 features.

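The mean of each feature is close to 0, and the standard deviation is approximately 1, confirming that z-score normalization was applied successfully. (Note: in this notebook, z-score scaling with `StandardScaler` is applied only inside the PCA pipeline; the `X` summarized in the "After" cell comes from `MinMaxScaler`, which rescales each feature to the [0, 1] range, so these statistics should be verified against the current output.)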

Minimum and maximum values typically range between -8 and +8, indicating that outliers exist but are bounded within a normalized scale.

The quartiles (25%, 50%, 75%) are centered around 0, showing the data is well-distributed after scaling.

Compared to the raw dataset, the engineered dataset ensures fair comparability across all traffic features, which is crucial for PCA and classification.
