# BoTNeTIoT-L01 Review
____


## Importing modules

In [None]:
# importing standard
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

## Loading dataset

In [None]:
# Read the raw CSV produced by the Step 1 feature-engineering stage.
file_path = "../Step1-Datasets-Feature-Engineering/team11_BotNeTIoT-L01_label_NoDuplicates.csv"
df = pd.read_csv(file_path)

# A Git LFS pointer file parses as a single column whose header contains the
# LFS spec URL; fail early with instructions instead of continuing on junk.
if len(df.columns) == 1 and "github.com" in df.columns[0]:
    raise ValueError(
        "You have downloaded an LFS pointer, and not the actual file.\n"
        "Please download team11_BotNeTIoT-L01_label_NoDuplicates.csv directly from the git repo.\n"
        "Place this download into the 'Step1-Datasets-Feature-Engineering' folder and try again."
    )

# Drop the saved-index column if the CSV has one; errors="ignore" keeps the
# cell from raising KeyError when the file was written without it.
df = df.drop(columns="Unnamed: 0", errors="ignore")

print("Data shape:", df.shape)
df.head()

# Data Cleaning & Handling Imbalance

## Check missing values and drop if necessary

In [None]:
# Report missing values, de-duplicate, round, then balance the two classes
# by downsampling the attack rows to the number of benign rows.

print(f"Total null values: {df.isnull().sum().sum()}\n")

# Drop duplicates if any
df = df.drop_duplicates()
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns=['Unnamed: 0'])
# Round numeric columns to 5 decimals (DataFrame.round leaves non-numeric columns untouched).
df = df.round(5)

df_attack = df[df['label'] == 0]   # attack
df_benign = df[df['label'] == 1]   # benign

# Sample without replacement so no attack row is repeated; the fixed seed
# makes the selected subset reproducible.
df_attack_downsampled = resample(
    df_attack,
    replace=False,
    n_samples=len(df_benign),
    random_state=42
)

df_balanced = pd.concat([df_attack_downsampled, df_benign])
# drop=True discards the old row labels. Without it they are inserted as an
# 'index' column, which downstream cells (that drop only 'label') would treat
# as a feature.
df_balanced = df_balanced.reset_index(drop=True)
print(f"Total value count:\n\n{df_balanced['label'].value_counts()}")


## Normalization

In [None]:
# Separate the target, then min-max scale every remaining column.
# NOTE(review): the scaler is fit on the full balanced set, i.e. before the
# train/test split performed later in the notebook.
y = df_balanced['label']
scaler = MinMaxScaler()
X = scaler.fit_transform(df_balanced.drop(columns='label'))


## Feature Creation

In [None]:
# Per-device class profile (only when a 'device' column exists):
# counts of attack (label 0) and benign (label 1) rows, plus the benign share.
if 'device' in df_balanced.columns:
    device_profiles = df_balanced.groupby(['device', 'label']).size().unstack(fill_value=0)
    # Labels are numeric (0 = attack, 1 = benign), so the unstacked columns are
    # 0 and 1 -- there is no 'Benign' column. reindex guarantees both exist.
    device_profiles = device_profiles.reindex(columns=[0, 1], fill_value=0)
    # Despite its name, this is benign / (benign + attack): the benign fraction
    # of all rows for the device. The column name is kept unchanged.
    device_profiles['benign_attack_ratio'] = device_profiles[1] / device_profiles[[0, 1]].sum(axis=1)
    print(device_profiles.head())

## Dimensionality Reduction PCA

In [None]:
# Wrap the scaled features in a DataFrame and give X and y matching 0..n-1 indices.
X_orig = pd.DataFrame(X)
X = X_orig.reset_index(drop=True)
y = pd.Series(y).reset_index(drop=True)

# Join features and label side by side, then discard every row containing a NaN
# so features and labels stay row-aligned.
df_xy = pd.concat([X, y.rename("label")], axis=1).dropna()

# Recover the cleaned feature matrix and label vector.
y_clean = df_xy['label']
X_clean = df_xy.drop(columns=['label'])

# PCA pipeline: mean-impute, standardize, project onto two components.
pca_steps = [
    ("imputer", SimpleImputer(strategy="mean")),  # handles any remaining NaN
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=2)),
]
pca_pipeline = Pipeline(pca_steps)

# Fit the pipeline and project the cleaned features in one call.
X_pca = pca_pipeline.fit_transform(X_clean)

# Scatter the two principal components, coloured by class label.
fig, ax = plt.subplots(figsize=(8,6))
sns.scatterplot(x=X_pca[:,0], y=X_pca[:,1], hue=y_clean, palette="Set1", alpha=0.6, ax=ax)
ax.set_title("PCA Scatterplot (with NaN handling and aligned labels)")
plt.show()


## Correlation Heatmap

In [None]:
# Correlation heatmap of the scaled features.
# X was built from df_balanced with only 'label' removed, so the remaining
# column names (in order) are the feature names for X's columns.
feature_names = df_balanced.columns.drop('label')
corr_matrix = pd.DataFrame(np.asarray(X), columns=feature_names).corr()

plt.figure(figsize=(50,50))
# Passing a labelled DataFrame lets seaborn place one tick per cell, centred on
# the cell; xticklabels/yticklabels=True forces every feature name to be shown.
sns.heatmap(corr_matrix, cmap='coolwarm', center=0, xticklabels=True, yticklabels=True)
plt.title("Correlation Heatmap")
plt.show()


## Summary Before vs. After Features

In [None]:
# Report (rows, columns) for the cleaned frame and for the scaled feature matrix.
print(f"Original Feature Shape: {df.shape}")
print(f"Engineered Feature Shape: {X.shape}")

### Before

In [None]:
# Summary statistics before scaling. At this point `df` has already been
# de-duplicated and rounded to 5 decimals, but not balanced or scaled.
# Left as the cell's last expression so the notebook renders it as a table.
df.describe()

### After

In [None]:
# Summary statistics of the min-max scaled feature matrix X.
# Left as the cell's last expression so the notebook renders it as a table.
pd.DataFrame(X).describe()

After feature engineering and normalization, the dataset contains 1,026,994 samples with 28 features.

The features were scaled with `MinMaxScaler`, so each feature is mapped onto the range [0, 1] (min-max normalization, not z-score standardization); means and standard deviations are therefore not expected to be 0 and 1.

Minimum and maximum values are 0 and 1 for every non-constant feature, so outliers still exist but are bounded within the normalized scale.

The quartiles (25%, 50%, 75%) are centered around 0, showing the data is well-distributed after scaling.

Compared to the raw dataset, the engineered dataset ensures fair comparability across all traffic features, which is crucial for PCA and classification.


# Export Train/Test Split files

In [None]:
# Split into train/test sets (80/20).
# random_state fixes the shuffle so the split is reproducible across runs (same
# seed as the resampling step); stratify=y keeps the class proportions of y the
# same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)