# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Preprocessing cell: load the raw sample CSV, report its size and missing
# values, keep the ID / DATA_0..DATA_7 features plus the three label columns,
# impute missing values, integer-encode the labels, standardize the numeric
# features, and write the cleaned frame to the processed-data directory.

# Load dataset
df = pd.read_csv("../data/raw/CICIoV2024_sample.csv")

# Inspect shape
print("Dataset shape:", df.shape)

# Missing values check
print("Total missing values:", df.isna().sum().sum())

# Feature selection
numeric_cols = [
    'ID',
    'DATA_0', 'DATA_1', 'DATA_2', 'DATA_3',
    'DATA_4', 'DATA_5', 'DATA_6', 'DATA_7',
]
label_cols = ['label', 'category', 'specific_class']
selected_features = numeric_cols + label_cols
# Select before imputing so only the columns that are kept get processed, and
# take an explicit copy so the column assignments below modify an independent
# frame rather than a slice of the raw one (avoids SettingWithCopyWarning).
df = df[selected_features].copy()

# Handle missing values: median for numeric columns, most frequent value
# otherwise. Assign the result back instead of calling fillna(inplace=True)
# on df[col]; the chained in-place form does not update df under pandas
# copy-on-write.
for col in df.columns:
    if not df[col].isna().any():
        continue
    if pd.api.types.is_numeric_dtype(df[col]):
        fill_value = df[col].median()
    else:
        modes = df[col].mode()
        if modes.empty:
            # Column is entirely missing: there is no mode to fill with.
            continue
        fill_value = modes.iloc[0]
    df[col] = df[col].fillna(fill_value)

# Encode labels
# The binary label is matched case-insensitively and with surrounding
# whitespace ignored. Values outside the mapping raise instead of silently
# becoming NaN in the saved file.
# NOTE(review): only 'benign' and 'malicious' are recognized; if the CSV uses
# other spellings (e.g. 'ATTACK'), extend label_map - confirm against the data.
label_map = {'benign': 0, 'malicious': 1}
raw_labels = df['label']
encoded_labels = raw_labels.astype(str).str.strip().str.lower().map(label_map)
unmapped = raw_labels[encoded_labels.isna()].unique()
if len(unmapped) > 0:
    raise ValueError(
        f"Unrecognized values in 'label' column: {list(unmapped)!r}; "
        f"expected one of {sorted(label_map)!r}"
    )
df['label'] = encoded_labels.astype('int64')

# Multi-class columns become integer category codes (missing values map to -1).
for col in ('category', 'specific_class'):
    df[col] = df[col].astype('category').cat.codes

# Scaling
# NOTE(review): the scaler is fit on the whole dataset. If a train/test split
# happens downstream, fit it on the training portion only to avoid leaking
# test-set statistics - confirm how this file is consumed.
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Save processed data
df.to_csv("../data/processed/CICIoV2024_cleaned.csv", index=False)

df.head()
