In [1]:
# Step2_preprocessing_pipeline.ipynb

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
import os

# ---------------------------
# Paths
# ---------------------------
# Every location is derived from a single project root so the folder name is
# written exactly once. The root defaults to the original absolute location;
# set the CYBERTHREATS_PROJECT_ROOT environment variable to run the notebook
# from a different checkout without editing this cell.
PROJECT_ROOT = os.environ.get(
    "CYBERTHREATS_PROJECT_ROOT",
    r"C:\Users\uthay\Desktop\CyberThreats_FinancialLoss_Prediction_ML",
)

# Input: the cleaned feature table read by the loading step below.
RAW_DATA_PATH = os.path.join(PROJECT_ROOT, "data", "interim", "selected_features_cleaned.csv")
# Output folder for the processed train/test arrays.
PROCESSED_PATH = os.path.join(PROJECT_ROOT, "data", "processed")
# Output folder and file for the fitted preprocessing pipeline. The file path is
# built from the folder so the directory created below is always the one the
# pipeline is written into.
PIPELINE_DIR = os.path.join(PROJECT_ROOT, "outputs", "utils")
PIPELINE_PATH = os.path.join(PIPELINE_DIR, "preprocessing_pipeline.joblib")

# Create the output folders up front; exist_ok=True keeps re-runs harmless.
os.makedirs(PROCESSED_PATH, exist_ok=True)
os.makedirs(PIPELINE_DIR, exist_ok=True)

# ---------------------------
# Load dataset (cleaned output of Step 1)
# ---------------------------
df = pd.read_csv(RAW_DATA_PATH)

# ---------------------------
# Selected features
# ---------------------------
target = 'Financial Loss (in Million $)'

numeric_features = [
    'Number of Affected Users',
    'Incident Resolution Time (in Hours)',
]

categorical_features = [
    'Attack Type',
    'Target Industry',
    'Attack Source',
    'Security Vulnerability Type',
    # Interaction column created just below; listed last.
    'AttackType_TargetIndustry',
]

# Interaction feature: the two source columns joined with an underscore,
# added as a new last column of the frame.
df = df.assign(
    AttackType_TargetIndustry=df['Attack Type'] + "_" + df['Target Industry']
)

# ---------------------------
# Preprocessing pipelines
# ---------------------------
# Numeric columns: a one-step pipeline that standardises the values.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical columns: dense one-hot encoding; handle_unknown='ignore' makes the
# encoder skip categories it did not see while fitting instead of raising.
categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Route each column group to its transformer; 'num' output comes before 'cat'.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# ---------------------------
# Train/test split
# ---------------------------
# Model inputs are the numeric columns followed by the categorical ones;
# the target column is kept separately.
feature_columns = numeric_features + categorical_features
X = df[feature_columns]
y = df[target]

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

# ---------------------------
# Fit preprocessing pipeline
# ---------------------------
# Fit on the training split only, then apply the fitted transformer to both
# splits so the test rows never influence the fitted parameters.
X_train_processed = preprocessor.fit(X_train).transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# ---------------------------
# Save preprocessor
# ---------------------------
joblib.dump(preprocessor, PIPELINE_PATH)
print(f"Preprocessing pipeline saved at: {PIPELINE_PATH}")

# ---------------------------
# Save processed train/test datasets
# ---------------------------
# One .npy file per array, written into PROCESSED_PATH in the order listed.
for file_name, array in (
    ("X_train.npy", X_train_processed),
    ("X_test.npy", X_test_processed),
    ("y_train.npy", y_train),
    ("y_test.npy", y_test),
):
    np.save(os.path.join(PROCESSED_PATH, file_name), array)

print("Step 2 preprocessing completed successfully!")

Train shape: (2400, 7)
Test shape: (600, 7)
Preprocessing pipeline saved at: C:\Users\uthay\Desktop\CyberThreats_FinancialLoss_Prediction_ML\outputs\utils\preprocessing_pipeline.joblib
Step 2 preprocessing completed successfully!
