In [1]:
# Step2_preprocessing_binary_fixed.ipynb

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import joblib
import os

# ---------------------------
# Base directory (configurable)
# ---------------------------
# The project root was previously fixed to a single user's Desktop folder, which
# made the notebook runnable on that machine only. Set the environment variable
# CYBERTHREATS_BASE_DIR before starting the kernel to point at a different
# checkout. When the variable is unset or empty, the original location below is
# used, so existing setups behave exactly as before.
DEFAULT_BASE_DIR = r"C:\Users\uthay\Desktop\CyberThreats_FinancialLoss_Prediction_ML_Final_Project"
BASE_DIR = os.environ.get("CYBERTHREATS_BASE_DIR") or DEFAULT_BASE_DIR

# ---------------------------
# Paths
# ---------------------------
# Inputs: the cleaned train/test CSVs under data/interim.
TRAIN_PATH = os.path.join(BASE_DIR, "data", "interim", "train_features_cleaned_binary.csv")
TEST_PATH  = os.path.join(BASE_DIR, "data", "interim", "test_features_cleaned_binary.csv")
# Outputs: the processed-data folder and the serialized preprocessing pipeline.
PROCESSED_PATH = os.path.join(BASE_DIR, "data", "processed")
PIPELINE_PATH  = os.path.join(BASE_DIR, "outputs", "utils", "preprocessing_pipeline_binary.joblib")

# Create both output folders up front; exist_ok=True makes re-running harmless.
os.makedirs(PROCESSED_PATH, exist_ok=True)
os.makedirs(os.path.dirname(PIPELINE_PATH), exist_ok=True)

# ---------------------------
# Load Step 1 datasets
# ---------------------------
# Read the two cleaned CSV files (train first, then test).
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# Stack both frames into one table; ignore_index=True discards the per-file
# row labels and assigns a fresh 0..n-1 index to the combined frame.
df = pd.concat((train_df, test_df), ignore_index=True)
print("Combined dataset shape:", df.shape)

# ---------------------------
# Feature lists
# ---------------------------
# Name of the label column.
target = 'loss_class_binary'

# Columns treated as numeric inputs.
numeric_features = [
    'Number of Affected Users',
    'Incident Resolution Time (in Hours)',
]

# Columns treated as categorical inputs (the interaction column is added below).
categorical_features = [
    'Attack Type',
    'Target Industry',
    'Attack Source',
    'Security Vulnerability Type',
]

# ---------------------------
# Derived interaction feature
# ---------------------------
# Concatenate attack type and target industry with an underscore so that each
# (attack, industry) pairing becomes its own categorical level.
interaction_feature = 'AttackType_TargetIndustry'
df[interaction_feature] = df['Attack Type'] + "_" + df['Target Industry']
categorical_features = categorical_features + [interaction_feature]

# ---------------------------
# Train/Test split (stratified)
# ---------------------------
# Model inputs: numeric columns first, then the categorical ones.
feature_columns = [*numeric_features, *categorical_features]
X, y = df[feature_columns], df[target]

# Hold out 20% of the rows, stratified on the label; the fixed random_state
# makes the partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Report the class proportions of each part so the stratification can be checked.
train_class_share = y_train.value_counts(normalize=True)
test_class_share = y_test.value_counts(normalize=True)
print("\nTrain class distribution:\n", train_class_share)
print("\nTest class distribution:\n", test_class_share)

# ---------------------------
# Preprocessing pipelines
# ---------------------------
# Numeric branch: median imputation followed by standard scaling.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: one-hot encoding returned as a dense array; with
# handle_unknown='ignore', categories absent at fit time do not raise at
# transform time.
categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Route each column group to its branch.
column_branches = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

# ---------------------------
# Fit on TRAIN only
# ---------------------------
# Only the training rows are passed to fit(); the test rows are only transformed.
preprocessor.fit(X_train)

# Apply the fitted preprocessor to each part (train first, then test).
X_train_processed, X_test_processed = (
    preprocessor.transform(split) for split in (X_train, X_test)
)

# Feature names
feature_names = preprocessor.get_feature_names_out()
train_shape = X_train_processed.shape
test_shape = X_test_processed.shape
n_features = len(feature_names)
print("\nProcessed train shape:", train_shape)
print("Processed test shape:", test_shape)
print("Final feature count:", n_features)

# ---------------------------
# Save preprocessor
# ---------------------------
# Serialize the fitted preprocessor to PIPELINE_PATH.
joblib.dump(preprocessor, PIPELINE_PATH)
print(f"\nPreprocessing pipeline saved at:\n{PIPELINE_PATH}")

# ---------------------------
# Save processed datasets
# ---------------------------
# File name -> array, written in this order as .npy files.
arrays_to_save = {
    "X_train_binary.npy": X_train_processed,
    "X_test_binary.npy": X_test_processed,
    "y_train_binary.npy": y_train.values,
    "y_test_binary.npy": y_test.values,
}
for npy_name, array in arrays_to_save.items():
    np.save(os.path.join(PROCESSED_PATH, npy_name), array)

# Save as DataFrames
# The feature matrices are also written as CSV, with the transformed feature
# names as the header row and no index column.
matrices_to_save = {
    "X_train_binary.csv": X_train_processed,
    "X_test_binary.csv": X_test_processed,
}
for csv_name, matrix in matrices_to_save.items():
    csv_target = os.path.join(PROCESSED_PATH, csv_name)
    pd.DataFrame(matrix, columns=feature_names).to_csv(csv_target, index=False)

print("\n✅ Step 2 (Binary) preprocessing completed successfully!")
print("Processed datasets saved at:", PROCESSED_PATH)


Combined dataset shape: (3000, 8)

Train class distribution:
 loss_class_binary
0    0.647083
1    0.352917
Name: proportion, dtype: float64

Test class distribution:
 loss_class_binary
0    0.646667
1    0.353333
Name: proportion, dtype: float64

Processed train shape: (2400, 65)
Processed test shape: (600, 65)
Final feature count: 65

Preprocessing pipeline saved at:
C:\Users\uthay\Desktop\CyberThreats_FinancialLoss_Prediction_ML_Final_Project\outputs\utils\preprocessing_pipeline_binary.joblib

✅ Step 2 (Binary) preprocessing completed successfully!
Processed datasets saved at: C:\Users\uthay\Desktop\CyberThreats_FinancialLoss_Prediction_ML_Final_Project\data\processed
