In [1]:
# Step1_feature_selection_and_cleaning.ipynb

import pandas as pd
import os

# ---------------------------
# Paths
# ---------------------------
# The project root defaults to the original author's checkout but can be
# overridden through the CYBERTHREATS_PROJECT_ROOT environment variable, so the
# notebook can run on another machine without editing this cell.
_DEFAULT_PROJECT_ROOT = r"C:\Users\uthay\Desktop\CyberThreats_FinancialLoss_Prediction_ML"
# `or` (rather than a .get() default) so an empty variable also falls back.
PROJECT_ROOT = os.environ.get("CYBERTHREATS_PROJECT_ROOT") or _DEFAULT_PROJECT_ROOT

RAW_DATA_PATH = os.path.join(PROJECT_ROOT, "data", "raw", "Global_Cybersecurity_Threats_2015-2024 (1).csv")
INTERIM_PATH = os.path.join(PROJECT_ROOT, "data", "interim")
# Create the output folder up front; exist_ok makes re-running the cell safe.
os.makedirs(INTERIM_PATH, exist_ok=True)

# ---------------------------
# Read the raw CSV and report its size
# ---------------------------
print("Loading dataset...")
df = pd.read_csv(RAW_DATA_PATH)
print("Dataset loaded successfully!")

# DataFrame.shape is (rows, columns); unpack it once instead of indexing twice.
row_count, column_count = df.shape
print("Total rows:", row_count)
print("Total columns:", column_count)

# ---------------------------
# Keep only the columns used for modelling
# ---------------------------
# Input features first; the prediction target is appended as the last column.
predictor_columns = [
    'Attack Type',
    'Target Industry',
    'Number of Affected Users',
    'Attack Source',
    'Security Vulnerability Type',
    'Incident Resolution Time (in Hours)',
]
selected_features = predictor_columns + ['Financial Loss (in Million $)']

# Narrow the frame to the chosen columns, in the order listed above.
df = df[selected_features]

# Heading and preview emitted on separate lines by a single print call.
print("\nFirst 5 rows (selected features):", df.head(), sep="\n")

# ---------------------------
# Column roles: numeric inputs, categorical inputs, and the target
# ---------------------------
target = 'Financial Loss (in Million $)'
categorical_cols = [
    'Attack Type',
    'Target Industry',
    'Attack Source',
    'Security Vulnerability Type',
]
numeric_cols = [
    'Number of Affected Users',
    'Incident Resolution Time (in Hours)',
]

# One labelled line per group, in the order numeric -> categorical -> target.
for _label, _columns in (
    ("\nNumeric columns:", numeric_cols),
    ("Categorical columns:", categorical_cols),
    ("Target variable:", target),
):
    print(_label, _columns)

# ---------------------------
# Basic cleaning
# ---------------------------
# Remove exact duplicate rows first so repeated records do not skew the
# median / mode used for filling below.
df = df.drop_duplicates()

# A missing target cannot be filled without fabricating labels, so rows that
# lack it are dropped instead of being written to the cleaned file as NaN.
rows_before = len(df)
df = df.dropna(subset=[target])
if len(df) < rows_before:
    print(f"\nDropped {rows_before - len(df)} rows with missing target values")

# Numeric features: fill gaps with each column's median.
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Categorical features: fill gaps with the most frequent value. mode() returns
# an empty Series when a column has no non-null values, so check before
# indexing; such a column is left as-is and shows up in the report below.
for col in categorical_cols:
    modes = df[col].mode()
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

print("\nMissing values after cleaning:")
print(df.isnull().sum())

# ---------------------------
# Write the cleaned frame to the interim folder
# ---------------------------
CLEANED_FILE = os.path.join(INTERIM_PATH, "selected_features_cleaned.csv")

# index=False keeps the row index out of the CSV, so only the selected
# columns are written.
df.to_csv(path_or_buf=CLEANED_FILE, index=False)

print(f"\nCleaned dataset saved at: {CLEANED_FILE}")

Loading dataset...
Dataset loaded successfully!
Total rows: 3000
Total columns: 10

First 5 rows (selected features):
         Attack Type     Target Industry  Number of Affected Users  \
0           Phishing           Education                    773169   
1         Ransomware              Retail                    295961   
2  Man-in-the-Middle                  IT                    605895   
3         Ransomware  Telecommunications                    659320   
4  Man-in-the-Middle                  IT                    810682   

  Attack Source Security Vulnerability Type  \
0  Hacker Group          Unpatched Software   
1  Hacker Group          Unpatched Software   
2  Hacker Group              Weak Passwords   
3  Nation-state          Social Engineering   
4       Insider          Social Engineering   

   Incident Resolution Time (in Hours)  Financial Loss (in Million $)  
0                                   63                          80.53  
1                                 