In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Build the feature matrix X and label vector y from the enhanced match CSV:
# split off the target, drop ID/meta columns, one-hot encode categorical
# columns, replace inf/NaN with 0, and write X and y to disk as CSV files.

# ---------- PATHS ----------
enhanced_path = '../Retrieval/Data_Combined/dota_pro_combined_enhanced.csv'
X_path        = '../Retrieval/Data_Combined/dota_X_final.csv'
X_scaled_path = '../Retrieval/Data_Combined/dota_X_final_scaled.csv'  # defined here, not written by this cell
y_path        = '../Retrieval/Data_Combined/dota_y_final.csv'

# ---------- LOAD ENHANCED DATA ----------
df = pd.read_csv(enhanced_path)
print("Loaded enhanced dataset:", df.shape)

# ---------- TARGET ----------
target_col = 'radiant_win'
# Cast the label to 0/1 integers; astype(int) raises if the column holds NaN,
# so rows with a missing label are never silently written out.
y = df[target_col].astype(int)

# ---------- DROP NON-FEATURE COLUMNS ----------
drop_cols = [
    target_col,
    'match_id',
    'match_seq_num',
    'start_time',
    'start_time_utc',
]

# Only drop the columns that are actually present, so a missing meta column
# does not raise a KeyError.
drop_cols = [c for c in drop_cols if c in df.columns]
X_raw = df.drop(columns=drop_cols)
print("After dropping ID/meta cols:", X_raw.shape)

# ---------- HANDLE CATEGORICALS ----------
# Treat every text-like dtype as categorical, not just `object`. This is the
# same dtype set pd.get_dummies encodes by default; selecting `object` alone
# lets `category` and pandas string-dtype columns reach X un-encoded.
cat_dtypes = ['object', 'string', 'category']
cat_cols = X_raw.select_dtypes(include=cat_dtypes).columns.tolist()
print("Categorical columns:", cat_cols)

# drop_first=True drops one dummy level per encoded column.
X = pd.get_dummies(X_raw, columns=cat_cols, drop_first=True)
print("After one-hot encoding:", X.shape)

# ---------- HANDLE MISSING VALUES ----------
# For this project, 0 is a reasonable default for most engineered stats (no event / no value recorded)
# Infinite values are mapped to NaN first so the single fillna(0) covers them too.
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(0)

print("Any NaNs left?", X.isnull().sum().sum())

# ---------- SAVE RAW FEATURES & LABELS ----------
# index=False in both files: rows of X and y stay aligned by position only.
X.to_csv(X_path, index=False)
y.to_csv(y_path, index=False)

print(f"Saved X to {X_path}")
print(f"Saved y to {y_path}")


Loaded enhanced dataset: (15000, 108)
After dropping ID/meta cols: (15000, 104)
Categorical columns: []
After one-hot encoding: (15000, 104)
Any NaNs left? 0
Saved X to ../Retrieval/Data_Combined/dota_X_final.csv
Saved y to ../Retrieval/Data_Combined/dota_y_final.csv
