In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# ---------- PATHS ----------
# Every input and output of this cell lives in the same directory. Keeping the
# directory in one constant means relocating the data is a one-line change
# instead of seven.
DATA_DIR = '../Retrieval/Data_Combined'

# Input: the enhanced dataset that is loaded below.
enhanced_path     = f'{DATA_DIR}/dota_pro_combined_enhanced.csv'

# Outputs: unscaled feature matrices, their standardized variants, and the label.
X_full_path       = f'{DATA_DIR}/dota_X_final_full.csv'
X_noleak_path     = f'{DATA_DIR}/dota_X_final_noleak.csv'
X_scaled_path     = f'{DATA_DIR}/dota_X_final_noleak_scaled.csv'
y_path            = f'{DATA_DIR}/dota_y_final.csv'
X_mid_path        = f'{DATA_DIR}/dota_X_midgame.csv'
X_mid_scaled_path = f'{DATA_DIR}/dota_X_midgame_scaled.csv'
# ---------- MID-GAME FILTER CONFIG ----------
# Substring blacklist for the mid-game feature set: any column whose name
# contains one of these keys is left out of that set.
MIDGAME_EXCLUDE_KEYS = [
    # damage to objectives is considered heavily weighted toward the end game
    "tower_damage",
    # cumulative hero damage is considered strongly end-game
    "hero_damage",
    # Roshan is treated as a late-game objective
    "roshan",
    # barracks status usually only changes late in a match
    "barracks",
]

# ---------- LOAD ENHANCED DATA ----------
# Read the enhanced CSV into memory and report its (rows, columns) shape.
df = pd.read_csv(enhanced_path)
print("Loaded enhanced dataset:", df.shape)


# ---------- TARGET ----------
# The label column is pulled out as its own Series and cast to integers.
target_col = 'radiant_win'
y = df.loc[:, target_col].astype(int)

# ---------- DROP NON-FEATURE COLUMNS ----------
# The label plus identifier / timestamp columns are not model inputs.
# Candidates that are absent from this dataset are skipped, so the drop
# below never fails on a missing column.
drop_cols = [
    col
    for col in (
        target_col,     # label
        'match_id',
        'match_seq_num',
        'start_time',
        'start_time_utc',
    )
    if col in df.columns
]

X_raw = df.drop(columns=drop_cols)
print("After dropping ID/meta cols:", X_raw.shape)

# ---------- HANDLE CATEGORICALS ----------
# Object-dtype columns are treated as categoricals and one-hot encoded.
# drop_first=True omits the first level of every encoded column.
cat_cols = list(X_raw.select_dtypes(include=['object']).columns)
print("Categorical columns:", cat_cols)

X = pd.get_dummies(
    X_raw,
    columns=cat_cols,
    drop_first=True,
)
print("After one-hot encoding:", X.shape)

# ---------- HANDLE MISSING VALUES ----------
# Infinite values are first turned into NaN so that a single fill covers both.
# The project's chosen default is 0, read as "no event / no value recorded"
# for the engineered stats.
X = X.replace([np.inf, -np.inf], np.nan).fillna(0)

# Sanity check: total count of missing cells remaining across the whole frame.
nan_count = X.isna().sum().sum()
print("Any NaNs left in X?", nan_count)

# ---------- LEAKY vs NO-LEAK FEATURE SETS ----------
# Columns this project labels as "leaky", i.e. as directly encoding the
# end-of-game outcome. They stay in the full set and are removed from the
# no-leak set.
known_leaky = (
    "final_gold_advantage",
    "final_xp_advantage",
    "towers_killed_radiant",
    "towers_killed_dire",
    "roshans_killed_radiant",
    "roshans_killed_dire",
    "tower_dominance",
    "gold_dominance",
    "kill_dominance",
    "damage_share_radiant",
)

# Restrict to names that are actually present in the encoded frame.
leaky_cols = [name for name in known_leaky if name in X.columns]
print(
    "Leaky columns used in full set but dropped in no-leak set:",
    leaky_cols,
    sep="\n",
)

X_noleak = X.drop(columns=leaky_cols)
X_full = X.copy()

print("X_full shape:   ", X_full.shape)
print("X_noleak shape: ", X_noleak.shape)

# ---------- SAVE RAW FEATURES & LABELS ----------
# Write the unscaled feature matrices and the label to CSV without the index
# column. All three files are written before any confirmation is printed.
for table, destination in (
    (X_full, X_full_path),
    (X_noleak, X_noleak_path),
    (y, y_path),
):
    table.to_csv(destination, index=False)

print(f"Saved X_full   to {X_full_path}")
print(f"Saved X_noleak to {X_noleak_path}")
print(f"Saved y        to {y_path}")

# ---------- BUILD MID-GAME FEATURE SET (MORE REAL-TIME) ----------
# Derived from the no-leak set: a column survives only when none of the
# MIDGAME_EXCLUDE_KEYS substrings occurs in its lower-cased name.
midgame_cols = [
    col
    for col in X_noleak.columns
    if all(key not in col.lower() for key in MIDGAME_EXCLUDE_KEYS)
]

X_mid = X_noleak.loc[:, midgame_cols]

print("X_mid shape (mid-game subset):", X_mid.shape)

# Persist the unscaled mid-game features (index column omitted).
X_mid.to_csv(X_mid_path, index=False)
print(f"Saved X_mid to {X_mid_path}")

# ---------- SCALED VERSIONS ----------
def _fit_scale_and_save(features, out_path, label):
    """Standardize *features*, write the result to CSV and return both pieces.

    A fresh StandardScaler is fitted on *features* and applied to the same
    rows. The scaled values are wrapped back into a DataFrame that keeps the
    original column names and row index, then written to *out_path* without
    the index column.

    Args:
        features: DataFrame of numeric feature columns to standardize.
        out_path: Destination CSV path for the scaled frame.
        label: Short feature-set name used in the confirmation message.

    Returns:
        Tuple ``(scaler, scaled_frame)`` with the fitted scaler and the
        standardized DataFrame.
    """
    scaler = StandardScaler()
    scaled = pd.DataFrame(
        scaler.fit_transform(features),
        columns=features.columns,
        # fit_transform returns a bare array; re-attach the source index so
        # the scaled rows stay aligned with `features` (and with y) even if
        # the frame is ever filtered before this point.
        index=features.index,
    )
    scaled.to_csv(out_path, index=False)
    print(f"Saved scaled {label} X to {out_path}")
    return scaler, scaled


# NOTE(review): each scaler is fitted on every row of its frame. If these CSVs
# are split into train/test later, the held-out rows have already influenced
# the mean/std used for scaling -- confirm downstream code refits on the
# training fold or that this is acceptable.
scaler_noleak, X_noleak_scaled = _fit_scale_and_save(
    X_noleak, X_scaled_path, "no-leak"
)

scaler_mid, X_mid_scaled = _fit_scale_and_save(
    X_mid, X_mid_scaled_path, "mid-game"
)


Loaded enhanced dataset: (15000, 108)
After dropping ID/meta cols: (15000, 104)
Categorical columns: []
After one-hot encoding: (15000, 104)
Any NaNs left in X? 0
Leaky columns used in full set but dropped in no-leak set:
['final_gold_advantage', 'final_xp_advantage', 'towers_killed_radiant', 'towers_killed_dire', 'roshans_killed_radiant', 'roshans_killed_dire', 'tower_dominance', 'gold_dominance', 'kill_dominance', 'damage_share_radiant']
X_full shape:    (15000, 104)
X_noleak shape:  (15000, 94)
Saved X_full   to ../Retrieval/Data_Combined/dota_X_final_full.csv
Saved X_noleak to ../Retrieval/Data_Combined/dota_X_final_noleak.csv
Saved y        to ../Retrieval/Data_Combined/dota_y_final.csv
X_mid shape (mid-game subset): (15000, 86)
Saved X_mid to ../Retrieval/Data_Combined/dota_X_midgame.csv
Saved scaled no-leak X to ../Retrieval/Data_Combined/dota_X_final_noleak_scaled.csv
Saved scaled mid-game X to ../Retrieval/Data_Combined/dota_X_midgame_scaled.csv
