In [6]:
import pandas as pd
import numpy as np

# ------------------- CONFIG ----------------------------

INPUT_PATH  = '../Retrieval/Data_Combined/dota_pro_combined.csv'
OUTPUT_PATH = '../Retrieval/Data_Combined/dota_pro_combined_enhanced.csv'

Q_SHORT          = 0.25
Q_LONG           = 0.75
EARLY_STOMP_TIME = 1800
LATE_GAME_TIME   = 2700
TOTAL_TOWERS     = 22

REQUIRED_COLS = [
    "kills_radiant", "kills_dire",
    "deaths_radiant", "deaths_dire",
    "assists_radiant", "assists_dire",
    "gold_per_min_radiant", "gold_per_min_dire",
    "xp_per_min_radiant", "xp_per_min_dire",
    "hero_damage_radiant", "hero_damage_dire",
    "towers_killed_radiant", "towers_killed_dire",
    "gold_spent_radiant", "gold_spent_dire",
    "last_hits_radiant", "last_hits_dire",
    "denies_radiant", "denies_dire",
    "duration", "final_gold_advantage",
]

# ---------------- LOAD & VALIDATE -------------------------

df = pd.read_csv(INPUT_PATH)
original_cols = df.columns.tolist()

missing = [c for c in REQUIRED_COLS if c not in df.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")

print("Base dataset loaded:", df.shape)

print("-"*80)
print("CREATING ADDITIONAL ENGINEERED FEATURES")
print("-"*80)

# 1. RATIO FEATURES (more stable than raw differences)
print("\n1. Creating ratio features...")

# K/D ratios (with safety check for division by zero)
df['kd_ratio_radiant'] = df['kills_radiant'] / (df['deaths_radiant'] + 1)
df['kd_ratio_dire'] = df['kills_dire'] / (df['deaths_dire'] + 1)
df['kd_advantage'] = df['kd_ratio_radiant'] - df['kd_ratio_dire']

# GPM ratios
df['gpm_ratio'] = df['gold_per_min_radiant'] / (df['gold_per_min_dire'] + 1)
df['xpm_ratio'] = df['xp_per_min_radiant'] / (df['xp_per_min_dire'] + 1)

# Damage efficiency
df['damage_per_gold_radiant'] = df['hero_damage_radiant'] / (df['gold_spent_radiant'] + 1)
df['damage_per_gold_dire'] = df['hero_damage_dire'] / (df['gold_spent_dire'] + 1)

# Tower control
df['tower_ratio'] = df['towers_killed_radiant'] / (df['towers_killed_dire'] + 1)

print(f"Created {7} ratio features")

# 2. INTERACTION FEATURES
print("\n2. Creating interaction features...")

# Kill participation (kills + assists)
df['kill_participation_radiant'] = df['kills_radiant'] + df['assists_radiant']
df['kill_participation_dire'] = df['kills_dire'] + df['assists_dire']

# Combat effectiveness (kills per death)
df['combat_effectiveness_radiant'] = (df['kills_radiant'] + df['assists_radiant']) / (df['deaths_radiant'] + 1)
df['combat_effectiveness_dire'] = (df['kills_dire'] + df['assists_dire']) / (df['deaths_dire'] + 1)

# Economy advantage (normalized by duration)
df['gold_per_minute_match'] = (df['gold_spent_radiant'] + df['gold_spent_dire']) / df['duration'] * 60
df['gold_advantage_per_min'] = df['final_gold_advantage'] / (df['duration'] / 60)

# Objective control score
df['objective_score_radiant'] = (df['towers_killed_radiant'] * 3 +
                                 df['roshans_killed_radiant'] * 5 +
                                 df.get('obs_placed_radiant', 0) * 0.5)
df['objective_score_dire'] = (df['towers_killed_dire'] * 3 +
                              df['roshans_killed_dire'] * 5 +
                              df.get('obs_placed_dire', 0) * 0.5)

print(f"Created {8} interaction features")

# 3. TEMPORAL FEATURES
print("\n3. Creating temporal features...")

# Match pace indicators
df['kills_per_minute'] = (df['kills_radiant'] + df['kills_dire']) / (df['duration'] / 60)
df['is_long_game'] = (df['duration'] > df['duration'].quantile(0.75)).astype(int)
df['is_short_game'] = (df['duration'] < df['duration'].quantile(0.25)).astype(int)

# Game phase analysis
df['likely_early_stomp'] = ((df['duration'] < 1800) &
                            (df['kills_radiant'] + df['kills_dire'] < 30)).astype(int)
df['likely_late_game'] = (df['duration'] > 2700).astype(int)

print(f"Created {5} temporal features")

# 4. DOMINANCE INDICATORS
print("\n4. Creating dominance indicators...")

df['gold_dominance'] = df['final_gold_advantage'] / (df['gold_spent_radiant'] +
                                                     df['gold_spent_dire'] + 1)

total_kills = df['kills_radiant'] + df['kills_dire']
df['kill_dominance'] = (df['kills_radiant'] - df['kills_dire']) / (total_kills + 1)

if 'level_radiant' in df.columns and 'level_dire' in df.columns:
    df['level_advantage'] = df['level_radiant'] - df['level_dire']
else:
    df['level_advantage'] = 0

df['damage_share_radiant'] = df['hero_damage_radiant'] / (
    df['hero_damage_radiant'] + df['hero_damage_dire'] + 1
)

df['tower_dominance'] = (df['towers_killed_radiant'] - df['towers_killed_dire']) / 22

print(f"Created {5} dominance indicators")

# 5. EFFICIENCY METRICS
print("\n5. Creating efficiency metrics...")

df['lh_efficiency_radiant'] = df['last_hits_radiant'] / (df['duration'] / 60)
df['lh_efficiency_dire'] = df['last_hits_dire'] / (df['duration'] / 60)

df['gold_per_kill_radiant'] = df['gold_spent_radiant'] / (df['kills_radiant'] + 1)
df['gold_per_kill_dire'] = df['gold_spent_dire'] / (df['kills_dire'] + 1)

if 'obs_placed_radiant' in df.columns:
    df['ward_efficiency_radiant'] = df['obs_placed_radiant'] / (df['duration'] / 60)
    df['ward_efficiency_dire'] = df['obs_placed_dire'] / (df['duration'] / 60)

print(f"Created {6} efficiency metrics")

# 6. CATEGORICAL ENCODINGS
print("\n6. Encoding categorical features...")

if 'region' in df.columns:
    df['region_encoded'] = df['region'].astype('category').cat.codes

if 'patch' in df.columns:
    df['patch_encoded'] = df['patch'].astype('category').cat.codes

if 'game_mode' in df.columns:
    df['game_mode_encoded'] = df['game_mode'].astype('category').cat.codes

print(f"Encoded {3} categorical features")

# 7. TEAM DIFFERENCE FEATURES
print("\n7. Creating team difference features...")

difference_features = {
    'kill_diff': ('kills_radiant', 'kills_dire'),
    'death_diff': ('deaths_radiant', 'deaths_dire'),
    'assist_diff': ('assists_radiant', 'assists_dire'),
    'last_hit_diff': ('last_hits_radiant', 'last_hits_dire'),
    'deny_diff': ('denies_radiant', 'denies_dire'),
    'damage_diff': ('hero_damage_radiant', 'hero_damage_dire'),
    'healing_diff': ('hero_healing_radiant', 'hero_healing_dire'),
    'tower_damage_diff': ('tower_damage_radiant', 'tower_damage_dire'),
}

for feat_name, (rad_col, dire_col) in difference_features.items():
    if rad_col in df.columns and dire_col in df.columns:
        df[feat_name] = df[rad_col] - df[dire_col]

print(f"Created {len(difference_features)} difference features")

# ---------- SUMMARY ----------
print("\n\n" + "-"*80)
print("FEATURE ENGINEERING SUMMARY")
print("-"*80)
print(f"Original features: 65")
print(f"New features created: {len(df.columns) - 65}")
print(f"Total features: {len(df.columns)}")

print("\nFeature Categories:")
print(f"  - Ratio features: 7")
print(f"  - Interaction features: 8")
print(f"  - Temporal features: 5")
print(f"  - Dominance indicators: 5")
print(f"  - Efficiency metrics: 6")
print(f"  - Encoded categoricals: 3")
print(f"  - Team differences: {len(difference_features)}")

# Data Quality Check
print("\nData Quality Check:")
numeric_df = df.select_dtypes(include=[np.number])
inf_mask = np.isinf(numeric_df).any()
inf_cols = numeric_df.columns[inf_mask].tolist()

if inf_cols:
    print(f"Warning: Infinite values in {len(inf_cols)} columns: {inf_cols[:5]}")
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    print("   Replaced infinite values with NaN")
else:
    print("No infinite values detected")

nan_summary = df.isnull().sum().sum()
print(f"Missing values: {nan_summary} total")

#-------------MISSING VALUE CLEANUP---------------

print("\nMissing value breakdown (top 20 columns):")
nan_counts = df.isnull().sum()
nan_counts = nan_counts[nan_counts > 0].sort_values(ascending=False)
print(nan_counts.head(20))

print("\nApplying targeted missing-value handling...")

event_like_cols = [c for c in df.columns
                   if any(key in c.lower() for key in ["event_", "ward", "rune", "roshan", "tower_killed"])]

for col in event_like_cols:
    df[col] = df[col].fillna(0)

for col in ["gold_spent_radiant", "gold_spent_dire",
            "gold_per_min_radiant", "gold_per_min_dire",
            "xp_per_min_radiant", "xp_per_min_dire"]:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].median())

df.replace([np.inf, -np.inf], np.nan, inplace=True)

print("Final missing values:", df.isnull().sum().sum())


# ---------- SAVE ENHANCED DATASET ----------
print("\nSaving enhanced dataset...")
df.to_csv(OUTPUT_PATH, index=False)
print(f"Saved to '{OUTPUT_PATH}'")

# ---------- CORRELATIONS FOR NEW FEATURES ----------
print("\nTop 15 NEW features correlated with radiant_win:")

new_features = [c for c in df.columns
                if c not in original_cols and c != 'radiant_win']

if new_features:
    new_corr = (df[new_features]
                .corrwith(df['radiant_win'].astype(int))
                .abs()
                .sort_values(ascending=False))
    for i, (feat, corr) in enumerate(new_corr.head(15).items(), 1):
        print(f"{i:2d}. {feat:40s}: {corr:.4f}")

print("\n" + "-"*80)
print("FEATURE ENGINEERING COMPLETE")
print("\nSaving full feature correlation ranking...")

numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if 'radiant_win' in numeric_cols:
    numeric_cols.remove('radiant_win')

corr_series = (df[numeric_cols]
               .corrwith(df['radiant_win'].astype(int))
               .sort_values(key=lambda s: s.abs(), ascending=False))

corr_df = corr_series.to_frame(name='corr_with_radiant_win')
corr_path = '../Retrieval/Data_Combined/feature_correlations_all.csv'
corr_df.to_csv(corr_path)

print(f"Saved correlations to {corr_path}")
print("Top 10 features:")
print(corr_df.head(10))

print("-"*80)
print("\nRecommendations for modeling:")
print("1. Use 'dota_pro_combined_enhanced.csv' for baseline models")
print("2. Consider feature selection to reduce dimensionality")
print("3. Ratio/dominance features often perform well in competitive games")
print("4. Gold/kill advantages are traditionally strong predictors")
print("5. Ready for train/test split and model training!")


Base dataset loaded: (15000, 65)
--------------------------------------------------------------------------------
CREATING ADDITIONAL ENGINEERED FEATURES
--------------------------------------------------------------------------------

1. Creating ratio features...
Created 7 ratio features

2. Creating interaction features...
Created 8 interaction features

3. Creating temporal features...
Created 5 temporal features

4. Creating dominance indicators...
Created 5 dominance indicators

5. Creating efficiency metrics...
Created 6 efficiency metrics

6. Encoding categorical features...
Encoded 3 categorical features

7. Creating team difference features...
Created 8 difference features


--------------------------------------------------------------------------------
FEATURE ENGINEERING SUMMARY
--------------------------------------------------------------------------------
Original features: 65
New features created: 43
Total features: 108

Feature Categories:
  - Ratio features: 7
  - In