In [9]:
import pandas as pd
import numpy as np

# ------------------- CONFIG ----------------------------

INPUT_PATH  = '../Retrieval/Data_Combined/combined_matches_players_all_1.csv'
OUTPUT_PATH = '../Retrieval/Data_Combined/combined_matches_players_all_1_enhanced.csv'

REQUIRED_COLS = [
    'account_id', 'gold_spent', 'hero_damage',
    'hero_healing', 'tower_damage', 'party_size',
    'camps_stacked', 'creeps_stacked', 'obs_placed',
    'sen_placed', 'purchase_tpscroll', 'rune_pickups',
    'roshans_killed', 'towers_killed', 'region', 'skill'
]

# ---------------- LOAD & VALIDATE -------------------------

df = pd.read_csv(INPUT_PATH)
original_cols = df.columns.tolist()

missing = [c for c in REQUIRED_COLS if c not in df.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")

print("Base dataset loaded:", df.shape)

print("-"*80)
print("CREATING ADDITIONAL ENGINEERED FEATURES")
print("-"*80)

# ---------------------------------------------------------
# Since only REQUIRED_COLS are allowed, all other columns 
# referenced in the old script are invalid and removed.
#
# Only create features that use REQUIRED_COLS.
# ---------------------------------------------------------

print("\n1. Creating basic derived features from REQUIRED_COLS...")

# Simple efficiency metrics based only on REQUIRED_COLS
df['damage_per_gold'] = df['hero_damage'] / (df['gold_spent'] + 1)
df['healing_per_gold'] = df['hero_healing'] / (df['gold_spent'] + 1)
df['tower_damage_share'] = df['tower_damage'] / (df['towers_killed'] + 1)

print("Created 3 allowed derived features")

# ---------- SUMMARY ----------
print("\n\n" + "-"*80)
print("FEATURE ENGINEERING SUMMARY")
print("-"*80)
print(f"Original features: {len(original_cols)}")
print(f"New features created: {len(df.columns) - len(original_cols)}")
print(f"Total features: {len(df.columns)}")

# Data Quality Check
print("\nData Quality Check:")
numeric_df = df.select_dtypes(include=[np.number])
inf_mask = np.isinf(numeric_df).any()
inf_cols = numeric_df.columns[inf_mask].tolist()

if inf_cols:
    print(f"Warning: Infinite values in {len(inf_cols)} columns: {inf_cols[:5]}")
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    print("   Replaced infinite values with NaN")
else:
    print("No infinite values detected")

nan_summary = df.isnull().sum().sum()
print(f"Missing values: {nan_summary} total")

print("\nMissing value breakdown (top 20 columns):")
nan_counts = df.isnull().sum()
nan_counts = nan_counts[nan_counts > 0].sort_values(ascending=False)
print(nan_counts.head(20))

print("\nApplying targeted missing-value handling...")

# Event-like columns from REQUIRED_COLS
event_like_cols = [c for c in ['obs_placed','sen_placed','rune_pickups','roshans_killed','towers_killed']]

for col in event_like_cols:
    if col in df.columns:
        df[col] = df[col].fillna(0)

# Gold-based median fill
if 'gold_spent' in df.columns:
    df['gold_spent'] = df['gold_spent'].fillna(df['gold_spent'].median())

df.replace([np.inf, -np.inf], np.nan, inplace=True)

print("Final missing values:", df.isnull().sum().sum())

# ---------- SAVE ENHANCED DATASET ----------
print("\nSaving enhanced dataset...")
df.to_csv(OUTPUT_PATH, index=False)
print(f"Saved to '{OUTPUT_PATH}'")

print("-"*80)
print("FEATURE ENGINEERING COMPLETE")


Base dataset loaded: (1042190, 47)
--------------------------------------------------------------------------------
CREATING ADDITIONAL ENGINEERED FEATURES
--------------------------------------------------------------------------------

1. Creating basic derived features from REQUIRED_COLS...
Created 3 allowed derived features


--------------------------------------------------------------------------------
FEATURE ENGINEERING SUMMARY
--------------------------------------------------------------------------------
Original features: 47
New features created: 3
Total features: 50

Data Quality Check:
No infinite values detected
Missing values: 1718937 total

Missing value breakdown (top 20 columns):
skill                 1042190
purchase_tpscroll      227560
tower_damage_share      40837
sen_placed              32731
rune_pickups            32731
creeps_stacked          32731
camps_stacked           32731
obs_placed              32731
towers_killed           31461
roshans_killed       