# 🔧 Run Feature Engineering Pipeline

This notebook executes the complete feature engineering pipeline to generate the processed features needed for analysis.

## What it does:
1. Loads raw CTG data
2. Creates three-tier feature architecture
3. Saves processed features for other notebooks


In [4]:
# Import required libraries
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

print("Starting Feature Engineering Pipeline...")
print("="*50)

Starting Feature Engineering Pipeline...


In [5]:
# Load CTG data
base_path = Path('/Users/diyagamah/Documents/nba_props_model/data')
season_path = base_path / 'ctg_data_organized' / 'players' / '2023-24' / 'regular_season'

# Load offensive data
offensive_path = season_path / 'offensive_overview' / 'offensive_overview.csv'
offensive_df = pd.read_csv(offensive_path)

# Clean percentage columns
for col in ['Usage', 'AST%', 'TOV%']:
    if col in offensive_df.columns and offensive_df[col].dtype == 'object':
        offensive_df[col] = offensive_df[col].str.replace('%', '').astype(float)

print(f"✓ Loaded offensive data: {len(offensive_df)} players")

# Load defense data
defense_path = season_path / 'defense_rebounding' / 'defense_rebounding.csv'
defense_df = pd.read_csv(defense_path)

# Clean defense percentage columns
for col in ['fgOR%', 'fgDR%', 'ftOR%', 'ftDR%', 'BLK%', 'STL%', 'FOUL%']:
    if col in defense_df.columns and defense_df[col].dtype == 'object':
        defense_df[col] = defense_df[col].str.replace('%', '').astype(float)

print(f"✓ Loaded defense data: {len(defense_df)} players")

# Load shooting data
shooting_path = season_path / 'shooting_overall' / 'shooting_overall.csv'
if shooting_path.exists():
    shooting_df = pd.read_csv(shooting_path)
    for col in ['eFG%', '2P%', '3P%', 'FT%']:
        if col in shooting_df.columns and shooting_df[col].dtype == 'object':
            shooting_df[col] = shooting_df[col].str.replace('%', '').astype(float)
    print(f"✓ Loaded shooting data: {len(shooting_df)} players")

✓ Loaded offensive data: 503 players
✓ Loaded defense data: 503 players
✓ Loaded shooting data: 503 players


In [6]:
# Merge all data
player_df = offensive_df.copy()

# Merge defense data
defense_cols = ['Player', 'Team', 'fgOR%', 'fgDR%', 'ftOR%', 'ftDR%', 'BLK%', 'STL%', 'FOUL%']
defense_cols = [col for col in defense_cols if col in defense_df.columns]
player_df = player_df.merge(
    defense_df[defense_cols],
    on=['Player', 'Team'],
    how='left',
    suffixes=('', '_def')
)

# Merge shooting data if exists
if 'shooting_df' in locals():
    shooting_cols = ['Player', 'Team', 'eFG%', '2P%', '3P%', 'FT%']
    shooting_cols = [col for col in shooting_cols if col in shooting_df.columns]
    player_df = player_df.merge(
        shooting_df[shooting_cols],
        on=['Player', 'Team'],
        how='left',
        suffixes=('', '_shoot')
    )

print(f"\n✓ Merged data shape: {player_df.shape}")
print(f"  Columns: {player_df.shape[1]}")
print(f"  Players: {player_df.shape[0]}")


✓ Merged data shape: (503, 26)
  Columns: 26
  Players: 503


In [7]:
# Create Core Performance Features (Tier 1)
core_features = pd.DataFrame(index=player_df.index)

# Basic performance metrics
core_features['USG_percent'] = player_df['Usage']
core_features['PSA'] = player_df['PSA']
core_features['MIN'] = player_df['MIN']

# Playmaking
if 'AST%' in player_df.columns:
    core_features['AST_percent'] = player_df['AST%']
    core_features['AST_to_USG_Ratio'] = player_df['AST%'] / (player_df['Usage'] + 0.001)

# Rebounding
if 'fgDR%' in player_df.columns:
    core_features['fgDR_percent'] = player_df['fgDR%']
if 'fgOR%' in player_df.columns:
    core_features['fgOR_percent'] = player_df['fgOR%']
    core_features['Total_REB_percent'] = player_df['fgDR%'] + player_df['fgOR%']

# Shooting efficiency
if 'eFG%' in player_df.columns:
    core_features['eFG_percent'] = player_df['eFG%']

# Turnover rate
if 'TOV%' in player_df.columns:
    core_features['TOV_percent'] = player_df['TOV%']

# Defense
if all(col in player_df.columns for col in ['BLK%', 'STL%']):
    core_features['Defensive_Activity'] = player_df['BLK%'] + player_df['STL%']

print(f"✓ Created {len(core_features.columns)} core performance features")

✓ Created 11 core performance features


In [8]:
# Create Contextual Features (Tier 2)
context_features = pd.DataFrame(index=player_df.index)

# Minutes category
context_features['Minutes_Season_Avg'] = player_df['MIN']
context_features['Minutes_Category'] = pd.cut(
    player_df['MIN'], 
    bins=[0, 15, 25, 32, 40],
    labels=['Bench', 'Rotation', 'Starter', 'Star']
)

# Position inference
if 'AST%' in player_df.columns and 'fgDR%' in player_df.columns:
    conditions = [
        (player_df['AST%'] > player_df['AST%'].quantile(0.7)),
        (player_df['fgDR%'] > player_df['fgDR%'].quantile(0.7)),
    ]
    choices = ['Guard', 'Big']
    context_features['Position_Inferred'] = np.select(conditions, choices, default='Forward')

# Role classification
if 'Usage' in player_df.columns and 'MIN' in player_df.columns:
    conditions = [
        (player_df['Usage'] > 25) & (player_df['MIN'] > 30),
        (player_df['Usage'] > 20) & (player_df['MIN'] > 25),
        (player_df['MIN'] > 20),
    ]
    choices = ['Primary', 'Secondary', 'Role']
    context_features['Player_Role'] = np.select(conditions, choices, default='Bench')

print(f"✓ Created {len(context_features.columns)} contextual features")

✓ Created 4 contextual features


In [9]:
# Create Temporal Features (Tier 3) - Simulated for now
temporal_features = pd.DataFrame(index=player_df.index)

# Consistency score
if 'Usage Rank' in player_df.columns and 'PSA Rank' in player_df.columns:
    temporal_features['Consistency_Score'] = 1 / (1 + np.abs(player_df['Usage Rank'] - player_df['PSA Rank'])/100)

# Usage stability
if 'Usage' in player_df.columns:
    from scipy import stats
    usage_zscore = np.abs(stats.zscore(player_df['Usage'].fillna(player_df['Usage'].mean())))
    temporal_features['Usage_Stability'] = 1 / (1 + np.exp(-usage_zscore))

# Performance tier
if 'MIN' in player_df.columns and 'Usage' in player_df.columns:
    temporal_features['Performance_Tier'] = (
        player_df['MIN'].rank(pct=True) * 0.5 + 
        player_df['Usage'].rank(pct=True) * 0.5
    )

# Opportunity score
if all(col in player_df.columns for col in ['MIN', 'Usage', 'PSA']):
    temporal_features['Opportunity_Score'] = (
        player_df['MIN'] * player_df['Usage'] * player_df['PSA'] / 10000
    )

print(f"✓ Created {len(temporal_features.columns)} temporal features")

✓ Created 4 temporal features


In [10]:
# Create Engineered Features (NEW)
engineered_features = pd.DataFrame(index=player_df.index)

# Efficiency at Volume: How efficient are high-usage players?
if 'Usage' in player_df.columns and 'eFG%' in player_df.columns:
    engineered_features['Efficiency_x_Volume'] = player_df['Usage'] * player_df['eFG%'] / 100
    print(f"  ✓ Added Efficiency_x_Volume (USG × eFG)")

# Playmaking Efficiency: Assist-to-Turnover quality
if 'AST%' in player_df.columns and 'TOV%' in player_df.columns:
    # Add small constant to avoid division by zero
    engineered_features['Playmaking_Efficiency'] = player_df['AST%'] / (player_df['TOV%'] + 0.1)
    print(f"  ✓ Added Playmaking_Efficiency (AST/TOV ratio)")

# True Shooting x Minutes: High-efficiency scorers with playing time
if 'MIN' in player_df.columns and 'eFG%' in player_df.columns:
    engineered_features['Minutes_x_Efficiency'] = (player_df['MIN'] / 82) * player_df['eFG%']
    print(f"  ✓ Added Minutes_x_Efficiency")

# Offensive Load: Combined usage and assist responsibility
if 'Usage' in player_df.columns and 'AST%' in player_df.columns:
    engineered_features['Offensive_Load'] = player_df['Usage'] + (player_df['AST%'] * 0.5)
    print(f"  ✓ Added Offensive_Load (total offensive responsibility)")

print(f"✓ Created {len(engineered_features.columns)} engineered features")

  ✓ Added Efficiency_x_Volume (USG × eFG)
  ✓ Added Playmaking_Efficiency (AST/TOV ratio)
  ✓ Added Minutes_x_Efficiency
  ✓ Added Offensive_Load (total offensive responsibility)
✓ Created 4 engineered features


In [11]:
# Combine all features
final_features = pd.concat([
    player_df[['Player', 'Team']],
    core_features,
    context_features.select_dtypes(exclude=['category']),  # Exclude categorical for now
    temporal_features,
    engineered_features  # Add engineered features
], axis=1)

# Remove redundant feature (100% correlated with MIN)
if 'Minutes_Season_Avg' in final_features.columns:
    final_features = final_features.drop('Minutes_Season_Avg', axis=1)
    print("  ✓ Removed redundant Minutes_Season_Avg (duplicate of MIN)")

# Add target variable (PRA estimate) - CORRECTED VERSION
if all(col in player_df.columns for col in ['MIN', 'Usage', 'PSA', 'AST%', 'fgDR%']):
    # Convert total season minutes to per-game minutes (assuming 82-game season)
    mpg = player_df['MIN'] / 82
    
    # Points estimate: MPG * Usage% * (PSA/100) * 1.2
    points_est = mpg * (player_df['Usage'] / 100) * (player_df['PSA'] / 100) * 1.2
    
    # Rebounds estimate: MPG * Rebounding% * 1.8
    rebounds_est = mpg * (player_df['fgDR%'] / 100) * 1.8
    
    # Assists estimate: MPG * AST% * 0.8
    assists_est = mpg * (player_df['AST%'] / 100) * 0.8
    
    # Combined PRA estimate
    final_features['PRA_estimate'] = points_est + rebounds_est + assists_est
    
    # Add individual components for analysis
    final_features['Points_estimate'] = points_est
    final_features['Rebounds_estimate'] = rebounds_est
    final_features['Assists_estimate'] = assists_est

print(f"\n✅ FINAL FEATURE MATRIX:")
print(f"  Shape: {final_features.shape}")
print(f"  Total features: {len(final_features.columns) - 2} (excluding Player, Team)")
print(f"\nFeature categories:")
print(f"  - Core Performance: {len(core_features.columns)} features")
print(f"  - Contextual: {len([c for c in context_features.columns if c in final_features.columns])} features")
print(f"  - Temporal: {len(temporal_features.columns)} features")
print(f"  - Engineered: {len(engineered_features.columns)} features")

  ✓ Removed redundant Minutes_Season_Avg (duplicate of MIN)

✅ FINAL FEATURE MATRIX:
  Shape: (503, 27)
  Total features: 25 (excluding Player, Team)

Feature categories:
  - Core Performance: 11 features
  - Contextual: 2 features
  - Temporal: 4 features
  - Engineered: 4 features


In [12]:
# Save processed features
output_path = Path('/Users/diyagamah/Documents/nba_props_model/data/processed')
output_path.mkdir(parents=True, exist_ok=True)

# Save main features file
output_file = output_path / 'player_features_2023_24.csv'
final_features.to_csv(output_file, index=False)
print(f"\n💾 Saved processed features to:")
print(f"   {output_file}")

# Show top players
if 'PRA_estimate' in final_features.columns:
    print("\n🏆 TOP 10 PLAYERS BY PRA:")
    print("="*50)
    top_10 = final_features.nlargest(10, 'PRA_estimate')[['Player', 'Team', 'PRA_estimate']]
    for idx, row in top_10.iterrows():
        print(f"{row['Player']:25s} ({row['Team']})  →  {row['PRA_estimate']:.1f}")

print("\n✅ Feature engineering pipeline complete!")
print("   You can now run the other notebooks with processed features.")


💾 Saved processed features to:
   /Users/diyagamah/Documents/nba_props_model/data/processed/player_features_2023_24.csv

🏆 TOP 10 PLAYERS BY PRA:
Nikola Jokic              (DEN)  →  43.0
Luka Doncic               (DAL)  →  42.7
Domantas Sabonis          (SAC)  →  41.0
Giannis Antetokounmpo     (MIL)  →  37.2
LeBron James              (LAL)  →  32.4
Paolo Banchero            (ORL)  →  31.2
Anthony Davis             (LAL)  →  31.2
Shai Gilgeous-Alexander   (OKC)  →  30.9
Jayson Tatum              (BOS)  →  30.9
Anthony Edwards           (MIN)  →  30.1

✅ Feature engineering pipeline complete!
   You can now run the other notebooks with processed features.
