In [2]:
import pandas as pd
import numpy as np

In [12]:
import pandas as pd
import numpy as np

# Read the cleaned batting stats CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer='../Data/processed/clean_batting_stats.csv')

# Create previous season stats for each player
def create_previous_season_features(df, lag_stats=None, group_col='Name', season_col='Season'):
    """Add lag, year-over-year change, and running-average columns per player.

    The frame is sorted by ``group_col`` then ``season_col`` and, for every
    stat in ``lag_stats``, these columns are added (in this order):

    * ``prev_<stat>`` / ``prev2_<stat>``: the value one / two rows earlier
      within the same group.
    * ``<stat>_change``: current value minus ``prev_<stat>``.
    * ``<stat>_career_avg``: expanding mean within the group, including the
      current row.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per player-season. Must contain ``group_col``, ``season_col``
        and every column named in ``lag_stats``.
    lag_stats : sequence of str, optional
        Stat columns to derive features from. Defaults to
        ``['AVG', 'HR', 'RBI', 'OBP', 'PA']``.
    group_col : str, default 'Name'
        Column identifying a player. Rows sharing this value are treated as
        the same player, so a unique id column is safer than a display name
        when two players can share a name.
    season_col : str, default 'Season'
        Column used to order each player's rows chronologically.

    Returns
    -------
    pandas.DataFrame
        A new, sorted frame with the feature columns appended. The input
        frame is not modified and the original index labels are kept.

    Raises
    ------
    KeyError
        If any required column is missing from ``df``.

    Notes
    -----
    Lags are positional: "previous" means the previous row for that player
    after sorting, not necessarily ``season_col - 1``. If a player has a gap
    between seasons, the lag still comes from the last recorded row.
    """
    if lag_stats is None:
        lag_stats = ['AVG', 'HR', 'RBI', 'OBP', 'PA']
    else:
        lag_stats = list(lag_stats)

    # Fail early with one clear message instead of a KeyError from deep inside pandas
    required = [group_col, season_col, *lag_stats]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # sort_values returns a new object, so the caller's frame is left untouched
    df = df.sort_values([group_col, season_col])

    # Collected separately so the output column order stays: all lags, all changes, all averages
    lag_cols, change_cols, career_cols = {}, {}, {}
    for stat in lag_stats:
        # Build the per-player grouping once per stat and reuse it for every feature
        per_player = df.groupby(group_col)[stat]
        previous = per_player.shift(1)

        lag_cols[f'prev_{stat}'] = previous
        lag_cols[f'prev2_{stat}'] = per_player.shift(2)  # two rows back
        change_cols[f'{stat}_change'] = df[stat] - previous
        # expanding() yields a (group, original index) MultiIndex; drop the group
        # level so the result aligns with df's own index on assignment
        career_cols[f'{stat}_career_avg'] = (
            per_player.expanding().mean().reset_index(0, drop=True)
        )

    new_cols = {**lag_cols, **change_cols, **career_cols}
    return df.assign(**new_cols)

# Build the lag / change / running-average feature table from the loaded rows
df_features = create_previous_season_features(df)

# Targets: for each stat, the value from the player's following row
target_stats = ['AVG', 'HR', 'RBI', 'OBP']
for stat in target_stats:
    df_features['next_' + stat] = df_features[stat].groupby(df_features['Name']).shift(-1)

# Keep only rows where every target is present, i.e. the player has a following row.
# Rows whose prev_* features are NaN (a player's first rows) are intentionally retained.
df_features = df_features[
    df_features[['next_' + stat for stat in target_stats]].notna().all(axis=1)
]

# Report the full column list
print("Features created:", df_features.columns.tolist(), sep="\n")

# Persist features and targets together
df_features.to_csv('../Data/processed/features_and_targets.csv', index=False)

# Quick summary: overall shape plus the first rows for the first player in the table
print("\nDataset shape:", df_features.shape)
print("\nSample of features for one player:")
print(df_features.loc[df_features['Name'].eq(df_features['Name'].iloc[0])].head())

Features created:
['IDfg', 'Season', 'Name', 'Team', 'Age', 'G', 'AB', 'PA', 'H', '1B', '2B', '3B', 'HR', 'R', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SF', 'SH', 'GDP', 'SB', 'CS', 'AVG', 'GB', 'FB', 'LD', 'IFFB', 'Pitches', 'Balls', 'Strikes', 'IFH', 'BU', 'BUH', 'BB%', 'K%', 'BB/K', 'OBP', 'SLG', 'OPS', 'ISO', 'BABIP', 'GB/FB', 'LD%', 'GB%', 'FB%', 'IFFB%', 'HR/FB', 'IFH%', 'BUH%', 'wOBA', 'wRAA', 'wRC', 'Bat', 'Fld', 'Rep', 'Pos', 'RAR', 'WAR', 'Dol', 'Spd', 'wRC+', 'WPA', '-WPA', '+WPA', 'RE24', 'REW', 'pLI', 'phLI', 'PH', 'WPA/LI', 'Clutch', 'FB% (Pitch)', 'FBv', 'SL%', 'SLv', 'CT%', 'CTv', 'CB%', 'CBv', 'CH%', 'CHv', 'SF%', 'SFv', 'KN%', 'KNv', 'XX%', 'PO%', 'wFB', 'wSL', 'wCT', 'wCB', 'wCH', 'wSF', 'wKN', 'wFB/C', 'wSL/C', 'wCT/C', 'wCB/C', 'wCH/C', 'wSF/C', 'wKN/C', 'O-Swing%', 'Z-Swing%', 'Swing%', 'O-Contact%', 'Z-Contact%', 'Contact%', 'Zone%', 'F-Strike%', 'SwStr%', 'BsR', 'FA% (sc)', 'FT% (sc)', 'FC% (sc)', 'FS% (sc)', 'FO% (sc)', 'SI% (sc)', 'SL% (sc)', 'CU% (sc)', 'KC% (sc)', 

In [13]:
# Collect every column whose name contains one of the target stat abbreviations
columns = df_features.filter(regex='AVG|HR|RBI|OBP', axis=1).columns.tolist()
print("Stat-related columns:", columns)

# Summary statistics for the four target columns
print("\nTarget variable statistics:")
print(df_features.loc[:, ['next_AVG', 'next_HR', 'next_RBI', 'next_OBP']].describe())

Stat-related columns: ['HR', 'RBI', 'AVG', 'OBP', 'HR/FB', 'AVG+', 'OBP+', 'HR/FB%+', 'prev_AVG', 'prev2_AVG', 'prev_HR', 'prev2_HR', 'prev_RBI', 'prev2_RBI', 'prev_OBP', 'prev2_OBP', 'AVG_change', 'HR_change', 'RBI_change', 'OBP_change', 'AVG_career_avg', 'HR_career_avg', 'RBI_career_avg', 'OBP_career_avg', 'next_AVG', 'next_HR', 'next_RBI', 'next_OBP']

Target variable statistics:
         next_AVG     next_HR    next_RBI    next_OBP
count  390.000000  390.000000  390.000000  390.000000
mean     0.261777   19.174359   65.471795    0.337018
std      0.030898   10.646233   27.068664    0.034493
min      0.184000    0.000000   12.000000    0.238000
25%      0.242000   11.000000   46.000000    0.315000
50%      0.262000   17.000000   66.500000    0.333000
75%      0.278000   26.750000   86.000000    0.355750
max      0.364000   62.000000  139.000000    0.490000
