In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load feature data
df = pd.read_csv('../Data/processed/features_and_targets.csv')


Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,PA_change,AVG_career_avg,HR_career_avg,RBI_career_avg,OBP_career_avg,PA_career_avg,next_AVG,next_HR,next_RBI,next_OBP
0,9256,2020,A.J. Pollock,LAD,32,55,196,210,54,29,...,,0.276,16.000000,34.0,0.314000,210.000000,0.245,14.0,56.0,0.292
1,15640,2021,Aaron Judge,NYY,29,148,550,633,158,95,...,,0.287,39.000000,98.0,0.373000,633.000000,0.311,62.0,131.0,0.425
2,10950,2020,Adam Duvall,ATL,31,57,190,209,45,21,...,,0.237,16.000000,33.0,0.301000,209.000000,0.228,38.0,113.0,0.281
3,15223,2019,Adam Frazier,PIT,27,152,554,608,154,104,...,,0.278,10.000000,50.0,0.336000,608.000000,0.230,7.0,23.0,0.297
4,15223,2020,Adam Frazier,PIT,28,58,209,230,48,34,...,-378.0,0.254,8.500000,36.5,0.316500,419.000000,0.305,5.0,43.0,0.368
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
385,17232,2020,Yoan Moncada,CHW,25,52,200,231,45,28,...,-328.0,0.270,15.500000,51.5,0.343500,395.000000,0.263,14.0,61.0,0.375
386,19556,2021,Yordan Alvarez,HOU,24,144,537,598,149,80,...,,0.277,33.000000,104.0,0.346000,598.000000,0.306,37.0,97.0,0.406
387,19198,2019,Yuli Gurriel,HOU,35,144,564,612,168,95,...,,0.298,31.000000,104.0,0.343000,612.000000,0.232,6.0,22.0,0.274
388,19198,2020,Yuli Gurriel,HOU,36,57,211,230,49,30,...,-382.0,0.265,18.500000,63.0,0.308500,421.000000,0.319,15.0,81.0,0.383


In [2]:
# Define our target variables and features
targets = ['next_AVG', 'next_HR', 'next_RBI', 'next_OBP']

In [3]:
# Remove non-numeric columns and target variables from features
feature_cols = df.select_dtypes(include=[np.number]).columns
feature_cols = [col for col in feature_cols if col not in targets]

In [5]:
# Create seperate models for each target
results = {}
for target in targets:
    print(f'\nModeling {target}:')
    
    # Prepare X and Y
    X = df[feature_cols]
    y = df[target]
    
    # Split data 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train Models
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
    }
    
    for name, model in models.items():
        # Train model
        model.fit(X_train, y_train)
        
        # Make Predictions
        y_pred = model.predict(X_test)
        
        # Calculate Metrics
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        print(f'\n{name} Results:')
        print(f'Mean Squared Error: {mse:.4f}')
        print(f'Mean Absolute Error: {mae:.4f}')
        print(f'R^2 Score: {r2:.4f}')
        
        # Store Results
        results[(target, name)] = {
            'mse': mse,
            'mae': mae,
            'r2': r2
        }
        
# Save Results
results_df = pd.DataFrame(results).T
results_df.to_csv('../Models/model_results.csv')


Modeling next_AVG:


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer

# Load feature data
df = pd.read_csv('../Data/processed/features_and_targets.csv')

# Define our target variables and features
targets = ['next_AVG', 'next_HR', 'next_RBI', 'next_OBP']

# Remove non-numeric columns and target variables from features
feature_cols = df.select_dtypes(include=[np.number]).columns
feature_cols = [col for col in feature_cols if col not in targets]

# Create imputer for missing values
imputer = SimpleImputer(strategy='mean')

# Create separate models for each target
results = {}
for target in targets:
    print(f"\nModeling {target}:")
    
    # Prepare X and y
    X = df[feature_cols]
    y = df[target]
    
    # Remove rows where target is NaN
    mask = ~y.isna()
    X = X[mask]
    y = y[mask]
    
    # Impute missing values in features
    X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train models
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
    }
    
    for name, model in models.items():
        # Train model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        
        # Calculate metrics
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        print(f"\n{name} Results:")
        print(f"Mean Squared Error: {mse:.4f}")
        print(f"Mean Absolute Error: {mae:.4f}")
        print(f"R² Score: {r2:.4f}")
        
        # Store results
        results[(target, name)] = {
            'mse': mse,
            'mae': mae,
            'r2': r2
        }

# Save results
results_df = pd.DataFrame(results).T
results_df.to_csv('../Models/model_results.csv')


Modeling next_AVG:

Linear Regression Results:
Mean Squared Error: 3.1578
Mean Absolute Error: 1.1477
R² Score: -4095.2505

Random Forest Results:
Mean Squared Error: 0.0008
Mean Absolute Error: 0.0234
R² Score: -0.0549

Modeling next_HR:

Linear Regression Results:
Mean Squared Error: 19927395.3746
Mean Absolute Error: 1217.2877
R² Score: -201856.8940

Random Forest Results:
Mean Squared Error: 48.7951
Mean Absolute Error: 5.3154
R² Score: 0.5057

Modeling next_RBI:

Linear Regression Results:
Mean Squared Error: 293808154.5707
Mean Absolute Error: 4773.7523
R² Score: -504713.3976

Random Forest Results:
Mean Squared Error: 286.4830
Mean Absolute Error: 13.6418
R² Score: 0.5079

Modeling next_OBP:

Linear Regression Results:
Mean Squared Error: 8.7118
Mean Absolute Error: 2.2586
R² Score: -8628.4778

Random Forest Results:
Mean Squared Error: 0.0009
Mean Absolute Error: 0.0224
R² Score: 0.1161


In [7]:


# Load data
df = pd.read_csv('../Data/processed/features_and_targets.csv')

# Let's look at feature importance for each target
targets = ['next_AVG', 'next_HR', 'next_RBI', 'next_OBP']

for target in targets:
    print(f"\nTop 10 most important features for predicting {target}:")
    
    # Prepare X and y
    feature_cols = df.select_dtypes(include=[np.number]).columns
    feature_cols = [col for col in feature_cols if col not in targets]
    X = df[feature_cols]
    y = df[target]
    
    # Remove rows where target is NaN
    mask = ~y.isna()
    X = X[mask]
    y = y[mask]
    
    # Impute missing values
    imputer = SimpleImputer(strategy='mean')
    X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
    
    # Train Random Forest
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X, y)
    
    # Get feature importance
    importance = pd.DataFrame({
        'feature': X.columns,
        'importance': rf.feature_importances_
    })
    
    # Show top 10 features
    print(importance.sort_values('importance', ascending=False).head(10))


Top 10 most important features for predicting next_AVG:
            feature  importance
331  AVG_career_avg    0.050540
312             xBA    0.045615
307         HardHit    0.019131
209            TTO%    0.013953
286            BB%+    0.013043
203           Pull%    0.010231
100        O-Swing%    0.009947
285            AVG+    0.008535
87              wSL    0.008473
25               LD    0.008432

Top 10 most important features for predicting next_HR:
           feature  importance
1           Season    0.189272
305        Barrel%    0.182785
39             ISO    0.037806
290           ISO+    0.027846
308       HardHit%    0.020854
332  HR_career_avg    0.020126
306          maxEV    0.013979
44             FB%    0.011375
301         Hard%+    0.011114
318        prev_HR    0.009504

Top 10 most important features for predicting next_RBI:
           feature  importance
1           Season    0.447447
305        Barrel%    0.033712
39             ISO    0.032079
332  HR_caree