In [32]:
import pandas as pd
# import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from pybaseball import batting_stats
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


# Random Forest Regressor Training

In [33]:
# Fetch the batting table from pybaseball for the 2014 and 2018 bounds
# (presumably an inclusive season range -- confirm against pybaseball docs).
data = batting_stats(2014, 2018)

# Target is the 'H' column; the four columns below are the predictors.
y = data['H']
X = data[['Age', 'GB', 'LD', 'SwStr%']]

In [34]:
# Seeded split; no test_size is given, so scikit-learn's default proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# fit() hands back the trained estimator, so build-and-train is one statement.
rf_model = RandomForestRegressor(random_state=78, n_estimators=128).fit(X_train, y_train)

# Predictions on the held-out rows.
predictions = rf_model.predict(X_test)

In [35]:

# Report the forest's score() on the rows it was trained on and on the held-out rows.
print(
    f"Training Score: {rf_model.score(X_train, y_train)}",
    f"Testing Score: {rf_model.score(X_test, y_test)}",
    sep="\n",
)

Training Score: 0.944221213469784
Testing Score: 0.5011051973595293


# Gradient Boosting Regressor Training

In [36]:
# Load the pre-cleaned dataset. Forward slashes keep the path valid on every OS
# and avoid the invalid "\c" escape sequence that the backslash form contained.
data_path = 'resources/cleaned_extended_data3 (2).csv'
df = pd.read_csv(data_path)

# Every column except the target is used as a feature.
X = df.drop('next_year_hits', axis=1)
y = df['next_year_hits']

In [37]:
# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Build and train the gradient boosting regressor in one chained statement.
gb_model = GradientBoostingRegressor(
    n_estimators=190,
    learning_rate=0.1,
    max_depth=8,
    random_state=42,
).fit(X_train, y_train)

# Predictions for the held-out rows, scored in the following cell.
y_pred = gb_model.predict(X_test)


In [38]:
# Evaluate the held-out predictions with each metric, in a fixed order.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f'MSE: {mse}')
print(f'MAE: {mae}')
print(f'R² Score: {r2}')

# The estimator's own score() on each split, train first and then test.
print(f'Train score: {gb_model.score(X_train, y_train)}')
print(f'Test score: {gb_model.score(X_test, y_test)}')

MSE: 398.34215931956476
MAE: 13.657809975591451
R² Score: 0.8506441710830508
Train score: 0.9894306570044707
Test score: 0.8506441710830508


# The Wizard's Code

In [39]:
def test_model(model, data):
    
    X_train, X_test, y_train, y_test = data
    reg = model.fit(X_train, y_train)
    
    train_score = reg.score(X_train, y_train)
    test_score = reg.score(X_test, y_test)
    
    # print(f'Model: {type(reg).__name__}')
    
    # print(f'Train score: {train_score}')
    # print(f'Test Score: {test_score}\n')    
    
    return {'train':train_score, 'test':test_score}

In [40]:
# Forward slash keeps the path portable and avoids the invalid "\d" escape
# sequence that the backslash-separated literal contained.
data2 = pd.read_csv('resources/data2.csv')

# Columns kept for modelling; 'next_year_hits' is the prediction target.
relevant_cols=['Age','PA','H','HR','R','RBI','SB','BB','SO','AVG','OBP','SLG','OPS','wOBA','WAR','next_year_hits']

In [41]:
# Work on an independent deep copy so data2 stays untouched.
df = data2.copy(deep=True)

# Keep only the modelling columns and discard any row with a missing value.
relevant = df[relevant_cols].dropna()

relevant.head()

Unnamed: 0,Age,PA,H,HR,R,RBI,SB,BB,SO,AVG,OBP,SLG,OPS,wOBA,WAR,next_year_hits
0,25,614,180,32,129,80,30,81,91,0.346,0.438,0.64,1.078,0.449,10.4,176.0
1,26,608,147,39,101,79,24,122,124,0.312,0.46,0.628,1.088,0.447,9.5,137.0
2,22,654,172,42,118,99,6,124,131,0.33,0.46,0.649,1.109,0.461,9.3,123.0
3,23,682,172,41,104,90,11,92,158,0.299,0.402,0.59,0.991,0.415,9.3,173.0
4,25,678,154,52,128,114,9,127,208,0.284,0.422,0.627,1.049,0.43,8.7,115.0


In [28]:
# Separate target and features without mutating `relevant`: the previous
# `relevant.pop(...)` removed the column in place, so re-running this cell
# raised a KeyError and X was merely an alias of the mutated frame.
y = relevant['next_year_hits']
X = relevant.drop(columns='next_year_hits')

X.head()

Unnamed: 0,Age,PA,H,HR,R,RBI,SB,BB,SO,AVG,OBP,SLG,OPS,wOBA,WAR
0,25,614,180,32,129,80,30,81,91,0.346,0.438,0.64,1.078,0.449,10.4
1,26,608,147,39,101,79,24,122,124,0.312,0.46,0.628,1.088,0.447,9.5
2,22,654,172,42,118,99,6,124,131,0.33,0.46,0.649,1.109,0.461,9.3
3,23,682,172,41,104,90,11,92,158,0.299,0.402,0.59,0.991,0.415,9.3
4,25,678,154,52,128,114,9,127,208,0.284,0.422,0.627,1.049,0.43,8.7


In [30]:
def run(model, X, y, model_name):
    """Grid-search tree count and depth for an ensemble regressor class.

    ``X``/``y`` are split once with ``random_state=1`` and the features are
    standardized with a scaler fitted on the training split only. Then
    ``model(n_estimators=trees, max_depth=depth)`` is fitted and scored via
    ``test_model`` for every ``trees`` in ``range(40, 200, 10)`` and every
    ``depth`` in ``range(2, 10, 2)``. Progress, each new best test score, and a
    final summary of the best configuration are printed.

    No ``random_state`` is passed to the estimator, so stochastic models may
    score differently from run to run.

    Parameters
    ----------
    model : callable
        Estimator class (not an instance) accepting ``n_estimators`` and
        ``max_depth`` keyword arguments.
    X, y
        Features and target, as accepted by ``train_test_split``.
    model_name : str
        Label used in the final summary printout.

    Returns
    -------
    list of tuple
        One ``(trees, depth, train_score, test_score, len(X))`` tuple per
        configuration, in search order.
    """
    # The split is seeded and the scaler is deterministic, so both are
    # loop-invariant: compute them once instead of once per configuration.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    scaler = StandardScaler().fit(X_train)
    forest_data = [scaler.transform(X_train), scaler.transform(X_test), y_train, y_test]
    num_rows = len(X)

    results = []
    best = None
    # -inf rather than a magic sentinel: a test score can be arbitrarily
    # negative, and the old -999 sentinel led to an IndexError in that case.
    max_r2 = float('-inf')

    for trees in range(40, 200, 10):
        for depth in range(2, 10, 2):

            print(f'\ntrees: {trees}, max-depth: {depth}')

            scores = test_model(model(n_estimators=trees, max_depth=depth), forest_data)

            row = (trees, depth, scores['train'], scores['test'], num_rows)
            results.append(row)

            # Strict '>' keeps the first configuration that reaches the best score;
            # the `best is None` guard guarantees a winner even if scores are NaN.
            if best is None or scores['test'] > max_r2:
                max_r2 = scores['test']
                best = row
                print(f'New max score: {max_r2}')

    print(f'\n\nbest {model_name}\ntrain score: {best[2]}\ntest score: {best[3]}\nnum rows: {best[4]}\ntrees: {best[0]}\nmax-depth: {best[1]}\n\n')

    return results

In [42]:
# Sweep the same grid for both ensemble types; the calls stay sequential so
# the first result is kept even if the second sweep is interrupted.
results_rf = run(
    model=RandomForestRegressor,
    X=X,
    y=y,
    model_name='RANDOM FOREST',
)
results_gb = run(
    model=GradientBoostingRegressor,
    X=X,
    y=y,
    model_name='GRADIENT BOOSTING',
)


trees: 40, max-depth: 2
New max score: 0.30306105796070837

trees: 40, max-depth: 4
New max score: 0.38958277912228245

trees: 40, max-depth: 6
New max score: 0.4633674279301355

trees: 40, max-depth: 8
New max score: 0.540624890160579

trees: 50, max-depth: 2

trees: 50, max-depth: 4

trees: 50, max-depth: 6

trees: 50, max-depth: 8
New max score: 0.5413284055070626

trees: 60, max-depth: 2

trees: 60, max-depth: 4

trees: 60, max-depth: 6

trees: 60, max-depth: 8
New max score: 0.5426878288058165

trees: 70, max-depth: 2

trees: 70, max-depth: 4

trees: 70, max-depth: 6

trees: 70, max-depth: 8

trees: 80, max-depth: 2

trees: 80, max-depth: 4

trees: 80, max-depth: 6

trees: 80, max-depth: 8

trees: 90, max-depth: 2

trees: 90, max-depth: 4

trees: 90, max-depth: 6

trees: 90, max-depth: 8
New max score: 0.5434844159647483

trees: 100, max-depth: 2

trees: 100, max-depth: 4

trees: 100, max-depth: 6

trees: 100, max-depth: 8
New max score: 0.5440218843540023

trees: 110, max-depth:

KeyboardInterrupt: 

In [None]:
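# Standard regressors (disabled). NOTE: test_model unpacks its second argument as
# (X_train, X_test, y_train, y_test), so `data` here must be such a split,
# not the `data` DataFrame loaded in the first cell.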
# test_model(LinearRegression(), data)

# test_model(KNeighborsRegressor(), data)
# test_model(RandomForestRegressor(), data)
# test_model(ExtraTreesRegressor(), data)
# test_model(AdaBoostRegressor(), data)
# test_model(SVR(C=1.0, epsilon=0.2), data)
# test_model(GradientBoostingRegressor(), data)

In [None]:
# Define a list of models
# models = [
#     LinearRegression(),
#     KNeighborsRegressor(),
#     RandomForestRegressor(),
#     ExtraTreesRegressor(),
#     AdaBoostRegressor(),
#     SVR(C=1.0, epsilon=0.2),
#     GradientBoostingRegressor()
# ]

# # Split the data into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Create a tuple of the training and test sets
# data_split = (X_train, X_test, y_train, y_test)

# # Iterate through the models and print the scores for each one
# for model in models:
#     scores = test_model(model, data_split)
#     print(f'Model: {type(model).__name__}, Scores: {scores}')