# Advanced Machine Learning Models
Model 3 - Decision Tree Model with Full Features  
Model 4 - Decision Tree Model with EDA Features  
Model 5 - LASSO Model with Full Features  
Model 6 - LASSO Model with EDA Features  

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

### Import Data

In [2]:
# Read in the CSV
df = pd.read_csv("Clean_2015_Pitching_Data.csv", sep = ",")

# Create a dataframe that eliminates all string identifiers and target features other than strikeout
full_features = df.drop(['ID', 'last_year', 'NAME', 'new_yahoo', 'new_IP', 'new_win', 
'new_loss', 'new_hit', 'new_earned_run', 'new_walk'], axis=1)
full_features.head()

Unnamed: 0,player_age,last_year_Yahoo,IP,pa,ab,hit,single,double,triple,home_run,...,breaking_avg_break_x,breaking_avg_break,breaking_range_speed,n_offspeed_formatted,offspeed_avg_speed,offspeed_avg_spin,offspeed_avg_break_x,offspeed_avg_break,offspeed_range_speed,new_strikeout
0,26,576.6,228.2,886,820,174,120,32,6,16,...,0.4,4.2,1.8,27.6,91.8,2056,-16.8,17.3,1.3,151
1,32,441.6,178.2,707,657,145,94,31,6,14,...,-2.0,8.7,1.7,31.6,79.0,1848,13.3,16.2,1.4,119
2,33,394.6,181.2,745,680,159,113,26,1,19,...,13.8,16.6,5.4,6.1,85.4,1824,-16.2,17.9,2.2,186
3,28,246.0,172.0,753,692,199,135,34,1,29,...,15.8,16.5,1.5,15.4,84.6,1522,-14.4,15.4,1.7,184
4,27,487.0,200.0,791,747,172,123,29,2,18,...,5.5,8.5,2.0,2.3,86.4,1557,-12.9,14.8,1.2,107


In [3]:
# Create dataframe that only has features that were selected in the EDA
EDA_features = df[['new_strikeout', 'strikeout', 'k_percent', 'p_swinging_strike', 
'xba', 'z_swing_miss_percent', 'iz_contact_percent', 'in_zone_swing_miss', 'whiff_percent']]
EDA_features.head()

Unnamed: 0,new_strikeout,strikeout,k_percent,p_swinging_strike,xba,z_swing_miss_percent,iz_contact_percent,in_zone_swing_miss,whiff_percent
0,151,207,23.4,388,0.215,18.3,80.8,218,25.4
1,119,138,19.5,288,0.225,17.5,82.1,161,24.2
2,186,167,22.4,283,0.227,19.6,80.1,195,23.6
3,184,149,19.8,231,0.275,12.8,86.8,114,21.2
4,107,198,25.0,351,0.245,15.8,83.9,154,29.7


### Model 3 - Decision Tree Model with Full Features

In [4]:
# Separate features (X) and target variable (y)
X = full_features.drop(['new_strikeout'], axis =1)
y = full_features['new_strikeout']

# Split data into training and testing sets (test_size=0.2 for 20% test data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Create the decision tree regression model
model = DecisionTreeRegressor(max_depth=3)

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on testing data
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

Mean Squared Error: 2138.0212190110615
R-squared: 0.12592914194383165


#### Model 3 - Decision Tree with Full Features Results
| Depth | MSE   | R2    |
| ----- | ----- | ----- |
| 5     | 2632 | -.07?? |
| 4     | 1946 | .20 |
| 3     | 2138 | .13 |
| 2     | 1591 | .35 |
| 1     | 1783 | .27 |

### Model 4 - Decision Tree with EDA Features

In [6]:
# Separate features (X) and target variable (y)
X = EDA_features.drop(['new_strikeout'], axis =1)
y = EDA_features['new_strikeout']

# Split data into training and testing sets (test_size=0.2 for 20% test data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Create the decision tree regression model
model = DecisionTreeRegressor(max_depth=3)

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on testing data
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

Mean Squared Error: 1365.2803924553702
R-squared: 0.4418428622178452


#### Model 4 Decision Tree with EDA Features Results
| Depth | MSE   | R2    |
| ----- | ----- | ----- |
| 5     | 1750 | .28 |
| 4     | 1645 | .33 |
| 3     | 1365 | .44 |
| 2     | 1591 | .35 |
| 1     | 1783 | .27 |

In [17]:
# Access feature importances
feature_importances = model.feature_importances_

# Sort features and importances together by importance (descending order)
sorted_features_and_importances = sorted(zip(X.columns, feature_importances), key=lambda x: x[1], reverse=True)

# Print features in order of importance
for feature, importance in sorted_features_and_importances:
  print(f"{feature}: {importance:.4f}")

k_percent: 0.8171
strikeout: 0.1626
whiff_percent: 0.0203
p_swinging_strike: 0.0000
xba: 0.0000
z_swing_miss_percent: 0.0000
iz_contact_percent: 0.0000
in_zone_swing_miss: 0.0000


### Model 5 - LASSO Model with Full Features

In [20]:
from sklearn.linear_model import Lasso

# Separate features (X) and target variable (y)
X = full_features.drop(['new_strikeout'], axis =1)
y = full_features['new_strikeout']

# Split data into training and testing sets (test_size=0.2 for 20% test data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the LASSO model with alpha (regularization parameter)
model = Lasso(alpha=11) 

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on testing data
y_pred = model.predict(X_test)

# Evaluate model performance (e.g., using mean squared error)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

Mean Squared Error: 1254.5788622332357
R-squared: 0.4871001218974983


  model = cd_fast.enet_coordinate_descent(


#### Model 5 - Lasso Alpha with Full Features Analysis
| Alpha | MSE   | R2    |
| ----- | ----- | ----- |
| .001  | 1398 | .4281 |
| .01   | 1401 | .4217 |
| .1    | 1372 | .4388 |
| 1     | 1354 | .4462 |
| 10    | 1255 | .4865 |
| 11    | 1254 | .4871 |
| 100   | 1293 | .4713 |

In [21]:
# Access coefficients
# lasso_coefficients = model.coef_

# Sort features and coefficients together by absolute coefficient value (descending order)
# sorted_features_and_coefficients = sorted(zip(X.columns, lasso_coefficients), key=lambda x: abs(x[1]), reverse=True)

# Print features and coefficients in sorted order
# for feature_name, coefficient in sorted_features_and_coefficients:
 #  print(f"{feature_name}: {coefficient:.4f}")

# Access feature importances
feature_importances = model.feature_importances_

# Sort features and importances together by importance (descending order)
sorted_features_and_importances = sorted(zip(X.columns, feature_importances), key=lambda x: x[1], reverse=True)

# Print features in order of importance
for feature, importance in sorted_features_and_importances:
  print(f"{feature}: {importance:.4f}")

AttributeError: 'Lasso' object has no attribute 'feature_importances_'

### Model 6 - LASSO Model with EDA Features

In [11]:
from sklearn.linear_model import Lasso

# Separate features (X) and target variable (y)
X = EDA_features.drop(['new_strikeout'], axis =1)
y = EDA_features['new_strikeout']

# Split data into training and testing sets (test_size=0.2 for 20% test data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the LASSO model with alpha (regularization parameter)
model = Lasso(alpha=1) 

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on testing data
y_pred = model.predict(X_test)

# Evaluate model performance (e.g., using mean squared error)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

Mean Squared Error: 1328.6815572360792
R-squared: 0.456805284021488


#### Model 6 - Lasso Alpha with EDA Features Analysis
| Alpha | MSE   | R2    |
| ----- | ----- | ----- |
| .001  | 1312 | .4633 |
| .01   | 1314 | .4625 |
| .1    | 1315 | .4621 |
| 1     | 1328 | .4568 |
| 10    | 1362 | .4432 |
| 100   | 1556 | .3635 |

In [12]:
# Access coefficients
lasso_coefficients = model.coef_

# Sort features and coefficients together by absolute coefficient value (descending order)
sorted_features_and_coefficients = sorted(zip(X.columns, lasso_coefficients), key=lambda x: abs(x[1]), reverse=True)

# Print features and coefficients in sorted order
for feature_name, coefficient in sorted_features_and_coefficients:
  print(f"{feature_name}: {coefficient:.4f}")

k_percent: 3.4438
whiff_percent: 1.5132
iz_contact_percent: 1.4107
strikeout: 0.3825
in_zone_swing_miss: 0.2413
p_swinging_strike: -0.1948
xba: -0.0000
z_swing_miss_percent: -0.0000
