In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import matplotlib.pyplot as plt




In [3]:
# Load the dataset
df = pd.read_csv('Cleaned_CombinedBPI.csv', encoding='latin1')
# df.shape
df.shape

(3551, 24)

In [4]:
df.dropna().shape


(2807, 24)

In [5]:
# Show all the columns
df.columns

# df["Status"]

Index(['player_id', 'year_x', 'player_age', 'hit', 'single', 'double',
       'triple', 'home_run', 'strikeout', 'walk', 'k_percent', 'bb_percent',
       'batting_avg', 'Name', 'Team', 'Pos', 'Injury / Surgery Date',
       'Injury / Surgery', 'Status', 'IL Retro Date', 'Eligible to Return',
       'Return Date', 'Latest Update', 'year_y'],
      dtype='object')

In [6]:
# Fix the column year by deleting the decimal
import numpy as np

# Replace non-finite values with a default value (0) then convert to integer
df['year_y'] = df['year_y'].replace([np.inf, -np.inf, np.nan], 0)
df['year_y'] = df['year_y'].astype('int')
df.head()

Unnamed: 0,player_id,year_x,player_age,hit,single,double,triple,home_run,strikeout,walk,...,Team,Pos,Injury / Surgery Date,Injury / Surgery,Status,IL Retro Date,Eligible to Return,Return Date,Latest Update,year_y
0,408234,2020,37,51,37,4,0,10,51,24,...,DET,1B,9/2/2022,Strained biceps,Activated,9/3/2022,9/13/2022,9/19/2022,Activated,2022
1,408234,2020,37,51,37,4,0,10,51,24,...,DET,1B,4/10/2021,Strained biceps,Activated,4/11/2021,4/21/2021,4/25/2021,Activated,2021
2,443558,2020,39,56,34,6,0,16,58,25,...,,,,,,,,,,0
3,444482,2020,32,61,45,10,1,5,45,13,...,,,,,,,,,,0
4,446334,2020,34,49,31,10,1,7,39,11,...,ARI,3B,7/25/2023,Strained lower back,Activated,7/26/2023,8/5/2023,8/21/2023,Activated,2023


In [7]:
# Create a column by taking the difference between the Return Date and the Eligible to Return
df['Days to Return'] = pd.to_datetime(df['Return Date']) - pd.to_datetime(df['Eligible to Return'])
df['Days to Return'] = df['Days to Return'].dt.days
df

Unnamed: 0,player_id,year_x,player_age,hit,single,double,triple,home_run,strikeout,walk,...,Pos,Injury / Surgery Date,Injury / Surgery,Status,IL Retro Date,Eligible to Return,Return Date,Latest Update,year_y,Days to Return
0,408234,2020,37,51,37,4,0,10,51,24,...,1B,9/2/2022,Strained biceps,Activated,9/3/2022,9/13/2022,9/19/2022,Activated,2022,6.0
1,408234,2020,37,51,37,4,0,10,51,24,...,1B,4/10/2021,Strained biceps,Activated,4/11/2021,4/21/2021,4/25/2021,Activated,2021,4.0
2,443558,2020,39,56,34,6,0,16,58,25,...,,,,,,,,,0,
3,444482,2020,32,61,45,10,1,5,45,13,...,,,,,,,,,0,
4,446334,2020,34,49,31,10,1,7,39,11,...,3B,7/25/2023,Strained lower back,Activated,7/26/2023,8/5/2023,8/21/2023,Activated,2023,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3546,686752,2024,26,83,52,14,2,15,114,38,...,SP,3/28/2023,Strained oblique,Activated,3/29/2023,5/28/2023,8/2/2023,Activated,2023,66.0
3547,687765,2024,26,129,79,31,3,16,105,41,...,,,,,,,,,0,
3548,690986,2024,22,118,76,24,1,17,115,41,...,SP,6/16/2024,Sprained ankle,Activated,6/13/2024,6/28/2024,7/6/2024,Activated,2024,8.0
3549,694297,2024,25,157,107,25,6,19,155,34,...,,,,,,,,,0,


In [8]:
# Drop the year_x
df = df.drop('year_x', axis=1)
df.head()

Unnamed: 0,player_id,player_age,hit,single,double,triple,home_run,strikeout,walk,k_percent,...,Pos,Injury / Surgery Date,Injury / Surgery,Status,IL Retro Date,Eligible to Return,Return Date,Latest Update,year_y,Days to Return
0,408234,37,51,37,4,0,10,51,24,22.1,...,1B,9/2/2022,Strained biceps,Activated,9/3/2022,9/13/2022,9/19/2022,Activated,2022,6.0
1,408234,37,51,37,4,0,10,51,24,22.1,...,1B,4/10/2021,Strained biceps,Activated,4/11/2021,4/21/2021,4/25/2021,Activated,2021,4.0
2,443558,39,56,34,6,0,16,58,25,27.1,...,,,,,,,,,0,
3,444482,32,61,45,10,1,5,45,13,20.6,...,,,,,,,,,0,
4,446334,34,49,31,10,1,7,39,11,18.7,...,3B,7/25/2023,Strained lower back,Activated,7/26/2023,8/5/2023,8/21/2023,Activated,2023,16.0


In [9]:
# Find the types of injuries
df['Injury / Surgery'].value_counts()

Injury / Surgery
Strained hamstring                    312
Strained oblique                      211
COVID-19                              175
Strained groin                        125
Tommy John surgery                    122
                                     ... 
Core surgery; Microdiscectomy           1
Fractured finger (left index)           1
Anxiety                                 1
Strained achilles (right and left)      1
Knee infection                          1
Name: count, Length: 380, dtype: int64

In [10]:
# Define the unique values in each column
unique_values = df.nunique()
unique_values


player_id                528
player_age                24
hit                      165
single                   130
double                    50
triple                    14
home_run                  51
strikeout                206
walk                     100
k_percent                248
bb_percent               142
batting_avg              156
Name                     529
Team                      30
Pos                       21
Injury / Surgery Date    735
Injury / Surgery         380
Status                     9
IL Retro Date            657
Eligible to Return       675
Return Date              581
Latest Update             30
year_y                     6
Days to Return           112
dtype: int64

In [11]:
# Find the missing values
missing_values = df.isnull().sum()
missing_values

player_id                  0
player_age                 0
hit                        0
single                     0
double                     0
triple                     0
home_run                   0
strikeout                  0
walk                       0
k_percent                  0
bb_percent                 0
batting_avg                0
Name                       0
Team                     180
Pos                      180
Injury / Surgery Date    180
Injury / Surgery         180
Status                   208
IL Retro Date            180
Eligible to Return       256
Return Date              651
Latest Update            180
year_y                     0
Days to Return           727
dtype: int64

In [12]:
# Fill the missing values with 0
df = df.fillna(0)
df

Unnamed: 0,player_id,player_age,hit,single,double,triple,home_run,strikeout,walk,k_percent,...,Pos,Injury / Surgery Date,Injury / Surgery,Status,IL Retro Date,Eligible to Return,Return Date,Latest Update,year_y,Days to Return
0,408234,37,51,37,4,0,10,51,24,22.1,...,1B,9/2/2022,Strained biceps,Activated,9/3/2022,9/13/2022,9/19/2022,Activated,2022,6.0
1,408234,37,51,37,4,0,10,51,24,22.1,...,1B,4/10/2021,Strained biceps,Activated,4/11/2021,4/21/2021,4/25/2021,Activated,2021,4.0
2,443558,39,56,34,6,0,16,58,25,27.1,...,0,0,0,0,0,0,0,0,0,0.0
3,444482,32,61,45,10,1,5,45,13,20.6,...,0,0,0,0,0,0,0,0,0,0.0
4,446334,34,49,31,10,1,7,39,11,18.7,...,3B,7/25/2023,Strained lower back,Activated,7/26/2023,8/5/2023,8/21/2023,Activated,2023,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3546,686752,26,83,52,14,2,15,114,38,25.8,...,SP,3/28/2023,Strained oblique,Activated,3/29/2023,5/28/2023,8/2/2023,Activated,2023,66.0
3547,687765,26,129,79,31,3,16,105,41,19.6,...,0,0,0,0,0,0,0,0,0,0.0
3548,690986,22,118,76,24,1,17,115,41,22.3,...,SP,6/16/2024,Sprained ankle,Activated,6/13/2024,6/28/2024,7/6/2024,Activated,2024,8.0
3549,694297,25,157,107,25,6,19,155,34,23.3,...,0,0,0,0,0,0,0,0,0,0.0


In [13]:
# Print the types for each column in the data
df.dtypes



player_id                  int64
player_age                 int64
hit                        int64
single                     int64
double                     int64
triple                     int64
home_run                   int64
strikeout                  int64
walk                       int64
k_percent                float64
bb_percent               float64
batting_avg              float64
Name                      object
Team                      object
Pos                       object
Injury / Surgery Date     object
Injury / Surgery          object
Status                    object
IL Retro Date             object
Eligible to Return        object
Return Date               object
Latest Update             object
year_y                     int64
Days to Return           float64
dtype: object

In [15]:
from sklearn.preprocessing import OneHotEncoder
# Use lambda function to filter categorical columns (object data types)
categorical_cols = df.columns[df.dtypes == 'object'].tolist()

# Convert all categorical columns to strings to avoid mixed types
df[categorical_cols] = df[categorical_cols].astype(str)

# One-Hot Encoding
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_features = encoder.fit_transform(df[categorical_cols])

# Convert the encoded features back to a DataFrame
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_cols))

# Drop the original categorical columns from the main DataFrame
df_encoded = df.drop(columns=categorical_cols)

# Concatenate the original DataFrame (with categorical columns dropped) with the encoded features
final_df = pd.concat([df_encoded, encoded_df], axis=1)

In [16]:
# Define the target column and split the dataset into features (X) and target (y)
target_column = 'Days to Return'  # Assuming this is your target column
X = final_df.drop(columns=target_column)
y = final_df[target_column]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [17]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
# Train a Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [18]:
 # Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")

Model Accuracy: 0.8171589310829818


In [21]:
# Evaluate the Model: Calculate the model's performance using a confusion matrix and classification report:

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))



[[0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 3 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 2]]
              precision    recall  f1-score   support

      -330.0       0.00      0.00      0.00         1
       -36.0       1.00      1.00      1.00         1
       -35.0       0.00      0.00      0.00         0
       -30.0       1.00      1.00      1.00         1
       -18.0       0.00      0.00      0.00         1
       -16.0       0.00      0.00      0.00         1
       -10.0       1.00      1.00      1.00         1
        -9.0       1.00      1.00      1.00         1
        -8.0       1.00      1.00      1.00         1
        -7.0       0.67      1.00      0.80         2
        -6.0       1.00      0.50      0.67         2
        -4.0       0.00      0.00      0.00         1
        -3.0       1.00      1.00      1.00         1
         0.0       0.77      0.95      0.85       238
         1.0       0.68      0.98      0.80        51
         2.0   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
pd.DataFrame({
    "actual":y_test,
    "predictions": y_pred,
    "error":y_pred - y_test
}).head(3)

Unnamed: 0,actual,predictions,error
2665,11.0,11.0,0.0
2971,0.0,0.0,0.0
709,7.0,0.0,-7.0


In [None]:
# scalers = {
#     "StandardScaler": StandardScaler(),
#     "MinMaxScaler": MinMaxScaler()
}

In [None]:
# random_forest_param_grid = {
#     "randomforest__n_estimators": [100, 200, 300],
#     "randomforest__max_features": ["auto", "sqrt", "log2"],
#     "randomforest__max_depth": [10,20,30],
#     "randomforest__min_samples_split":[2,4,10],
#     "randomforest__min_samples_leaf": [1,2,4]
}

In [None]:
# best_models={}
# from sklearn.pipeline import Pipeline
# for scaler_name, scaler in scalers.items():
    
#     pipeline = Pipeline(steps=[(scaler_name, scaler), ("randomforest", RandomForestClassifier())])

#     grid_search = GridSearchCV(pipeline, random_forest_param_grid, cv = 5, n_jobs=-1, verbose = 1)

#     grid_search.fit(X_train, y_train)

#     best_models[scaler_name] =  {
#         "best_estimator": grid_search.best_estimator_,
#         "best_score": grid_search.best_score_,
#         "best_params": grid_search.best_params_
#     }

In [24]:
# for scaler_name, info in best_models.items():
#     print(f"scaler: {scaler_name}")
#     print(f"best cv score: {info['best_score']}")
#     print(f"best params: {info['best_params']}")
#     print(f"*"*50)