# Project 3 - Machine Learning

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [7]:
# Load in training, validation, and testing dataframes
train = pd.read_csv('/Users/omar/Desktop/DSCI 372M/Projects/project3/injury_train.csv')
val = pd.read_csv('/Users/omar/Desktop/DSCI 372M/Projects/project3/injury_val.csv')
test = pd.read_csv('/Users/omar/Desktop/DSCI 372M/Projects/project3/injury_test.csv')

## Logistic Classification:

For logistic classification, I have chosen the target variable to predict a player's game availability ('out' or 'not out'). For the sake of this model, I will classify 'Out' and 'Doubtful' as a player being out, game availability wise, as a player very rarley ends up playing when listed as doubtful.

In [11]:
# Function 'is_out' that adds the binary target column (is_out) and returns the resulting dataframe
def is_out(df):
    df['is_out'] = df['report_status'].apply(lambda x: 1 if x in ['Out', 'Doubtful'] else 0) # 0 = playing, 1 = out (not playing)
    return df

# Apply 'is_out' function to train_df, val_df, and test_df
train_df = is_out(train)
val_df = is_out(val)
test_df = is_out(test)

# Separate features (X) and target (y)
X_train = train_df.drop(['is_out', 'report_status', 'gsis_id', 'player_name', 'date_modified'], axis=1)
y_train = train_df['is_out']

X_val = val_df.drop(['is_out', 'report_status', 'gsis_id', 'player_name', 'date_modified'], axis=1)
y_val = val_df['is_out']

X_test = test_df.drop(['is_out', 'report_status', 'gsis_id', 'player_name', 'date_modified'], axis=1)
y_test = test_df['is_out']

# Separate categorical features
categorical = ['position', 'team', 'game_type', 'report_injury', 'practice_status']

X_train_cat = train_df[categorical]
X_val_cat = val_df[categorical]
X_test_cat = test_df[categorical]

# Initialize OneHotEncoder
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit and transform training data 
ohe.fit(X_train_cat)
X_train_encoded = ohe.transform(X_train_cat)

# Transform validation and test data
X_val_encoded = ohe.transform(X_val_cat)
X_test_encoded = ohe.transform(X_test_cat)

# Get feature names and convert to dataframes
encoded_cols = ohe.get_feature_names_out(categorical)

X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_cols)
X_val_encoded_df = pd.DataFrame(X_val_encoded, columns=encoded_cols)
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_cols)

# Separate numerical features
num_cols = ['week', 'year']
X_train_num = train_df[num_cols].reset_index(drop=True)
X_val_num = val_df[num_cols].reset_index(drop=True)
X_test_num = test_df[num_cols].reset_index(drop=True)

# Initialize StandardScaler and scale numerical features
scaler = StandardScaler()
X_train_num_df = pd.DataFrame(scaler.fit_transform(X_train_num), columns=num_cols)
X_val_num_df = pd.DataFrame(scaler.transform(X_val_num), columns=num_cols)
X_test_num_df = pd.DataFrame(scaler.transform(X_test_num), columns=num_cols)

# Combine (concatenate) scaled numerical and encoded categorical features
X_train_processed = pd.concat([X_train_num_df, X_train_encoded_df], axis=1)
X_val_processed = pd.concat([X_val_num_df, X_val_encoded_df], axis=1)
X_test_processed = pd.concat([X_test_num_df, X_test_encoded_df], axis=1)

# Train logistic regression 
model = LogisticRegression(max_iter=1000)
model.fit(X_train_processed, y_train)

# Final evaluation on all sets
y_pred_train = model.predict(X_train_processed)
y_pred_val = model.predict(X_val_processed)
y_pred_test = model.predict(X_test_processed)

train_accuracy = accuracy_score(y_train, y_pred_train)
val_accuracy = accuracy_score(y_val, y_pred_val)
test_accuracy = accuracy_score(y_test, y_pred_test)

print("Logistic Classification Prediction Accuracies:")
print(f"  Training Accuracy: {train_accuracy}")
print(f"  Validation Accuracy: {val_accuracy}")
print(f"  Test Accuracy: {test_accuracy}")

Logistic Classification Prediction Accuracies:
  Training Accuracy: 0.9220330841514306
  Validation Accuracy: 0.8726534043149341
  Test Accuracy: 0.8670495937237321


## Support Vector Machine:

For support vector machines, I will utilize the already processed training, validation, and test sets from the logistic classification section of the project (X_train_processed, X_val_processed, X_test_processed, y_train, y_val, y_test). The two kernel tricks I have chosen to use for this are a Radial Basis Function (RBF) kernel, as well as a polynomial kernel. I will also use GridSearchCV in order to tune the hyperparameters simultaneously.

In [12]:
# 1. SVM with RBF Kernel ----------------------------------------------------------------------------------------

# Define hyperparameter grid for GridSearchCV
rbf_param_grid = {'C': [0.1, 1, 10], 'gamma': [0.001, 0.1]}

# GridSearchCV with SVC (RBF kernel) and the hyperparameter grid (2-fold cross-validation)
rbf_grid = GridSearchCV(SVC(kernel='rbf'), rbf_param_grid, cv=2)

# Train GridSearchCV on training data
rbf_grid.fit(X_train_processed, y_train)

# Get best estimator found by GridSearchCV
rbf_best_svm = rbf_grid.best_estimator_

# Evaluation on all sets
rbf_y_pred_train = rbf_best_svm.predict(X_train_processed)
rbf_y_pred_val = rbf_best_svm.predict(X_val_processed)
rbf_y_pred_test = rbf_best_svm.predict(X_test_processed)

print("RBF Kernel Prediction Accuracies:")
print(f"  Training Accuracy: {accuracy_score(y_train, rbf_y_pred_train)}")
print(f"  Validation Accuracy: {accuracy_score(y_val, rbf_y_pred_val)}")
print(f"  Test Accuracy: {accuracy_score(y_test, rbf_y_pred_test)}")
print(f"  Best Parameters: {rbf_grid.best_params_}")

RBF Kernel Prediction Accuracies:
  Training Accuracy: 0.9214026239154582
  Validation Accuracy: 0.8730736901092743
  Test Accuracy: 0.8711123564023536
  Best Parameters: {'C': 10, 'gamma': 0.001}


In [15]:
# 2. SVM with Polynomial Kernel ------------------------------------------------------------------------------------

# Define hyperparameter grid for GridSearchCV
poly_param_grid = {'C': [0.1, 1, 10], 'degree': [2, 3]}

# GridSearchCV with SVC (polynomial kernel) and the hyperparameter grid (2-fold cross-validation)
poly_grid = GridSearchCV(SVC(kernel='poly'), poly_param_grid, cv=2)

# Train GridSearchCV on training data
poly_grid.fit(X_train_processed, y_train)

# Get best estimator found by GridSearchCV
poly_best_svm = poly_grid.best_estimator_

# Evaluation on all sets
poly_y_pred_train = poly_best_svm.predict(X_train_processed)
poly_y_pred_val = poly_best_svm.predict(X_val_processed)
poly_y_pred_test = poly_best_svm.predict(X_test_processed)

print("Polynomial Kernel Prediction Accuracies:")
print(f"  Training Accuracy: {accuracy_score(y_train, poly_y_pred_train)}")
print(f"  Validation Accuracy: {accuracy_score(y_val, poly_y_pred_val)}")
print(f"  Test Accuracy: {accuracy_score(y_test, poly_y_pred_test)}")
print(f"  Best Parameters: {poly_grid.best_params_}")

Polynomial Kernel Prediction Accuracies:
  Training Accuracy: 0.9213726019994596
  Validation Accuracy: 0.8711123564023536
  Test Accuracy: 0.8607453068086298
  Best Parameters: {'C': 0.1, 'degree': 2}


## Decision Trees:

For decision trees, I will utilize the already processed training, validation, and test sets from the logistic classification and SVM sections of the project (X_train_processed, X_val_processed, X_test_processed, y_train, y_val, y_test). I will also use GridSearchCV for hyperparameter tuning.

In [32]:
# Define hyperparameter grid for GridSearchCV
dt_param_grid = {
    'max_depth': [3, 5, 7, 10, None],  # max_depth values
    'min_samples_split': [2, 5, 10],  # min_samples_split values
    'min_samples_leaf': [1, 2, 4]  # min_samples_leaf values
}

# GridSearchCV with DecisionTreeClassifier and the hyperparameter grid (3-fold cross-validation)
dt_grid = GridSearchCV(DecisionTreeClassifier(), dt_param_grid, cv=3)

# Train decision tree model on training data
dt_grid.fit(X_train_processed, y_train)

# Get best estimator found by GridSearchCV
best_dt = dt_grid.best_estimator_

# Evaluation on all sets
dt_y_pred_train = dt.predict(X_train_processed)
dt_y_pred_val = dt.predict(X_val_processed)
dt_y_pred_test = dt.predict(X_test_processed)

print("Decision Tree Prediction Accuracies:")
print(f"  Training Accuracy: {accuracy_score(y_train, dt_y_pred_train)}")
print(f"  Validation Accuracy: {accuracy_score(y_val, dt_y_pred_val)}")
print(f"  Test Accuracy: {accuracy_score(y_test, dt_y_pred_test)}")
print(f"  Best Parameters: {dt_grid.best_params_}")

Decision Tree Prediction Accuracies:
  Training Accuracy: 0.9982287069560779
  Validation Accuracy: 0.7975623423928271
  Test Accuracy: 0.7922387223311852
  Best Parameters: {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}


## Random Forests:

For random forests, I will utilize the already processed training, validation, and test sets from the logistic classification, SVM, and decision tree sections of the project (X_train_processed, X_val_processed, X_test_processed, y_train, y_val, y_test). I will also use GridSearchCV for hyperparameter tuning.

In [44]:
# Define hyperparameter grid for GridSearchCV
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# GridSearchCV with RandomForestsClassifier and the hyperparameter grid (3-fold cross-validation)
rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=3)

# Train decision tree model on training data
rf_grid.fit(X_train_processed, y_train)

# Get best estimator found by GridSearchCV
best_rf = rf_grid.best_estimator_

# Evaluation on all sets
rf_y_pred_train = best_rf.predict(X_train_processed)
rf_y_pred_val = best_rf.predict(X_val_processed)
rf_y_pred_test = best_rf.predict(X_test_processed)

print("Random Forest Prediction Accuracies:")
print(f"  Training Accuracy: {accuracy_score(y_train, rf_y_pred_train)}")
print(f"  Validation Accuracy: {accuracy_score(y_val, rf_y_pred_val)}")
print(f"  Test Accuracy: {accuracy_score(y_test, rf_y_pred_test)}")
print(f"  Best Parameters: {rf_grid.best_params_}")

Random Forest Prediction Accuracies:
  Training Accuracy: 0.9229937854633883
  Validation Accuracy: 0.8645278789576912
  Test Accuracy: 0.8613056878677501
  Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


## Neural Networks:

For decision trees, I will utilize the already processed training, validation, and test sets from the logistic classification and SVM sections of the project (X_train_processed, X_val_processed, X_test_processed, y_train, y_val, y_test). I have chosen to use scikit-learn's MLPClassifier (Multi-Layer Perceptron Classifier) for this task.

In [14]:
# Define random seeds
random_seeds = [42, 123, 7, 99, 1000]
num_seeds = len(random_seeds)

# Initialize lists to store accuracies for each set
train_accuracies = []
val_accuracies = []
test_accuracies = []

# Training/Evaluation Loop
for seed in random_seeds:
    # MLPClassifier with specific random seed
    mlp = MLPClassifier(random_state=seed, max_iter=1000)

    # Train MLPClassifier on processed training data
    mlp.fit(X_train_processed, y_train)

    # Evaluation on all sets
    y_pred_train = mlp.predict(X_train_processed)
    y_pred_val = mlp.predict(X_val_processed)
    y_pred_test = mlp.predict(X_test_processed)

    # Calculate accuracy scores and append to respective lists
    train_accuracies.append(accuracy_score(y_train, y_pred_train))
    val_accuracies.append(accuracy_score(y_val, y_pred_val))
    test_accuracies.append(accuracy_score(y_test, y_pred_test))

# Average accuracies across all random seeds
avg_train_accuracy = np.mean(train_accuracies)
avg_val_accuracy = np.mean(val_accuracies)
avg_test_accuracy = np.mean(test_accuracies)

print("Neural Network Prediction Accuracies (Averaged over seeds):")
print(f"  Average Training Accuracy: {avg_train_accuracy}")
print(f"  Average Validation Accuracy: {avg_val_accuracy}")
print(f"  Average Test Accuracy: {avg_test_accuracy}")

Neural Network Prediction Accuracies (Averaged over seeds):
  Average Training Accuracy: 0.9772914227385993
  Average Validation Accuracy: 0.7966096945923227
  Average Test Accuracy: 0.755505743905856
