In [None]:
## FINAL MODEL

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import Lasso, RidgeCV
from sklearn.metrics import make_scorer, r2_score, mean_squared_error

# Read the full dataset from disk into a DataFrame.
file_path = '/content/Full Dataset (Imputed Values).csv'
data = pd.read_csv(file_path)

# Feature set: every numeric *column* except the ones listed below.
# 'BPM' is the regression target used further down; it is excluded here
# together with the other listed columns so none of them is used as a predictor.
features_to_exclude = ['WS', 'WS/48', 'BPM', 'VORP', 'VORP/48']
numerical_features = data.select_dtypes(include=['number']).columns
selected_features = [
    column for column in numerical_features
    if column not in features_to_exclude
]

# 80/20 train/test split with a fixed seed so the split is reproducible.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Medians are computed on the training split only, so that filling the test
# split does not use any statistics derived from test rows.
X_train_median = train_data[selected_features].median()
y_train_median = train_data['BPM'].median()

# Fill missing feature values in both splits with the training medians.
X_train_selected, X_test_selected = (
    split[selected_features].fillna(X_train_median)
    for split in (train_data, test_data)
)

# NOTE: despite the `_vorp` suffix, these series hold the 'BPM' column.
# NOTE(review): missing target values are replaced by the training median
# rather than dropped; confirm this is intended, since filled targets are
# not real observations.
y_train_vorp, y_test_vorp = (
    split['BPM'].fillna(y_train_median)
    for split in (train_data, test_data)
)

# Function to perform cross-validation and calculate average evaluation metrics for training and testing
def cross_validate_model(model, X, y, cv=5):
    """Cross-validate *model* and summarise its fold-averaged scores.

    Runs ``cross_validate`` with negative-MSE and R2 scoring, requesting
    scores on both the fitting folds ("train") and the held-out folds
    ("test"), then averages each score across folds.

    Args:
        model: Estimator passed straight through to ``cross_validate``.
        X: Feature matrix.
        y: Target values.
        cv: Cross-validation setting forwarded to ``cross_validate``
            (defaults to 5).

    Returns:
        A 6-tuple ``(train_mse, train_rmse, train_r2,
        test_mse, test_rmse, test_r2)``. Each RMSE is the square root of
        the corresponding fold-averaged MSE.
    """
    fold_scores = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring={'mse': 'neg_mean_squared_error', 'r2': 'r2'},
        return_train_score=True,
    )

    def _fold_mean(key):
        # Average one score array across all folds.
        return np.mean(fold_scores[key])

    # The scorer reports *negative* MSE, so flip the sign back.
    train_mse = -_fold_mean('train_mse')
    test_mse = -_fold_mean('test_mse')

    return (
        train_mse,
        np.sqrt(train_mse),
        _fold_mean('train_r2'),
        test_mse,
        np.sqrt(test_mse),
        _fold_mean('test_r2'),
    )

# Models under comparison.
# Lasso uses a fixed regularisation strength; RidgeCV picks its own alpha
# from a 13-point log-spaced grid (1e-6 ... 1e6) via an internal 5-fold search.
lasso_model_vorp = Lasso(alpha=0.5, max_iter=10000)
alphas = np.logspace(start=-6, stop=6, num=13)
ridge_cv_vorp = RidgeCV(
    alphas=alphas,
    cv=5,
    scoring='neg_mean_squared_error',
)

# Evaluate both models on the training split only.
# NOTE: the "Test" figures printed below come from the held-out folds of the
# cross-validation over the training split, not from the separate test split.
print("Cross-validating Lasso Model...")
lasso_metrics = cross_validate_model(
    lasso_model_vorp,
    X=X_train_selected,
    y=y_train_vorp,
)
print("Lasso Metrics (Train MSE, Train RMSE, Train R2, Test MSE, Test RMSE, Test R2):", lasso_metrics)

print("\nCross-validating Ridge Model...")
ridge_metrics = cross_validate_model(
    ridge_cv_vorp,
    X=X_train_selected,
    y=y_train_vorp,
)
print("Ridge Metrics (Train MSE, Train RMSE, Train R2, Test MSE, Test RMSE, Test R2):", ridge_metrics)


Cross-validating Lasso Model...
Lasso Metrics (Train MSE, Train RMSE, Train R2, Test MSE, Test RMSE, Test R2): (7.576934438902299, 2.7526231923207902, 0.1818563638926438, 8.175323338961931, 2.859252234232218, 0.05720288751396807)

Cross-validating Ridge Model...
Ridge Metrics (Train MSE, Train RMSE, Train R2, Test MSE, Test RMSE, Test R2): (6.497136280686581, 2.5489480733601813, 0.3039753726671358, 8.175374794890025, 2.859261232362308, 0.05598324866468085)
