In [None]:
pip install chart_studio


In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.model_selection import cross_validate, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Number of top-ranked features (by random-forest importance) to keep.
TOP_N_FEATURES = 15

# Load the dataset
df = pd.read_csv('/content/College to NBA.csv')

# Predictors: every float64/int64 column except the ones listed here.
# The target ('VORP/48') is part of the exclusion list, so it never ends up in X.
predictors = df.select_dtypes(include=['float64', 'int64']).drop(['Draft Year', 'Pk', 'WS/48', 'VORP', 'VORP/48', 'BPM', 'WS'], axis=1).columns
X = df[predictors]
y = df['VORP/48']

# Full-data importance ranking. This is used for REPORTING only: the
# cross-validation below re-selects features inside every training fold.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X, y)
feature_importances = pd.Series(rf.feature_importances_, index=predictors).sort_values(ascending=False)
top_n_features = feature_importances.head(TOP_N_FEATURES).index.tolist()

# Full-data versions of the selected / scaled matrix. They are kept so the
# names this cell has always defined remain available; they are NOT used for
# scoring below, because they were fitted on rows that later serve as test folds.
X_selected = df[top_n_features]
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X_selected)

# Define a K-Fold cross-validator with shuffling
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define scorers for MSE and R^2.
# greater_is_better=False makes the MSE scorer return negated values,
# which is why the averages below flip the sign back.
scorers = {
    'MSE': make_scorer(mean_squared_error, greater_is_better=False),
    'r2': make_scorer(r2_score)
}

# Feature selection and scaling live inside the pipeline so that each fold
# fits them on its training rows only. Selecting features / fitting the scaler
# on the whole dataset first would leak test-fold information into the scores.
model = Pipeline([
    ('select', SelectFromModel(
        RandomForestRegressor(n_estimators=100, random_state=42),
        threshold=-np.inf,  # rank purely by importance; keep exactly max_features columns
        max_features=min(TOP_N_FEATURES, X.shape[1]),  # never ask for more columns than exist
    )),
    ('scale', MinMaxScaler()),
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
])

# Run cross-validation on the raw predictors; the pipeline does the rest per fold
cv_results = cross_validate(model, X, y, cv=kf, scoring=scorers, return_train_score=True)

# Calculate average scores (RMSE is the square root of the fold-averaged MSE)
average_train_mse = -np.mean(cv_results['train_MSE'])
average_test_mse = -np.mean(cv_results['test_MSE'])
average_train_rmse = np.sqrt(average_train_mse)
average_test_rmse = np.sqrt(average_test_mse)
average_train_r2 = np.mean(cv_results['train_r2'])
average_test_r2 = np.mean(cv_results['test_r2'])

# Print results
print(f"Selected Features: {top_n_features}")
print(f"Average Training MSE: {average_train_mse}")
print(f"Average Training RMSE: {average_train_rmse}")
print(f"Average Training R^2: {average_train_r2}")
print(f"Average Testing MSE: {average_test_mse}")
print(f"Average Testing RMSE: {average_test_rmse}")
print(f"Average Testing R^2: {average_test_r2}")

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate, KFold
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import numpy as np

def run_model(dataset_path, target='WS/48'):
    """Cross-validate a default random forest on one dataset and return averaged metrics.

    The CSV is read with pandas; predictors are all float64/int64 columns except
    the ones in ``excluded`` (and ``target`` itself). Missing values are
    mean-imputed inside each fold, then ``RandomForestRegressor(random_state=42)``
    is scored with shuffled 5-fold cross-validation.

    Args:
        dataset_path: Path handed to ``pd.read_csv``.
        target: Name of the column to predict. Defaults to ``'WS/48'``.

    Returns:
        dict with keys ``train_mse``, ``train_rmse``, ``train_r2``, ``test_mse``,
        ``test_rmse`` and ``test_r2``. MSE and R^2 are means over the folds;
        RMSE is the square root of the fold-averaged MSE.

    Raises:
        KeyError: If an excluded column is absent from the numeric columns, or
            ``target`` is not a column of the dataset.
    """
    # Local import so this function does not depend on an edit to the import cell.
    from sklearn.pipeline import make_pipeline

    df = pd.read_csv(dataset_path)

    excluded = ['Draft Year', 'Pk', 'WS/48', 'VORP', 'VORP/48', 'BPM', 'WS']
    if target not in excluded:
        # A custom target must never be used as its own predictor.
        excluded.append(target)
    predictors = df.select_dtypes(include=['float64', 'int64']).drop(excluded, axis=1).columns
    X = df[predictors]
    y = df[target]

    # The imputer is part of the estimator so each fold learns its column means
    # from training rows only; fitting it on all rows first leaks test-fold data.
    model = make_pipeline(SimpleImputer(strategy='mean'), RandomForestRegressor(random_state=42))

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scorers = {
        # greater_is_better=False -> scores come back negated; sign is restored below.
        'MSE': make_scorer(mean_squared_error, greater_is_better=False),
        'r2': 'r2'
    }

    cv_results = cross_validate(model, X, y, cv=kf, scoring=scorers, return_train_score=True)

    train_mse = -cv_results['train_MSE'].mean()
    test_mse = -cv_results['test_MSE'].mean()

    return {
        'train_mse': train_mse,
        'train_rmse': np.sqrt(train_mse),
        'train_r2': cv_results['train_r2'].mean(),
        'test_mse': test_mse,
        'test_rmse': np.sqrt(test_mse),
        'test_r2': cv_results['test_r2'].mean()
    }

# Datasets to evaluate; metrics are reported in this order
dataset_paths = [
    '/content/College to NBA.csv',
    '/content/Quantitative Stats (College, NBA, Combine).csv',
    '/content/Full Dataset (Imputed Values).csv',
]
metrics = list(map(run_model, dataset_paths))

# One summary line per dataset, numbered from 1
for i, metric in enumerate(metrics, start=1):
    print(f"Metrics for Dataset {i}: {metric}")


In [None]:
## FINAL MODEL

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_validate, RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import numpy as np

def run_model(dataset_path, target='VORP/48'):
    """Tune a random forest with randomized search, then cross-validate the best model.

    The CSV is read with pandas; predictors are all float64/int64 columns except
    the ones in ``excluded`` (and ``target`` itself). Mean imputation and the
    forest form one pipeline, so imputation statistics are learned from training
    rows only during both tuning and scoring. Train/test MSE and RMSE are printed
    as a side effect.

    Note: the same KFold splitter is used for the hyperparameter search and for
    the final ``cross_validate`` call, so the reported test scores come from
    folds that also influenced the choice of hyperparameters.

    Args:
        dataset_path: Path handed to ``pd.read_csv``.
        target: Name of the column to predict. Defaults to ``'VORP/48'``.

    Returns:
        dict with keys ``best_params`` (forest hyperparameters chosen by the
        search, keyed by their plain RandomForestRegressor names), ``train_mse``,
        ``train_rmse``, ``train_r2``, ``test_mse``, ``test_rmse`` and ``test_r2``.
        MSE and R^2 are means over the folds; RMSE is the square root of the
        fold-averaged MSE.

    Raises:
        KeyError: If an excluded column is absent from the numeric columns, or
            ``target`` is not a column of the dataset.
    """
    # Local import so this function does not depend on an edit to the import cell.
    from sklearn.pipeline import Pipeline

    # Load the dataset
    df = pd.read_csv(dataset_path)

    # Define predictors and the target variable, excluding specific columns
    excluded = ['Draft Year', 'Pk', 'WS/48', 'VORP', 'VORP/48', 'BPM', 'WS']
    if target not in excluded:
        # A custom target must never be used as its own predictor.
        excluded.append(target)
    predictors = df.select_dtypes(include=['float64', 'int64']).drop(excluded, axis=1)
    X = df[predictors.columns]
    y = df[target]

    # Imputer + forest as one estimator: every CV split refits the imputer on
    # its own training rows instead of seeing column means from the test fold.
    model = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('rf', RandomForestRegressor(random_state=42)),
    ])

    # Define a K-Fold cross-validator
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    # Setup hyperparameter tuning ('rf__' routes each parameter to the forest step)
    param_dist = {
        'rf__n_estimators': [100, 200, 300],
        'rf__max_depth': [None, 10, 20, 30],
        'rf__min_samples_split': [2, 5, 10],
        'rf__min_samples_leaf': [1, 2, 4],
        # 'auto' is deprecated for max_features and rejected by newer scikit-learn
        # releases. For a regressor it meant "use all features", i.e. 1.0.
        # It must be the float 1.0: the int 1 would mean a single feature.
        'rf__max_features': [1.0, 'sqrt']
    }

    random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=10, cv=kf, scoring='r2', random_state=42)
    random_search.fit(X, y)

    # Best model selection
    best_model = random_search.best_estimator_
    # Strip the pipeline step prefix so callers keep seeing plain forest parameter names.
    best_params = {name.split('__', 1)[-1]: value for name, value in random_search.best_params_.items()}

    # Cross-validation with the best model
    cv_results = cross_validate(best_model, X, y, cv=kf,
                                scoring={'MSE': make_scorer(mean_squared_error, greater_is_better=False), 'r2': 'r2'},
                                return_train_score=True)

    # The MSE scorer returns negated values (greater_is_better=False); flip the sign back.
    train_mse = -cv_results['train_MSE'].mean()
    test_mse = -cv_results['test_MSE'].mean()
    train_rmse = np.sqrt(train_mse)
    test_rmse = np.sqrt(test_mse)

    print(f"Train MSE: {train_mse}, Train RMSE: {train_rmse}")
    print(f"Test MSE: {test_mse}, Test RMSE: {test_rmse}")

    return {
        'best_params': best_params,
        'train_mse': train_mse,
        'train_rmse': train_rmse,
        'train_r2': cv_results['train_r2'].mean(),
        'test_mse': test_mse,
        'test_rmse': test_rmse,
        'test_r2': cv_results['test_r2'].mean()
    }

# CSV files to run the tuned model on; edit these to point at your own datasets
dataset_paths = [
    '/content/College to NBA.csv',
    '/content/Quantitative Stats (College, NBA, Combine).csv',
    '/content/Full Dataset (Imputed Values).csv',
]
metrics = [run_model(csv_path) for csv_path in dataset_paths]

# Summarise the returned metrics, one numbered line per dataset
for i, metric in enumerate(metrics, start=1):
    print(f"Metrics for Dataset {i}: {metric}")


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Train MSE: 0.00030720137886763594, Train RMSE: 0.017527161175376803
Test MSE: 0.0008608705594380626, Test RMSE: 0.029340595758062967


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Train MSE: 0.00037113461202550185, Train RMSE: 0.019264854321419143
Test MSE: 0.0008012238692816743, Test RMSE: 0.02830589813593051


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Train MSE: 0.00026703100589368646, Train RMSE: 0.016341083375764485
Test MSE: 0.0007889253933885197, Test RMSE: 0.02808781574612949
Metrics for Dataset 1: {'best_params': {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 30}, 'train_mse': 0.00030720137886763594, 'train_rmse': 0.017527161175376803, 'train_r2': 0.6986148170633857, 'test_mse': 0.0008608705594380626, 'test_rmse': 0.029340595758062967, 'test_r2': 0.13946254935279248}
Metrics for Dataset 2: {'best_params': {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 30}, 'train_mse': 0.00037113461202550185, 'train_rmse': 0.019264854321419143, 'train_r2': 0.6357050514303316, 'test_mse': 0.0008012238692816743, 'test_rmse': 0.02830589813593051, 'test_r2': 0.1971390795243721}
Metrics for Dataset 3: {'best_params': {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 30}, 'trai