In [None]:
pip install tensorflow

In [None]:
## GOOD RESULTS WITH GENERALIZATION AND L2 REGULARIZATION
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

# Read the player table from disk.
data = pd.read_csv('/content/Full Dataset (Imputed Values, -1s removed).csv')  # Update the path as necessary

# Keep only rows whose 'Draft Year' lies in 2017-2023 (both endpoints included).
# To train on every row instead, assign `data` to `data_filtered` directly.
in_draft_window = data['Draft Year'].between(2017, 2023)
data_filtered = data[in_draft_window]

# Only numeric columns take part in modelling; everything else is discarded here.
numerical_data_filtered = data_filtered.select_dtypes(include=['number'])
numeric_columns = numerical_data_filtered.columns

# Replace any remaining missing values with the mean of their column.
imputer = SimpleImputer(strategy='mean')
numerical_data_imputed = imputer.fit_transform(numerical_data_filtered)
numerical_data_imputed_df = pd.DataFrame(numerical_data_imputed, columns=numeric_columns)

# Standardize every numeric column, the target included, so the error metrics
# reported later are expressed in standard-deviation units of the target.
# NOTE(review): the imputer and scaler are fitted on all rows before any
# cross-validation split is made, so held-out rows influence these statistics.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numerical_data_imputed_df)
scaled_data_df = pd.DataFrame(scaled_data, columns=numeric_columns)

# Features are every scaled column except the outcome statistics listed below;
# the regression target is 'WS/48'.
# NOTE(review): 'Draft Year' is not in the drop list, so if it is numeric it
# remains a model feature - confirm whether that is intended.
outcome_columns = ['WS', 'WS/48', 'BPM', 'VORP/48']
features = scaled_data_df.drop(columns=outcome_columns)
targets_ws48 = scaled_data_df['WS/48']

def create_model(input_shape, dropout_rate=0.5, l2_lambda=0.01, learning_rate=0.001):
    """Build and compile a small fully connected regression network.

    The network is Dense(64, relu) -> Dropout -> Dense(32, relu) -> Dropout ->
    Dense(1), with an L2 penalty on the kernels of both hidden layers, compiled
    with the Adam optimizer and a mean-squared-error loss.

    Args:
        input_shape: Number of input features (the width of one sample).
        dropout_rate: Dropout rate applied after each hidden layer.
        l2_lambda: L2 regularization factor for the hidden-layer kernels.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled, untrained Sequential model.
    """
    model = Sequential([
        Dense(64, activation='relu', input_shape=(input_shape,), kernel_regularizer=l2(l2_lambda)),
        Dropout(dropout_rate),
        Dense(32, activation='relu', kernel_regularizer=l2(l2_lambda)),
        Dropout(dropout_rate),
        Dense(1),  # Single linear output for regression
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')
    return model

# Only needed to release Keras' global state between folds (see the loop below).
from tensorflow.keras import backend as keras_backend

# Convert features and targets to numpy arrays for cross-validation
X = features.to_numpy()
y = targets_ws48.to_numpy().ravel()

# 10-fold cross-validation repeated 5 times -> 50 train/test splits.
# random_state fixes the splits only; no Keras/TensorFlow seed is set in this
# cell, so the metrics can still vary from run to run.
rkf = RepeatedKFold(n_splits=10, n_repeats=5, random_state=42)

mse_scores = []
rmse_scores = []
r2_scores = []
train_mse_scores = []
train_rmse_scores = []
train_r2_scores = []

# Cross-validation loop: a fresh model is trained and scored on every split.
for train_index, test_index in rkf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Drop the state left behind by the previous fold's model; without this,
    # building many models in a loop makes memory use grow steadily.
    keras_backend.clear_session()
    model = create_model(input_shape=X_train.shape[1])
    # One epoch is one full pass over the training rows.
    model.fit(X_train, y_train, epochs=100, batch_size=10, verbose=0)

    # Training set evaluation (verbose=0 keeps per-call progress bars out of the output)
    train_predictions = model.predict(X_train, verbose=0)
    train_predictions_flat = train_predictions.flatten()
    train_mse = mean_squared_error(y_train, train_predictions_flat)
    train_rmse = np.sqrt(train_mse)
    train_r2 = r2_score(y_train, train_predictions_flat)
    train_mse_scores.append(train_mse)
    train_rmse_scores.append(train_rmse)
    train_r2_scores.append(train_r2)

    # Testing set evaluation
    predictions = model.predict(X_test, verbose=0)
    predictions_flat = predictions.flatten()
    mse = mean_squared_error(y_test, predictions_flat)
    rmse = np.sqrt(mse)
    r_squared = r2_score(y_test, predictions_flat)
    mse_scores.append(mse)
    rmse_scores.append(rmse)
    r2_scores.append(r_squared)

# Compute average metrics for both training and testing sets
print(f"Average Train MSE: {np.mean(train_mse_scores)}")
print(f"Average Train RMSE: {np.mean(train_rmse_scores)}")
print(f"Average Train R-squared: {np.mean(train_r2_scores)}")
print(f"Average Test MSE: {np.mean(mse_scores)}")
print(f"Average Test RMSE: {np.mean(rmse_scores)}")
print(f"Average Test R-squared: {np.mean(r2_scores)}")

In [None]:
## GOOD RESULTS with generalization, L2 regularization, and a simpler model: higher dropout rate and fewer units per layer (FINAL MODEL)
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

# Read the player table from disk.
data = pd.read_csv('/content/Quantitative Stats (College, NBA, Combine).csv')  # Update the path as necessary

# No draft-year filter is applied in this cell: every row of the file is used.
# (Restricting to 2017-2023 would be data[data['Draft Year'].between(2017, 2023)].)
data_filtered = data

# Only numeric columns take part in modelling; everything else is discarded here.
numerical_data_filtered = data_filtered.select_dtypes(include=['number'])
numeric_columns = numerical_data_filtered.columns

# Replace any remaining missing values with the mean of their column.
imputer = SimpleImputer(strategy='mean')
numerical_data_imputed = imputer.fit_transform(numerical_data_filtered)
numerical_data_imputed_df = pd.DataFrame(numerical_data_imputed, columns=numeric_columns)

# Standardize every numeric column, the target included, so the error metrics
# reported later are expressed in standard-deviation units of the target.
# NOTE(review): the imputer and scaler are fitted on all rows before any
# cross-validation split is made, so held-out rows influence these statistics.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numerical_data_imputed_df)
scaled_data_df = pd.DataFrame(scaled_data, columns=numeric_columns)

# Features are every scaled column except the outcome statistics listed below.
# NOTE(review): 'Draft Year' is not in the drop list, so if it is numeric it
# remains a model feature - confirm whether that is intended.
outcome_columns = ['WS', 'WS/48', 'BPM', 'VORP', 'VORP/48']
features = scaled_data_df.drop(columns=outcome_columns)
# Despite the variable's name, the regression target in this cell is 'VORP/48'.
targets_ws48 = scaled_data_df['VORP/48']

def create_model(input_shape, dropout_rate=0.6, l2_lambda=0.01, learning_rate=0.001):
    """Build and compile the reduced-size regression network.

    The network is Dense(32, relu) -> Dropout -> Dense(16, relu) -> Dropout ->
    Dense(1), with an L2 penalty on the kernels of both hidden layers, compiled
    with the Adam optimizer and a mean-squared-error loss.

    Args:
        input_shape: Number of input features (the width of one sample).
        dropout_rate: Dropout rate applied after each hidden layer.
        l2_lambda: L2 regularization factor for the hidden-layer kernels.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled, untrained Sequential model.
    """
    model = Sequential([
        Dense(32, activation='relu', input_shape=(input_shape,), kernel_regularizer=l2(l2_lambda)),  # Reduced from 64 to 32
        Dropout(dropout_rate),
        Dense(16, activation='relu', kernel_regularizer=l2(l2_lambda)),  # Reduced from 32 to 16
        Dropout(dropout_rate),
        Dense(1),  # Single linear output for regression
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')
    return model

# Only needed to release Keras' global state between folds (see the loop below).
from tensorflow.keras import backend as keras_backend

# Convert features and targets to numpy arrays for cross-validation
X = features.to_numpy()
y = targets_ws48.to_numpy().ravel()

# 10-fold cross-validation repeated 5 times -> 50 train/test splits.
# random_state fixes the splits only; no Keras/TensorFlow seed is set in this
# cell, so the metrics can still vary from run to run.
rkf = RepeatedKFold(n_splits=10, n_repeats=5, random_state=42)

mse_scores = []
rmse_scores = []
r2_scores = []
train_mse_scores = []
train_rmse_scores = []
train_r2_scores = []

# Cross-validation loop: a fresh model is trained and scored on every split.
for train_index, test_index in rkf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Drop the state left behind by the previous fold's model; without this,
    # building many models in a loop makes memory use grow steadily.
    keras_backend.clear_session()
    model = create_model(input_shape=X_train.shape[1])
    # One epoch is one full pass over the training rows.
    model.fit(X_train, y_train, epochs=100, batch_size=10, verbose=0)

    # Training set evaluation (verbose=0 keeps per-call progress bars out of the output)
    train_predictions = model.predict(X_train, verbose=0)
    train_predictions_flat = train_predictions.flatten()
    train_mse = mean_squared_error(y_train, train_predictions_flat)
    train_rmse = np.sqrt(train_mse)
    train_r2 = r2_score(y_train, train_predictions_flat)
    train_mse_scores.append(train_mse)
    train_rmse_scores.append(train_rmse)
    train_r2_scores.append(train_r2)

    # Testing set evaluation
    predictions = model.predict(X_test, verbose=0)
    predictions_flat = predictions.flatten()
    mse = mean_squared_error(y_test, predictions_flat)
    rmse = np.sqrt(mse)
    r_squared = r2_score(y_test, predictions_flat)
    mse_scores.append(mse)
    rmse_scores.append(rmse)
    r2_scores.append(r_squared)

# Compute average metrics for both training and testing sets
print(f"Average Train MSE: {np.mean(train_mse_scores)}")
print(f"Average Train RMSE: {np.mean(train_rmse_scores)}")
print(f"Average Train R-squared: {np.mean(train_r2_scores)}")
print(f"Average Test MSE: {np.mean(mse_scores)}")
print(f"Average Test RMSE: {np.mean(rmse_scores)}")
print(f"Average Test R-squared: {np.mean(r2_scores)}")

In [None]:
from itertools import product

from tensorflow.keras import backend as keras_backend
# Re-import the regularizer factory: an earlier version of this cell used `l2`
# as a loop variable, which may have left the name bound to a float in a
# running kernel.
from tensorflow.keras.regularizers import l2

# Define hyperparameters to test
learning_rates = [0.01, 0.001, 0.0001]
l2_lambdas = [0.01, 0.001, 0.0001]
dropout_rates = [0.3, 0.5, 0.7]

best_r2 = -float('inf')
best_params = {}

# Exhaustive grid search: every (learning rate, L2 factor, dropout rate)
# combination is scored by its mean test R-squared over the splits of `rkf`.
# This cell reuses `rkf`, `X` and `y` from the preceding cross-validation cell.
# It trains one model per split per combination, so it is slow to run.
for lr, l2_lambda, dropout in product(learning_rates, l2_lambdas, dropout_rates):
    print(f"Testing LR={lr}, L2={l2_lambda}, Dropout={dropout}")
    r2_scores = []

    for train_index, test_index in rkf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Drop the state left behind by the previous model; without this,
        # building this many models in a loop makes memory use grow steadily.
        keras_backend.clear_session()
        # The loop variable is `l2_lambda`, not `l2`, so the imported
        # regularizer factory stays callable.
        model = Sequential([
            Dense(64, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=l2(l2_lambda)),
            Dropout(dropout),
            Dense(32, activation='relu', kernel_regularizer=l2(l2_lambda)),
            Dropout(dropout),
            Dense(1)
        ])
        model.compile(optimizer=Adam(learning_rate=lr), loss='mean_squared_error')

        model.fit(X_train, y_train, epochs=100, batch_size=10, verbose=0)

        # verbose=0 keeps per-call progress bars out of the cell output
        predictions = model.predict(X_test, verbose=0)
        r_squared = r2_score(y_test, predictions.flatten())
        r2_scores.append(r_squared)

    average_r2 = np.mean(r2_scores)
    print(f"Average Test R-squared: {average_r2}")

    # Keep the combination with the highest mean test R-squared seen so far.
    if average_r2 > best_r2:
        best_r2 = average_r2
        best_params = {'learning_rate': lr, 'l2_lambda': l2_lambda, 'dropout_rate': dropout}

print("Best Hyperparameters:")
print(best_params)


ModuleNotFoundError: No module named 'tensorflow.keras.wrappers'