In [2]:
 # Import Modules 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import kerastuner as kt
from mmdata import * 





  import kerastuner as kt


In [3]:
# Import the final data csv
data = pd.read_csv('final_data.csv')
data 

Unnamed: 0,YEAR,TEAM,TEAM NO,SEED,ROUND,POWER RATING,POWER RATING RANK,NET RPI,RESUME,WAB RANK,...,POWER-PATH,SEED WON,SEED LOST,SEED DIFF,FIRST ROUND,SECOND ROUND,SWEET 16,ELITE 8,FINAL 4,TOTAL
0,2016,Arizona,603,6,64,89.0,59,26,27,23,...,4.4,119,51,68,10,2,0,1,0,13
1,2016,Austin Peay,602,16,64,68.8,462,189,218,229,...,-41.8,119,51,68,10,2,0,1,0,13
2,2016,Baylor,601,5,64,85.5,152,25,25,22,...,5.1,119,51,68,10,2,0,1,0,13
3,2016,Buffalo,600,14,64,75.7,396,91,112,129,...,-24.4,119,51,68,10,2,0,1,0,13
4,2016,Butler,599,9,32,84.2,194,56,29,32,...,-3.5,119,51,68,10,2,0,1,0,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
779,2021,Oregon St.,830,12,8,79.7,332,91,32,47,...,-19.5,11,1,10,9,6,2,1,0,18
780,2021,USC,817,6,8,85.8,143,19,37,7,...,7.5,11,1,10,9,6,2,1,0,18
781,2021,Arkansas,872,3,8,86.3,129,14,20,12,...,7.6,11,1,10,9,6,2,1,0,18
782,2021,UCLA,819,11,4,81.9,267,46,55,24,...,-6.8,11,1,10,9,6,2,1,0,18


In [4]:
data.columns

Index(['YEAR', 'TEAM', 'TEAM NO', 'SEED', 'ROUND', 'POWER RATING',
       'POWER RATING RANK', 'NET RPI', 'RESUME', 'WAB RANK', 'ELO', 'B POWER',
       'Q1 W', 'Q2 W', 'Q1 PLUS Q2 W', 'Q3 Q4 L', 'PLUS 500', 'R SCORE',
       'BY YEAR NO', 'BY ROUND NO', 'CURRENT ROUND', 'SCORE', 'PAKE',
       'PAKE RANK', 'PASE', 'PASE RANK', 'GAMES', 'W', 'L', 'WIN%', 'R64',
       'R32', 'S16', 'E8', 'F4', 'F2', 'CHAMP', 'TOP2', 'F4%', 'CHAMP%',
       'POWER', 'PATH', 'DRAW', 'WINS', 'POOL VALUE', 'POOL S-RANK',
       'NCAA S-RANK', 'VAL Z-SCORE', 'POWER-PATH', 'SEED WON', 'SEED LOST',
       'SEED DIFF', 'FIRST ROUND', 'SECOND ROUND', 'SWEET 16', 'ELITE 8',
       'FINAL 4', 'TOTAL'],
      dtype='object')

In [5]:
# List features and target
X = data.drop(columns='CHAMP')
y = data['CHAMP']
input_nodes = X.shape[1]
print(X.dtypes, y.dtypes)

YEAR                   int64
TEAM                  object
TEAM NO                int64
SEED                   int64
ROUND                  int64
POWER RATING         float64
POWER RATING RANK      int64
NET RPI                int64
RESUME                 int64
WAB RANK               int64
ELO                    int64
B POWER              float64
Q1 W                   int64
Q2 W                   int64
Q1 PLUS Q2 W           int64
Q3 Q4 L                int64
PLUS 500               int64
R SCORE              float64
BY YEAR NO             int64
BY ROUND NO            int64
CURRENT ROUND          int64
SCORE                float64
PAKE                 float64
PAKE RANK              int64
PASE                 float64
PASE RANK              int64
GAMES                  int64
W                      int64
L                      int64
WIN%                 float64
R64                    int64
R32                    int64
S16                    int64
E8                     int64
F4            

In [6]:
# Encode the data
X_encoder = OneHotEncoder()
y_encoder = LabelEncoder()

X = X_encoder.fit_transform(X)
y = y_encoder.fit_transform(y)

In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Intialize and fit robust scaler... best for this data
scaler = RobustScaler(with_centering=False)

X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)



In [9]:
# Method for creating a model with hyperparameters
def create_model(hp):
    model = tf.keras.models.Sequential()
    activation = hp.Choice('activation', values=['relu', 'tanh', 'sigmoid'])
    # Decide number of neurons in each layer
    model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=32,
        max_value=512,
        step=32), activation=activation, input_dim=X.shape[1]))
    # Decide number of hidden layers
    for i in range(hp.Int('num_layers', 1, 5)):
        model.add(tf.keras.layers.Dense(units=hp.Int(
            f'layer_{i}_units',
            min_value=32,
            max_value=512,
            step=32), activation=activation))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    # Compile the model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [10]:
# Initialize the tuner
tuner = kt.Hyperband(create_model,
                        objective='val_accuracy',
                        max_epochs=100,
                        hyperband_iterations=2)

Reloading Tuner from .\untitled_project\tuner0.json


In [11]:
# Search for the best hyperparameters
tuner.search(X_train_scaled, y_train, epochs=100, validation_data=(X_test_scaled, y_test))

In [12]:
# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
best_hps.values

{'activation': 'tanh',
 'first_units': 288,
 'num_layers': 3,
 'layer_0_units': 384,
 'layer_1_units': 256,
 'layer_2_units': 448,
 'layer_3_units': 384,
 'layer_4_units': 32,
 'tuner/epochs': 2,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 4,
 'tuner/round': 0}

In [13]:
# Fit the model with the best hyperparameters
model = tuner.hypermodel.build(best_hps)
model.fit(X_train_scaled, y_train, epochs=100, validation_data=(X_test_scaled, y_test))





Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoc

<keras.src.callbacks.History at 0x2a184970ee0>

In [14]:
# Evaluate the model
model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

5/5 - 0s - loss: -1.0855e+02 - accuracy: 0.8854 - 12ms/epoch - 2ms/step
Loss: -108.55207824707031, Accuracy: 0.8853503465652466


In [31]:
# predict the 2024 champ
# get the 2024 data from the csv
data = pd.read_csv('final_data.csv')
data_2024 = data.loc[data['YEAR'] == 2023]
data_2024 

Unnamed: 0,YEAR,TEAM,TEAM NO,SEED,ROUND,POWER RATING,POWER RATING RANK,NET RPI,RESUME,WAB RANK,...,POWER-PATH,SEED WON,SEED LOST,SEED DIFF,FIRST ROUND,SECOND ROUND,SWEET 16,ELITE 8,FINAL 4,TOTAL
460,2023,Arizona,1010,2,64,89.0,59,10,7,7,...,15.6,65,20,45,5,4,3,2,0,14
461,2023,Baylor,1006,3,32,87.1,104,15,4,12,...,12.0,65,20,45,5,4,3,2,0,14
462,2023,Connecticut,1002,4,1,89.2,57,8,19,13,...,14.5,65,20,45,5,4,3,2,0,14
463,2023,Duke,999,5,32,87.1,104,16,28,16,...,9.8,65,20,45,5,4,3,2,0,14
464,2023,Gonzaga,995,3,8,89.9,46,6,17,9,...,17.6,65,20,45,5,4,3,2,0,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
575,2023,Texas,957,2,8,90.1,43,7,2,8,...,14.5,14,5,9,5,4,3,2,0,14
576,2023,Creighton,1001,6,8,87.6,92,17,40,29,...,7.9,14,5,9,5,4,3,2,0,14
577,2023,Kansas St.,985,3,8,84.5,180,23,8,17,...,6.9,14,5,9,5,4,3,2,0,14
578,2023,San Diego St.,961,5,2,86.0,136,14,26,10,...,3.1,14,5,9,5,4,3,2,0,14


In [32]:
#Predict the 2024 champ
X_2024 = data_2024.drop(columns='CHAMP')
X_2024 = X_encoder.fit_transform(X_2024)


In [33]:
X_2024

<120x1922 sparse matrix of type '<class 'numpy.float64'>'
	with 6840 stored elements in Compressed Sparse Row format>

In [39]:
# predict the 2024 champ

y_pred = model.predict(X_2024)
y_pred = y_encoder.fit_transform(y_pred)
y_pred




  y = column_or_1d(y, warn=True)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)