In [None]:
!pip install cbbpy
import cbbpy.mens_scraper as s
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from numpy.testing import suppress_warnings
import math
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
# Show every column when a DataFrame is printed; the feature tables built below are 30+ columns wide.
pd.set_option('display.max_columns',None)

## Functions for creating dataframe

In [None]:
def string_modify(keys,add,vals_to_skip = 2):
    """Return the entries of ``keys`` after the first ``vals_to_skip``, each with ``add`` appended.

    keys         -- sequence of column names taken from the original dataframe.
    add          -- suffix string appended to every kept name.
    vals_to_skip -- how many leading entries of ``keys`` to leave out (default 2).
    """
    return [name + add for name in keys[vals_to_skip:]]

def scorestrip(string):
    """Split a team record written as 'W-L' into integer wins and losses.

    string -- record text such as '12-5'.
    Returns a ``(wins, losses)`` tuple of ints.
    Raises IndexError when there is no '-' separated second field and
    ValueError when a field is not an integer.
    """
    # No debug printing here: this runs once per processed row, so a print
    # floods the notebook output during a full-season run.
    numbers = string.split('-')
    return int(numbers[0]), int(numbers[1])

def dict_from_lists(keys,values):
    """Pair ``keys[i]`` with ``values[i]`` for every position in ``keys`` and return the dict.

    ``values`` is indexed by position, so it must be at least as long as ``keys``;
    any extra trailing values are not used.
    """
    return {keys[position]: values[position] for position in range(len(keys))}

def _pre_game_win_pct(record, team_won):
    """Win fraction a team had going into a game, derived from its record after that game.

    record   -- 'W-L' string; this game's own result is removed from it before
                the fraction is computed.
    team_won -- truthy when the team won this game.
    Returns NaN when the record is not a non-empty string or when it leaves no
    earlier games to compute a fraction from.
    """
    if not isinstance(record, str) or record == '':
        return np.nan
    parts = record.split('-')
    wins, losses = int(parts[0]), int(parts[1])
    games_before = wins + losses - 1
    if games_before <= 0:
        return np.nan
    if team_won:
        wins -= 1
    return wins / games_before


def datasetup(game_info,boxscore):
    """Build one feature row per team-game from that team's earlier games.

    game_info -- DataFrame with columns 'game_id', 'home_team', 'home_record',
                 'away_record' and 'home_win'.
    boxscore  -- DataFrame of box-score rows; only rows whose 'player' contains
                 'TEAM' are used.

    For every kept boxscore row whose team already has at least two earlier
    rows, the output row holds: game_id, team, the team's win fraction before
    the game, the means of its earlier box-score stats (shooting percentages
    are ratio-of-means), the points scored in THIS game ('pts'), and the sample
    standard deviation (ddof=1) of the earlier per-game stats and per-game
    shooting percentages.

    Returns ``(df, boxscore)``: the feature frame (rows containing NaN are
    dropped; the index is the row position within the filtered boxscore) and
    the filtered boxscore itself.
    """
    stat_columns = ['fgm','fga','2pm','2pa','3pm','3pa','ftm','fta','oreb','dreb','ast','stl',
                    'blk','to','pf','pts']
    out_columns = ['game_id','team','win %','m % fgm','m fgm','m % 2pm','m 2pm','m % 3pm','m 3pm','m % ftm','m ftm','m oreb','m dreb','m ast','m stl',
                   'm blk','m to','m pf','pts','s fgm','s 2pm','s 3pm','s ftm','s oreb','s dreb','s ast','s stl',
                   's blk','s to','s pf','s % fgm','s % 2pm','s % 3pm', 's % ftm']

    boxscore = boxscore[boxscore['player'].str.contains('TEAM')==True]
    boxscore = boxscore.drop(['player','player_id','position','starter','min','reb'],axis=1)
    if boxscore.columns[0] == 'Unnamed: 0':
        boxscore = boxscore.drop(['Unnamed: 0'],axis=1)
    # Drop games with zero free-throw, 2-point or 3-point attempts (they would
    # divide by zero below). `np.nan` is used because `pd.np` no longer exists
    # in current pandas.
    boxscore = boxscore.replace(0, np.nan).dropna(axis=0, how='any',subset=['fta','2pa','3pa',]).fillna(0)

    teams = boxscore['team'].tolist()
    game_ids = boxscore['game_id'].tolist()
    stats = boxscore[stat_columns].to_numpy(dtype=float)

    # One lookup per game instead of filtering game_info for every row; the
    # first entry wins when a game_id is repeated.
    game_lookup = {}
    info_fields = zip(game_info['game_id'].tolist(), game_info['home_team'].tolist(),
                      game_info['home_record'].tolist(), game_info['away_record'].tolist(),
                      game_info['home_win'].tolist())
    for gid, home_team, home_record, away_record, home_win in info_fields:
        game_lookup.setdefault(gid, (home_team, home_record, away_record, home_win))

    history = {}  # team -> stat vectors of that team's earlier rows, in order
    rows, row_index = [], []
    for i, (team, game_id) in enumerate(zip(teams, game_ids)):
        earlier = history.setdefault(team, [])
        if len(earlier) >= 2:
            past = np.array(earlier)
            # Start from a true sum; accumulating onto np.empty() would add
            # whatever happened to be in memory.
            mean = past.sum(axis=0) / len(earlier)
            made = past[:, [0, 2, 4, 6]]        # fgm, 2pm, 3pm, ftm
            attempted = past[:, [1, 3, 5, 7]]   # fga, 2pa, 3pa, fta
            with np.errstate(divide='ignore', invalid='ignore'):
                ft_pct = mean[6] / mean[7] if mean[7] != 0 else 0
                means = [mean[0] / mean[1], mean[0],
                         mean[2] / mean[3], mean[2],
                         mean[4] / mean[5], mean[4],
                         ft_pct, mean[6]]
                per_game = np.hstack([made, past[:, 8:15], made / attempted])
            means.extend(mean[8:15])
            means.append(stats[i, 15])  # 'pts' is this game's score, not an average
            spreads = per_game.std(axis=0, ddof=1)

            info = game_lookup.get(game_id)
            if info is None:
                # No metadata for this game: leave NaN so the row is dropped below.
                win_pct = np.nan
            else:
                home_team, home_record, away_record, home_win = info
                home_won = bool(home_win == True)
                if home_team == team:
                    win_pct = _pre_game_win_pct(home_record, home_won)
                else:
                    win_pct = _pre_game_win_pct(away_record, not home_won)

            rows.append([game_id, team, win_pct, *means, *spreads])
            row_index.append(i)
        earlier.append(stats[i])

    df = pd.DataFrame(rows, columns=out_columns, index=row_index).dropna()
    return df,boxscore

def finishdatasetup(df,game_info):
    """Turn per-team feature rows into per-game relative ('_r') rows.

    df        -- frame whose first two columns are 'game_id' and 'team' and whose
                 remaining 32 columns are numeric features with 'pts' at column 18.
    game_info -- accepted for interface compatibility; not read.

    For every game_id that has exactly two rows in ``df``, two output rows are
    produced: (first - second) and (second - first) over the feature columns,
    except 'pts', which keeps each team's own value. Games with any other
    number of rows are skipped. Returns the new frame with a fresh 0..n-1 index.
    """
    out_columns = ['win %_r','m % fgm_r','m fgm_r','m % 2pm_r','m 2pm_r','m % 3pm_r','m 3pm_r','m % ftm_r','m ftm_r','m oreb_r','m dreb_r','m ast_r','m stl_r',
                   'm blk_r','m to_r','m pf_r','pts','s fgm_r','s 2pm_r','s 3pm_r','s ftm_r','s oreb_r','s dreb_r','s ast_r','s stl_r',
                   's blk_r','s to_r','s pf_r','s % fgm_r','s % 2pm_r','s % 3pm_r', 's % ftm_r']

    # Output names: every feature column gets '_r', except the target 'pts'.
    keys = [name + '_r' for name in list(df.columns)[2:]]
    keys[16] = 'pts'

    records = []
    # sort=False keeps games in order of first appearance; NaN ids are skipped.
    for _, game in df.groupby('game_id', sort=False):
        if len(game) != 2:
            continue
        first = game.iloc[0, 2:].to_numpy(dtype=float)
        second = game.iloc[1, 2:].to_numpy(dtype=float)
        for own, other in ((first, second), (second, first)):
            values = own - other
            values[16] = own[16]  # 'pts' stays the team's own score, not a difference
            records.append(dict(zip(keys, values)))

    # Build the frame once rather than growing it row by row.
    return pd.DataFrame(records, columns=out_columns)


#giant function combining all functions together
def year_to_df(year):
    """Scrape one season, build both feature tables and save them to disk.

    year -- season passed to the scraper and used in the output file names.

    Writes 'NCAAB <year> Season Data' (model-ready relative features) and
    '<year> Boxplot Team Avgs' (per-team rolling features) as CSV.
    Returns ``(boxscore, game_info, pbp, df, df_teamavgs)``.
    The notebook comments put a full season at roughly 40 minutes.
    """
    # Scrape the season that was asked for; this used to be hard-coded to 2023,
    # so every season file would have contained 2023 data.
    game_info, boxscore, pbp = s.get_games_season(year)
    print("Done downloading data...")
    df_teamavgs, boxscore = datasetup(game_info,boxscore)
    df = finishdatasetup(df_teamavgs,game_info)
    if df.columns[0] == 'Unnamed: 0':
        df = df.drop(['Unnamed: 0'],axis=1)
    df.to_csv('NCAAB ' + str(year) + ' Season Data')
    df_teamavgs.to_csv(str(year)+' Boxplot Team Avgs')
    print("Data Saved")
    return boxscore, game_info, pbp, df,df_teamavgs

In [None]:
#This cell is optional. It takes ~40 minutes and only needs to be run once for each season's data that you want.
boxscore, game_info, pbp, df, df_teamavgs = year_to_df(2020)

In [None]:
# Last five rows of each pipeline stage, separated by rules.
# Stage 1: the box score kept in memory by the scraping cell above.
print(boxscore.tail(5))
print('---------------------------------------------------')
# Stage 2: the per-team table saved by datasetup.
datasetup_output = pd.read_csv('2024 Boxplot Team Avgs')
print(datasetup_output.tail(5))
print('----------------------------------------------------')
# Stage 3: the model-ready table saved by finishdatasetup.
finishdatasetup_output = pd.read_csv('NCAAB 2024 Season Data')
print(finishdatasetup_output.tail(5))

## Functions for fitting and evaluating model

In [None]:
#Takes a list of per-season dataframes and splits them into training and testing sets.
#Each season is split in row order so its final rows land in the test set, which better reflects the model's goal.
def train_test_split(season_data,perct=.2,cut_off=.1):
    """Split each season by row order, pool the pieces, and standardise them.

    season_data -- list of season dataframes, each containing a 'pts' column.
    perct       -- fraction of each season's kept rows that goes to the test set.
    cut_off     -- fraction of each season's leading rows that is discarded.

    Returns ``(X_train, y_train, X_test, y_test, scalerX, scalery)``. Both
    scalers are fitted on the training rows only and then applied to train and
    test; the returned frames carry a fresh 0..n-1 index.
    """
    train_features, test_features = [], []
    train_targets, test_targets = [], []
    for season in season_data:
        if season.columns[0] == 'Unnamed: 0':
            season = season.drop(columns=['Unnamed: 0'])
        first_kept = int(len(season) * cut_off)

        features = season.drop(columns=['pts']).iloc[first_kept:]
        X_columns = features.columns
        target = season['pts'].iloc[first_kept:]
        boundary = round(len(target) - len(target) * perct)

        train_features.append(features.iloc[:boundary])
        test_features.append(features.iloc[boundary:])
        train_targets.append(target.iloc[:boundary])
        test_targets.append(target.iloc[boundary:])

    X_train = pd.concat(train_features)
    X_test = pd.concat(test_features)
    y_train = pd.concat(train_targets)
    y_test = pd.concat(test_targets)

    # Fit on the training portion only so no test-set statistics leak in.
    scalerX = StandardScaler().fit(X_train)
    scalery = StandardScaler().fit(y_train.values.reshape(-1, 1))

    X_train = pd.DataFrame(scalerX.transform(X_train), columns=X_columns)
    X_test = pd.DataFrame(scalerX.transform(X_test), columns=X_columns)
    y_train = pd.DataFrame(scalery.transform(y_train.values.reshape(-1, 1)), columns=['pts'])
    y_test = pd.DataFrame(scalery.transform(y_test.values.reshape(-1, 1)), columns=['pts'])

    return X_train, y_train, X_test, y_test, scalerX, scalery

def evaluate(X_test,y_test,scalery,model=None):
    """Fraction of consecutive row pairs whose predicted ordering matches the actual one.

    X_test  -- features handed to ``model.predict``.
    y_test  -- DataFrame of scaled targets (its ``.values`` are used).
    scalery -- fitted scaler whose ``inverse_transform`` maps targets and
               predictions back to points.
    model   -- fitted model; when omitted, the notebook-level ``model`` is used,
               which is how existing calls behave.

    For each row i >= 1, compares whether actual[i] > actual[i-1] with whether
    pred[i] > pred[i-1]. Returns the share of agreements as a float, or NaN
    when there are fewer than two rows.
    """
    if model is None:
        model = globals().get('model')
        if model is None:
            raise NameError("evaluate() needs a fitted model: pass model=... or define a global `model`")

    pred = np.asarray(model.predict(X_test)).astype(float)
    pred = scalery.inverse_transform(pred.reshape(-1, 1)).ravel()
    actual = scalery.inverse_transform(y_test.values.reshape(-1, 1)).ravel()

    if len(actual) < 2:
        return float('nan')

    # NOTE(review): every adjacent pair (i-1, i) is scored, so pairs that
    # straddle two different games are counted too; confirm that is intended.
    same_direction = (actual[1:] > actual[:-1]) == (pred[1:] > pred[:-1])
    return float(same_direction.mean())

#IMPORTANT!! march_teams must be ordered so the first value is the top-left team in the bracket, the second is the one
#below it, etc. The 33rd entry is then the top-right team and the 34th the one below that, etc.
def mm_setup(march_teams,team_stats,scalerX,scalery):
    """Build the scaled first-round matchup frame for a bracket.

    march_teams -- team names in bracket order; consecutive entries (0,1),
                   (2,3), ... are first-round opponents.
    team_stats  -- the frame saved as '<year> Boxplot Team Avgs' (columns
                   'game_id', 'team', then the 32 feature columns); each team's
                   LAST row is used.
    scalerX     -- fitted feature scaler (``transform`` is called on the 31
                   non-'pts' columns).
    scalery     -- fitted target scaler (``transform`` is called on 'pts').

    Each pair becomes two rows, (team1 - team2) and (team2 - team1), with 'pts'
    kept as each team's own value. Returns a frame with columns 'team', the 31
    scaled features, then scaled 'pts' as the last column.

    Raises IndexError naming the teams that have no row in ``team_stats``.
    """
    feature_columns = ['win %_r','m % fgm_r','m fgm_r','m % 2pm_r','m 2pm_r','m % 3pm_r','m 3pm_r','m % ftm_r','m ftm_r','m oreb_r','m dreb_r','m ast_r','m stl_r',
                       'm blk_r','m to_r','m pf_r','pts','s fgm_r','s 2pm_r','s 3pm_r','s ftm_r','s oreb_r','s dreb_r','s ast_r','s stl_r',
                       's blk_r','s to_r','s pf_r','s % fgm_r','s % 2pm_r','s % 3pm_r', 's % ftm_r']
    pts_position = feature_columns.index('pts')

    latest_rows, missing = [], []
    for team in march_teams:
        df_team = team_stats[team_stats['team'] == team]
        df_team = df_team.drop(['game_id'],axis=1)
        if df_team.columns[0] == 'Unnamed: 0':
            df_team = df_team.drop(['Unnamed: 0'],axis=1)
        if df_team.empty:
            missing.append(team)
            continue
        latest_rows.append(list(df_team.iloc[-1,:]))
    if missing:
        # Same exception type an empty lookup produced before, now with the names.
        raise IndexError('No rows in team_stats for: {}'.format(missing))

    teams = [row[0] for row in latest_rows]
    raw = np.array([row[1:] for row in latest_rows], dtype=float)
    raw = raw.reshape(len(latest_rows), len(feature_columns))

    # Pair rows (0,1), (2,3), ... with integer steps; a trailing unpaired row
    # is left as-is.
    relative = raw.copy()
    for first in range(0, len(raw) - 1, 2):
        second = first + 1
        relative[first] = raw[first] - raw[second]
        relative[second] = raw[second] - raw[first]
        relative[first, pts_position] = raw[first, pts_position]
        relative[second, pts_position] = raw[second, pts_position]

    matchups = pd.DataFrame(relative, columns=feature_columns)
    X = matchups.drop('pts',axis=1)
    X_scaled = pd.DataFrame(scalerX.transform(X), columns=X.columns)
    y_scaled = pd.DataFrame(scalery.transform(matchups[['pts']].to_numpy()), columns=['pts'])

    df_mm_test = pd.concat([X_scaled, y_scaled],axis=1)
    df_mm_test.insert(0,'team',teams)
    return df_mm_test

def mm_test(df_mm_test,model,scalery):
    """Play out a single-elimination bracket with an already fitted model.

    df_mm_test -- frame from mm_setup: a 'team' column, the feature columns and
                  a 'pts' column; consecutive rows are opponents.
    model      -- fitted model exposing ``predict``.
    scalery    -- fitted target scaler; ``inverse_transform`` maps predictions
                  back to points.

    In each round, rows (0,1), (2,3), ... are predicted together and the row
    with the strictly larger prediction advances (the second team advances on a
    tie). The winners of every round played are printed. Returns the unscaled
    prediction array of the last matchup evaluated, or None when the number of
    teams is not a power of two or no game is played.
    """
    teams = list(df_mm_test['team'])
    team_count = len(teams)
    pred = None
    if team_count < 1 or team_count & (team_count - 1):
        return pred

    # Drop the target by NAME. mm_setup puts 'pts' last, so removing "column 16"
    # threw away a real feature and fed the target to the model.
    features = df_mm_test.drop(columns=['team', 'pts'], errors='ignore').reset_index(drop=True)

    # NOTE(review): rows are the first-round pairwise differences from mm_setup
    # and are reused unchanged in later rounds, so later matchups are scored on
    # features relative to each team's first-round opponent; confirm this is
    # acceptable or pass raw team stats instead.
    rounds = []
    while len(teams) > 1:
        winners, survivors = [], []
        for first in range(0, len(teams), 2):
            X_test = features.iloc[[first, first + 1]]
            pred = np.asarray(model.predict(X_test))
            pred = scalery.inverse_transform(pred.reshape(-1,1))
            winner = first if pred[0, 0] > pred[1, 0] else first + 1
            survivors.append(winner)
            winners.append(teams[winner])
        features = features.iloc[survivors].reset_index(drop=True)
        teams = winners
        rounds.append(winners)

    for number, winners in enumerate(rounds, start=1):
        print('round {} winners are {}'.format(number, winners))
    return pred
            

## Random Forest Regression Model

In [None]:
# Season tables written by year_to_df (one CSV per season).
df_24 = pd.read_csv('NCAAB 2024 Season Data')
df_23 = pd.read_csv('NCAAB 2023 Season Data')
df_22 = pd.read_csv('NCAAB 2022 Season Data')
df_21 = pd.read_csv('NCAAB 2021 Season Data')
df_20 = pd.read_csv('NCAAB 2020 Season Data')
X_train, y_train, X_test, y_test, scalerX, scalery = train_test_split([df_22,df_21,df_23,df_24])
parameters = {'bootstrap': False,
              'min_samples_leaf': 3,
              'n_estimators': 100,
              'min_samples_split': 10,
              'max_features': 'sqrt',
              'max_depth': 8,
              'random_state': 42}  # fixed seed so reruns report the same accuracy
model = RandomForestRegressor(**parameters)
# y_train is a single-column DataFrame; the forest expects a 1-D target.
model.fit(X_train, y_train.values.ravel())
test_acc = evaluate(X_test,y_test,scalery)
train_acc = evaluate(X_train,y_train,scalery)
print('Test accuracy is {:.3f}'.format(test_acc))
print('Train accuracy is {:.3f}'.format(train_acc))

## Neural Network Model

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# NOTE: this rebinds the notebook-level name `model`, replacing the random forest above.
# Three dense layers: 31 units (no activation), 16 units with ReLU, one output unit.
model = keras.Sequential()
model.add(layers.Dense(31))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1))

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='mean_absolute_error',
)

# The last 20% of the training rows are held out for validation while fitting.
history = model.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0)

In [None]:

# Loss on the held-out rows, as configured in model.compile.
test_results = model.evaluate(x=X_test, y=y_test, verbose=0)
# Flat array of scaled predictions for the held-out rows.
test_predictions = model.predict(X_test).flatten()
# Ordering accuracy, using the same helper as the random forest.
test_acc = evaluate(X_test,y_test,scalery)
accuracy_line = 'Test accuracy is {:.3f}'
print(accuracy_line.format(test_acc))

## Predicting March Madness Bracket

In [None]:
df_boxavg = pd.read_csv('2024 Boxplot Team Avgs')

# Bracket order matters: consecutive entries are first-round opponents (see mm_setup).
# NOTE(review): 'San Diego Toreros' is the University of San Diego; the team paired with UAB in the
# 2024 bracket is believed to be 'San Diego State Aztecs' - confirm before trusting the prediction.
mm_teams = ['UConn Huskies','Stetson Hatters','Florida Atlantic Owls','Northwestern Wildcats','San Diego Toreros','UAB Blazers'\
           ,'Auburn Tigers','Yale Bulldogs','BYU Cougars','Duquesne Dukes','Illinois Fighting Illini','Morehead State Eagles'\
           ,'Washington State Cougars','Drake Bulldogs','Iowa State Cyclones','South Dakota State Jackrabbits','North Carolina Tar Heels'\
           ,'Howard Bison','Mississippi State Bulldogs','Michigan State Spartans',"Saint Mary's Gaels",'Grand Canyon Lopes'\
           ,'Alabama Crimson Tide','Charleston Cougars','Clemson Tigers','New Mexico Lobos','Baylor Bears','Colgate Raiders'\
           ,'Dayton Flyers','Nevada Wolf Pack','Arizona Wildcats','Long Beach State Beach','Houston Cougars','Longwood Lancers'\
           ,'Nebraska Cornhuskers','Texas A&M Aggies','Wisconsin Badgers','James Madison Dukes','Duke Blue Devils','Vermont Catamounts'\
           ,'Texas Tech Red Raiders','NC State Wolfpack','Kentucky Wildcats','Oakland Golden Grizzlies','Florida Gators'\
           ,'Boise State Broncos','Marquette Golden Eagles','Western Kentucky Hilltoppers','Purdue Boilermakers','Montana State Bobcats'\
           ,'Utah State Aggies','TCU Horned Frogs','Gonzaga Bulldogs','McNeese Cowboys','Kansas Jayhawks','Samford Bulldogs'\
           ,'South Carolina Gamecocks','Oregon Ducks','Creighton Bluejays','Akron Zips','Texas Longhorns','Colorado State Rams'\
           ,'Tennessee Volunteers',"Saint Peter's Peacocks"]

# Make sure EVERY team name in the list matches a team name in df_boxavg; a
# misspelt name would otherwise only surface as an error inside mm_setup.
known_teams = set(df_boxavg['team'])
missing_teams = [team for team in mm_teams if team not in known_teams]
assert not missing_teams, 'Team names not found in df_boxavg: {}'.format(missing_teams)

# Rows for a single team, kept for spot-checking by hand.
df_team = df_boxavg[df_boxavg['team'] == "Colorado State Rams"]

In [None]:
# Build the scaled first-round matchups, then play the bracket with whichever `model` was fitted last.
df_mm_test = mm_setup(
    march_teams=mm_teams,
    team_stats=df_boxavg,
    scalerX=scalerX,
    scalery=scalery,
)
y_pred = mm_test(df_mm_test=df_mm_test, model=model, scalery=scalery)