# Import relevant libraries

In [1]:
import matplotlib.pyplot as plt
# Switch matplotlib to its 'ggplot' style sheet for the figures in this notebook.
plt.style.use('ggplot')

from math import sin, cos, sqrt, atan2, radians
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (auc, classification_report, roc_auc_score, accuracy_score,
                             f1_score, log_loss, roc_curve, confusion_matrix, precision_score, recall_score)
import statsmodels.formula.api as smf

# Seed both RNG libraries imported above (the stdlib `random` module and
# NumPy's legacy global generator) so a Restart & Run All gives repeatable
# draws. Seeding only NumPy leaves any `random.*` call non-deterministic.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# Load the Data

In [2]:
# Read the tournament games, keep every row whose season is not 2019,
# and renumber the remaining rows from 0.
tournament_Data = (
    pd.read_csv('NCAA_Tourney_2002_2025.csv')
    .loc[lambda games: games['season'] != 2019]
    .reset_index(drop=True)
)
tournament_Data.head()

Unnamed: 0,team1_id,team1_score,team2_id,team2_score,WLoc,num_ot,team1_position,team2_position,team1_seed,team2_seed,...,team1_adjoe,team1_de,team1_adjde,team2_tempo,team2_adjtempo,team2_oe,team2_adjoe,team2_de,team2_adjde,game_id
0,1314,81.0,1181,77.0,N,0.0,W08,X02,8,2,...,113.035,100.974,96.9911,68.2765,67.4185,117.152,119.357,98.2346,95.6444,2022-1314-1181
1,1242,81.0,1437,65.0,N,0.0,Y01,Z02,1,2,...,119.388,97.191,93.9009,64.1915,62.5758,112.845,117.921,97.6798,93.8099,2022-1242-1437
2,1242,72.0,1314,69.0,N,0.0,Y01,W08,1,8,...,119.388,97.191,93.9009,70.2181,70.1745,109.416,113.035,100.974,96.9911,2022-1242-1314
3,1242,76.0,1274,50.0,N,0.0,Y01,Y10,1,10,...,119.388,97.191,93.9009,67.5221,67.3101,110.368,114.757,104.983,102.132,2022-1242-1274
4,1314,69.0,1389,49.0,N,0.0,W08,W15,8,15,...,113.035,100.974,96.9911,67.0262,65.9923,98.7557,98.7475,92.3289,94.4745,2022-1314-1389


In [3]:
# Each row of the raw table records the winner as team1, so a label built
# from it alone would always be 1. To get a balanced training set, every game
# is also included as the reverse matchup (team2 vs team1).
print("\nPreparing training data...")



Preparing training data...


In [4]:
# Build a balanced training set: every game appears twice, once as recorded
# (team1 won, target = 1) and once mirrored so that team1 is the loser
# (target = 0).
df_winner = tournament_Data.assign(target=1)  # raw rows: team1 is the winner

# Pair each team1_* column with its team2_* counterpart by NAME rather than by
# position, so a file whose columns are ordered differently cannot silently
# swap unrelated features.
team1_cols = [c for c in tournament_Data.columns if c.startswith('team1_')]
team2_cols = ['team2_' + c[len('team1_'):] for c in team1_cols]

unpaired = set(team2_cols).symmetric_difference(
    c for c in tournament_Data.columns if c.startswith('team2_')
)
if unpaired:
    raise ValueError(
        f"team1_/team2_ columns do not pair up one-to-one: {sorted(unpaired)}"
    )

# Mirror the matchup by swapping column LABELS instead of copying `.values`.
# The team columns mix strings (e.g. *_position) with numbers, so `.values`
# is an object array; assigning that back can leave the numeric team columns
# as dtype object in the mirrored frame and, after the concat, in the
# training frame. Renaming keeps each column's original dtype.
swap_labels = {
    **dict(zip(team1_cols, team2_cols)),
    **dict(zip(team2_cols, team1_cols)),
}
df_loser = (
    tournament_Data
    .rename(columns=swap_labels)
    .reindex(columns=tournament_Data.columns)  # restore the original column order
    .assign(target=0)  # after the swap, team1 is the team that lost
)

# Stack both views and group the two rows of each game together. The stable
# sort keeps the concat order within a game_id: original row first, then its
# mirrored copy.
df_training = (
    pd.concat([df_winner, df_loser], axis=0, ignore_index=True)
    .sort_values(by='game_id', kind='stable')
    .reset_index(drop=True)
)
df_training.head()

Unnamed: 0,team1_id,team1_score,team2_id,team2_score,WLoc,num_ot,team1_position,team2_position,team1_seed,team2_seed,...,team1_de,team1_adjde,team2_tempo,team2_adjtempo,team2_oe,team2_adjoe,team2_de,team2_adjde,game_id,target
0,1104,86.0,1194,78.0,N,0.0,Y02,Y15,2,15,...,95.2313,93.877,71.2357,71.2446,100.2897,96.8669,98.4183,99.9263,2002-1104-1194,1
1,1194,78.0,1104,86.0,N,0.0,Y15,Y02,15,2,...,98.4183,99.9263,69.8636,69.9001,108.4361,111.4954,95.2313,93.877,2002-1104-1194,0
2,1112,86.0,1364,81.0,N,0.0,Z03,Z14,3,14,...,104.0411,96.9262,63.2345,64.7948,105.2163,105.4534,96.0965,97.6704,2002-1112-1364,1
3,1364,81.0,1112,86.0,N,0.0,Z14,Z03,14,3,...,96.0965,97.6704,74.1462,72.8207,111.0077,117.3877,104.0411,96.9262,2002-1112-1364,0
4,1112,68.0,1461,60.0,N,0.0,Z03,Z11,3,11,...,104.0411,96.9262,69.6172,70.4124,105.3654,106.037,97.0568,96.6601,2002-1112-1461,1
