# Import relevant libraries

In [11]:
#!pip install matplotlib pandas scikit-learn
!pip install statsmodels

Collecting statsmodels
  Downloading statsmodels-0.14.6-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (9.5 kB)
Collecting patsy>=0.5.6 (from statsmodels)
  Downloading patsy-1.0.2-py2.py3-none-any.whl.metadata (3.6 kB)
Downloading statsmodels-0.14.6-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (10.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.3/10.3 MB[0m [31m56.2 MB/s[0m  [33m0:00:00[0m6m0:00:01[0m
[?25hDownloading patsy-1.0.2-py2.py3-none-any.whl (233 kB)
Installing collected packages: patsy, statsmodels
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [statsmodels][0m [statsmodels]
[1A[2KSuccessfully installed patsy-1.0.2 statsmodels-0.14.6

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.3[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0

In [12]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from math import sin, cos, sqrt, atan2, radians
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (auc, classification_report, roc_auc_score, accuracy_score,
                             f1_score, log_loss, roc_curve, confusion_matrix, precision_score, recall_score)
import statsmodels.formula.api as smf

np.random.seed(0)

#Load the Data 

In [13]:
tournament_Data = pd.read_csv('https://raw.githubusercontent.com/SethiDeepika/AIML/main/NCAA_Tourney_2002_2025.csv').query("season!=2019").reset_index(drop=True)
tournament_Data.head()

Unnamed: 0,team1_id,team1_score,team2_id,team2_score,WLoc,num_ot,team1_position,team2_position,team1_seed,team2_seed,...,team1_adjoe,team1_de,team1_adjde,team2_tempo,team2_adjtempo,team2_oe,team2_adjoe,team2_de,team2_adjde,game_id
0,1314,81.0,1181,77.0,N,0.0,W08,X02,8,2,...,113.035,100.974,96.9911,68.2765,67.4185,117.152,119.357,98.2346,95.6444,2022-1314-1181
1,1242,81.0,1437,65.0,N,0.0,Y01,Z02,1,2,...,119.388,97.191,93.9009,64.1915,62.5758,112.845,117.921,97.6798,93.8099,2022-1242-1437
2,1242,72.0,1314,69.0,N,0.0,Y01,W08,1,8,...,119.388,97.191,93.9009,70.2181,70.1745,109.416,113.035,100.974,96.9911,2022-1242-1314
3,1242,76.0,1274,50.0,N,0.0,Y01,Y10,1,10,...,119.388,97.191,93.9009,67.5221,67.3101,110.368,114.757,104.983,102.132,2022-1242-1274
4,1314,69.0,1389,49.0,N,0.0,W08,W15,8,15,...,113.035,100.974,96.9911,67.0262,65.9923,98.7557,98.7475,92.3289,94.4745,2022-1314-1389


In [14]:
# The data has team1 as the winner. We need to create balanced training data
# by also including the reverse matchups (team2 vs team1)
print("\nPreparing training data...")



Preparing training data...


In [None]:
df_winner = tournament_Data.copy()
df_winner['target'] = 1  # team1 wins
df_loser = tournament_Data.copy()
df_loser['target'] = 0  # team2 loses

team1_cols = [c for c in tournament_Data.columns if c.startswith('team1_')]
team2_cols = [c for c in tournament_Data.columns if c.startswith('team2_')]


# Put Team 2 data into Team 1's columns
df_loser[team1_cols] = tournament_Data[team2_cols].values

# Put Team 1 data into Team 2's columns
df_loser[team2_cols] = tournament_Data[team1_cols].values

# Feature engineering
stat_cols_team1 = [
    'team1_fg2pct', 'team1_fg3pct', 'team1_ftpct', 'team1_blockpct',
    'team1_oppfg2pct', 'team1_oppfg3pct', 'team1_arate', 'team1_opparate',
    'team1_stlrate', 'team1_oppstlrate', 'team1_adjoe', 'team1_adjde'
]

stat_cols_team2 = [
    'team2_fg2pct', 'team2_fg3pct', 'team2_ftpct', 'team2_blockpct',
    'team2_oppfg2pct', 'team2_oppfg3pct', 'team2_arate', 'team2_opparate',
    'team2_stlrate', 'team2_oppstlrate', 'team2_adjoe', 'team2_adjde'
]

df_combined = pd.concat([df_winner, df_loser], axis=0).reset_index(drop=True)
df_combined = df_combined.sort_values(by=['game_id']).reset_index(drop=True)
feature_names = []

for t1, t2 in zip(stat_cols_team1, stat_cols_team2):
    base_name = t1.replace('team1_', '')
    df_combined[f'diff_{base_name}'] = df_combined[t1] - df_combined[t2]
    feature_names.append(f'diff_{base_name}')

# Seed difference
df_combined['seed_diff'] = df_combined['team2_seed'] - df_combined['team1_seed']
feature_names.append('seed_diff')

# Net efficiency difference
df_combined['team1_net_eff'] = df_combined['team1_adjoe'] - df_combined['team1_adjde']
df_combined['team2_net_eff'] = df_combined['team2_adjoe'] - df_combined['team2_adjde']
df_combined['diff_net_eff'] = df_combined['team1_net_eff'] - df_combined['team2_net_eff']
feature_names.append('diff_net_eff')



df_combined


Unnamed: 0,team1_id,team1_score,team2_id,team2_score,WLoc,num_ot,team1_position,team2_position,team1_seed,team2_seed,...,diff_arate,diff_opparate,diff_stlrate,diff_oppstlrate,diff_adjoe,diff_adjde,seed_diff,team1_net_eff,team2_net_eff,diff_net_eff
0,1104,86.0,1194,78.0,N,0.0,Y02,Y15,2,15,...,-6.0473,-9.3519,-0.0229,-0.0195,14.6285,-6.0493,13,17.6184,-3.0594,20.6778
1,1194,78.0,1104,86.0,N,0.0,Y15,Y02,15,2,...,6.0473,9.3519,0.0229,0.0195,-14.6285,6.0493,-13,-3.0594,17.6184,-20.6778
2,1112,86.0,1364,81.0,N,0.0,Z03,Z14,3,14,...,-9.3072,-3.6991,-0.0033,0.0061,11.9343,-0.7442,11,20.4615,7.783,12.6785
3,1364,81.0,1112,86.0,N,0.0,Z14,Z03,14,3,...,9.3072,3.6991,0.0033,-0.0061,-11.9343,0.7442,-11,7.783,20.4615,-12.6785
4,1461,60.0,1112,68.0,N,0.0,Z11,Z03,11,3,...,-4.585,-0.9468,0.0125,0.0229,-11.3507,-0.2661,-8,9.3769,20.4615,-11.0846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2889,1429,47.0,1417,72.0,N,0.0,X10,X07,10,7,...,0.60981,0.522621,0.003288,0.01391,3.885,10.4223,-3,15.225,21.7623,-6.5373
2890,1285,66.0,1458,85.0,N,0.0,W14,W03,14,3,...,-5.538595,-3.733456,0.014429,0.013924,-11.238,14.3529,-11,0.384,25.9749,-25.5909
2891,1458,85.0,1285,66.0,N,0.0,W03,W14,3,14,...,5.538595,3.733456,-0.014429,-0.013924,11.238,-14.3529,11,25.9749,0.384,25.5909
2892,1462,86.0,1400,80.0,N,0.0,X11b,X11a,11,11,...,16.444696,7.771025,0.019248,0.00141,-1.214,-1.2596,0,17.197,17.1514,0.0456


In [None]:


df_training = pd.concat([df_winner, df_loser], axis=0).reset_index(drop=True)
df_training = df_training.sort_values(by=['game_id']).reset_index(drop=True)

#df_training=df_combined[feature_names]
df_training.head()

Unnamed: 0,team1_id,team1_score,team2_id,team2_score,WLoc,num_ot,team1_position,team2_position,team1_seed,team2_seed,...,team1_de,team1_adjde,team2_tempo,team2_adjtempo,team2_oe,team2_adjoe,team2_de,team2_adjde,game_id,target
0,1104,86.0,1194,78.0,N,0.0,Y02,Y15,2,15,...,95.2313,93.877,71.2357,71.2446,100.2897,96.8669,98.4183,99.9263,2002-1104-1194,1
1,1194,78.0,1104,86.0,N,0.0,Y15,Y02,15,2,...,98.4183,99.9263,69.8636,69.9001,108.4361,111.4954,95.2313,93.877,2002-1104-1194,0
2,1112,86.0,1364,81.0,N,0.0,Z03,Z14,3,14,...,104.0411,96.9262,63.2345,64.7948,105.2163,105.4534,96.0965,97.6704,2002-1112-1364,1
3,1364,81.0,1112,86.0,N,0.0,Z14,Z03,14,3,...,96.0965,97.6704,74.1462,72.8207,111.0077,117.3877,104.0411,96.9262,2002-1112-1364,0
4,1112,68.0,1461,60.0,N,0.0,Z03,Z11,3,11,...,104.0411,96.9262,69.6172,70.4124,105.3654,106.037,97.0568,96.6601,2002-1112-1461,1


In [11]:
df_training.shape

(2894, 67)