In [2]:
import pandas as pd
import numpy as np
import random

# Load team data
team_data = pd.read_csv('data/teamData.csv')
# Assuming the team_data has headers now, if not, add header=None and adjust column access by indices.
data2023 = pd.read_csv('data/2024_teamData.csv')

games = pd.read_csv('data/MNCAATourneyDetailedResults.csv')

# # Verify if team_data has the correct column names; if not, adjust accordingly

columns_to_remove = ['School', 'Conf', 'TeamName','PF', 'PA', 'NumOT', 'FGM', 'FGA', 'FGM3', 'FGA3',
       'FTM', 'FTA', 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'Fls', 'OppFGM',
       'OppFGA', 'OppFGM3', 'OppFGA3', 'OppFTM', 'OppFTA', 'OppOR', 'OppDR',
       'OppAst', 'OppTO', 'OppStl', 'OppBlk', 'OppFls', 'Wins', 'Loses',  'Tempo_x',
       'TO_per_poss_last_6_last_6', 'Stl_per_poss_last_6_last_6',
       'Blk_per_poss_last_6_last_6', 'PF_per_poss_last_6_last_6', 'FGM3_per_poss_last_6_last_6', 
       'FGA3_per_poss_last_6_last_6', 'Possessions_last_6_last_6',
       'FTM_per_poss_last_6_last_6', 'FTA_per_poss_last_6_last_6',
       'Score_per_poss_last_6_last_6', 'OR_per_poss_last_6_last_6',]

team_data['Trapezoid'] = team_data['AdjEM'] - team_data['Tempo_x']
team_data['Diff Win'] = team_data['Win_last_6'] - team_data['Win %'] 
team_data['3PM_diff'] =team_data['3PTPPos'] - team_data['FGA3_per_poss_last_6_last_6']
team_data['FT_diff'] = team_data['FTPPos'] - team_data['FTM_per_poss_last_6_last_6']
team_data['PPPos_diff'] = team_data['PPPos'] - team_data['PF_per_poss_last_6_last_6']
team_data['Orb_diff'] = team_data['ORPPos'] - team_data['OR_per_poss_last_6_last_6']
team_data['Tov_diff'] = team_data['TOV %'] - team_data['TO_per_poss_last_6_last_6']
team_data['rank_diff'] = team_data['Week_18'] - team_data['Week_1']
team_data['Pom_diff'] = team_data['AdjOE'] - team_data['AdjDE']
team_data['TOV Margin'] = team_data['TOV %'] - team_data['TOV Forced %']

team_data.drop(columns=columns_to_remove, inplace=True)
data2023.drop(columns=columns_to_remove, inplace=True)
team_data.shape[1]

62

In [3]:
team_data.columns

Index(['Season', 'TeamID', 'Games Played', 'Minutes Played', 'Possesions',
       'OppPossesions', 'Win %', 'PPPos', 'PAPPos', 'PPPos Margin', 'FG %',
       'OppFG%', '3PT FG %', '3PTPPos', 'FT %', 'FTPPos', 'OppFTPPos',
       'ORPPos', 'Opp ORPPos', 'DRPG', 'REBPG', 'REB Margin',
       'True Shooting %', 'Effective FG%', 'TOV %', 'TOV Forced %',
       'Foul Margin', 'OppEFG', 'Win_last_6', 'FGM_per_poss_last_6_last_6',
       'FGA_per_poss_last_6_last_6', 'DR_per_poss_last_6_last_6',
       'Ast_per_poss_last_6_last_6', 'Week_1', 'Week_6', 'Week_12', 'Week_18',
       'Tempo_y', 'RankTempo', 'AdjTempo', 'RankAdjTempo', 'OE', 'RankOE',
       'AdjOE', 'RankAdjOE', 'DE', 'RankDE', 'AdjDE', 'RankAdjDE', 'AdjEM',
       'RankAdjEM', 'seed', 'Trapezoid', 'Diff Win', '3PM_diff', 'FT_diff',
       'PPPos_diff', 'Orb_diff', 'Tov_diff', 'rank_diff', 'Pom_diff',
       'TOV Margin'],
      dtype='object')

In [4]:
cols = team_data.shape[1] - 2 + 5
rows = games.shape[0]
dataset = np.zeros((rows, cols))

dataset.shape

(1315, 65)

In [5]:
for i in range(rows):
    season = games.iloc[i]['Season']
    WTeamID = games.iloc[i]['WTeamID']
    LTeamID = games.iloc[i]['LTeamID']
    loc = 0 if games.iloc[i]['WLoc'] == 'N' else 1 if games.iloc[i]['WLoc'] == 'H' else -1
    
    # Extract team statistics for the winning and losing teams
    try:
        WTeamData = team_data[(team_data['Season'] == season) & (team_data['TeamID'] == WTeamID)].iloc[0, 2:].values
        LTeamData = team_data[(team_data['Season'] == season) & (team_data['TeamID'] == LTeamID)].iloc[0, 2:].values
    except IndexError:
        print(f"Data not found for game {i+1} in season {season} for teams {WTeamID} and {LTeamID}. Skipping.")
        continue  # Skip this game if data is missing

    difference = np.subtract(WTeamData.astype(float), LTeamData.astype(float))
    
    # Determine the winner randomly and adjust team order and difference accordingly
    if random.random() > 0.5:
        winner = 1  # Indicating the first team (team1ID) wins
        game_data = np.concatenate(([season, WTeamID, LTeamID, loc], difference, [winner]))
    else:
        winner = 0  # Indicating the second team (team2ID) wins
        game_data = np.concatenate(([season, LTeamID, WTeamID, -loc], -difference, [winner]))
    
    dataset[i] = game_data

# Filter out uninitialized rows in case some games were skipped
dataset = dataset[~np.all(dataset == 0, axis=1)]

Data not found for game 90 in season 2004 for teams 1301 and 1418. Skipping.
Data not found for game 152 in season 2005 for teams 1257 and 1418. Skipping.
Data not found for game 736 in season 2014 for teams 1166 and 1418. Skipping.


In [11]:
original_feature_names = team_data.columns[2:]  # Adjust index if needed

columns = ['Season', 'Team1ID', 'Team2ID', 'Location'] + list(original_feature_names) + ['Winner']

df_dataset = pd.DataFrame(dataset, columns=columns)

columns_to_remove =  ['Season','Team1ID', 'Team2ID', 'Location']

recent_dataset = df_dataset[df_dataset['Season'] >= 2010]
recent_dataset = df_dataset[df_dataset['Season'] < 2023]

df_dataset.drop(columns=columns_to_remove, inplace=True)
recent_dataset.drop(columns=columns_to_remove, inplace=True)

# Save the prepared dataset
df_dataset.to_csv('data/training_dataset.csv', index=False)
recent_dataset.to_csv('data/2010training_dataset.csv', index=False)

recent_dataset

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recent_dataset.drop(columns=columns_to_remove, inplace=True)


Unnamed: 0,Games Played,Minutes Played,Possesions,OppPossesions,Win %,PPPos,PAPPos,PPPos Margin,FG %,OppFG%,...,Diff Win,3PM_diff,FT_diff,PPPos_diff,Orb_diff,Tov_diff,rank_diff,Pom_diff,TOV Margin,Winner
0,1.0,20.0,62.40,101.52,0.151724,0.026532,-0.123769,0.150300,0.018262,-0.030938,...,0.181609,0.044867,-0.007181,0.038557,0.033336,-0.011599,-0.0,3.6290,0.008731,0.0
1,1.0,40.0,-235.52,-237.16,-0.237685,-0.082587,0.043528,-0.126115,-0.016969,0.005765,...,0.237685,0.036926,0.024997,-0.061590,-0.012431,-0.041030,-0.0,-24.5871,0.036945,0.0
2,0.0,5.0,-30.76,-14.64,-0.172414,0.037544,0.055225,-0.017681,0.040251,0.043212,...,-0.327586,0.096638,0.043939,0.048417,0.002983,0.010187,84.0,-1.0863,0.048109,1.0
3,4.0,165.0,197.64,195.16,0.085684,0.041621,-0.088742,0.130363,-0.005763,-0.023218,...,-0.252351,0.015029,0.086698,0.053183,-0.007664,-0.020495,-81.0,7.9453,-0.092653,0.0
4,1.0,30.0,-32.88,-39.80,-0.124138,0.023384,0.028311,-0.004928,-0.009399,0.009357,...,0.124138,-0.028458,-0.025017,-0.005163,0.045283,-0.025195,76.0,-0.5522,-0.018900,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1240,1.0,50.0,143.60,150.44,0.126560,0.018860,-0.076780,0.095640,0.002544,-0.047430,...,0.040107,0.009384,-0.100809,0.000059,0.023300,0.029830,3.0,12.8725,0.056096,1.0
1241,4.0,165.0,382.76,392.60,0.106583,0.112824,0.085389,0.027435,0.027419,0.046372,...,-0.273250,-0.071058,-0.004861,0.214881,-0.021832,-0.005944,6.0,11.7708,0.016086,1.0
1242,1.0,50.0,259.12,233.52,0.035651,-0.007007,0.004265,-0.011271,0.043959,0.009635,...,-0.035651,0.133032,-0.073488,-0.026849,0.016118,0.028436,-1.0,1.3822,0.005305,1.0
1243,-1.0,-35.0,1.08,-1.16,-0.096257,-0.072218,0.036114,-0.108333,-0.035553,0.016836,...,0.262923,-0.013628,-0.006643,-0.047456,-0.027933,-0.004343,8.0,-7.6660,0.058062,1.0


In [9]:
df_dataset

Unnamed: 0,Games Played,Minutes Played,Possesions,OppPossesions,Win %,PPPos,PAPPos,PPPos Margin,FG %,OppFG%,...,Diff Win,3PM_diff,FT_diff,PPPos_diff,Orb_diff,Tov_diff,rank_diff,Pom_diff,TOV Margin,Winner
0,1.0,20.0,62.40,101.52,0.151724,0.026532,-0.123769,0.150300,0.018262,-0.030938,...,0.181609,0.044867,-0.007181,0.038557,0.033336,-0.011599,-0.0,3.6290,0.008731,0.0
1,1.0,40.0,-235.52,-237.16,-0.237685,-0.082587,0.043528,-0.126115,-0.016969,0.005765,...,0.237685,0.036926,0.024997,-0.061590,-0.012431,-0.041030,-0.0,-24.5871,0.036945,0.0
2,0.0,5.0,-30.76,-14.64,-0.172414,0.037544,0.055225,-0.017681,0.040251,0.043212,...,-0.327586,0.096638,0.043939,0.048417,0.002983,0.010187,84.0,-1.0863,0.048109,1.0
3,4.0,165.0,197.64,195.16,0.085684,0.041621,-0.088742,0.130363,-0.005763,-0.023218,...,-0.252351,0.015029,0.086698,0.053183,-0.007664,-0.020495,-81.0,7.9453,-0.092653,0.0
4,1.0,30.0,-32.88,-39.80,-0.124138,0.023384,0.028311,-0.004928,-0.009399,0.009357,...,0.124138,-0.028458,-0.025017,-0.005163,0.045283,-0.025195,76.0,-0.5522,-0.018900,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1307,-2.0,-90.0,-190.56,-181.92,0.016544,0.046349,0.083774,-0.037425,0.017130,0.029082,...,-0.016544,0.013820,0.022288,0.074333,-0.005556,0.003206,-81.0,-9.6564,0.014621,1.0
1308,-1.0,-40.0,-149.68,-182.80,0.176136,-0.037819,-0.018584,-0.019235,-0.025867,-0.005794,...,0.157197,0.062925,0.035663,-0.051148,-0.024855,0.015070,-14.0,-0.3999,-0.023446,1.0
1309,1.0,35.0,50.44,45.08,-0.023674,-0.004251,-0.091061,0.086810,-0.025980,-0.039580,...,0.190341,-0.071589,0.009394,-0.042800,-0.004899,-0.017202,-3.0,10.4544,0.008398,1.0
1310,0.0,0.0,-70.16,-96.96,-0.093750,-0.064668,0.020532,-0.085200,-0.026823,0.012137,...,-0.072917,0.026236,0.024160,-0.067185,-0.037001,0.011102,1.0,2.0251,-0.030636,1.0


(1312, 61)