Import libraries and Data

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier
import patsy as pt
from sklearn.metrics import accuracy_score
from sklearn.metrics import brier_score_loss
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Load Data

In [18]:
MTeams = pd.read_csv('MTeams.csv')
MSeasons = pd.read_csv('MSeasons.csv')
MNCAATourneySeeds = pd.read_csv('MNCAATourneySeeds.csv')
MRegularSeasonCompactResults = pd.read_csv('MRegularSeasonCompactResults.csv')
MRegularSeasonDetailedResults = pd.read_csv('MRegularSeasonDetailedResults.csv') 
MNCAATourneyCompactResults = pd.read_csv('MNCAATourneyCompactResults.csv')
MNCAATourneyDetailedResults = pd.read_csv('MNCAATourneyDetailedResults.csv')
SampleSubmissionStage1 = pd.read_csv('SampleSubmissionStage1.csv')
SampleSubmissionStage2 = pd.read_csv('SampleSubmissionStage2.csv')
MMasseyOrdinals = pd.read_csv('MMasseyOrdinals.csv')

In [None]:
SampleSubmissionStage1['Season'] = SampleSubmissionStage1['ID'].apply(lambda x: int(x.split('_')[0]))
SampleSubmissionStage1['Team1'] = SampleSubmissionStage1['ID'].apply(lambda x: int(x.split('_')[1]))
SampleSubmissionStage1['Team2'] = SampleSubmissionStage1['ID'].apply(lambda x: int(x.split('_')[2]))

In [7]:
def regular_season_data(year, teams, data, rankings, template, details):
    teams_lst = teams[(teams['FirstD1Season'] <= year) & (teams['LastD1Season'] >= year)]
    results_data = data[data['Season'] == year]
    rankings_data = rankings[rankings['Season'] == year]
    sample_data = template[template['Season'] == year]
    detailed_data = details[details['Season'] == year]
    return teams_lst, results_data, rankings_data, sample_data, detailed_data


In [9]:
teams_21, results_21, rankings_21, sample_21, detailed_21 = regular_season_data(2021, MTeams, MRegularSeasonCompactResults, MMasseyOrdinals, SampleSubmissionStage1, MRegularSeasonDetailedResults)
teams_22, results_22, rankings_22, sample_22, detailed_22 = regular_season_data(2022, MTeams, MRegularSeasonCompactResults, MMasseyOrdinals, SampleSubmissionStage1, MRegularSeasonDetailedResults)
teams_23, results_23, rankings_23, sample_23, detailed_23 = regular_season_data(2023, MTeams, MRegularSeasonCompactResults, MMasseyOrdinals, SampleSubmissionStage1, MRegularSeasonDetailedResults)
teams_24, results_24, rankings_24, sample_24, detailed_24 = regular_season_data(2024, MTeams, MRegularSeasonCompactResults, MMasseyOrdinals, SampleSubmissionStage1, MRegularSeasonDetailedResults)

# Prepare Data for Training

Create Functions for Key Indicators

In [10]:
def record_wins(Team1, results):
    Team1_wins = 0
    for index, row in results.iterrows():
        if row['WTeamID'] == Team1:
            Team1_wins += 1
    return Team1_wins

def avg_score_difference(Team1, results):
    Team1_score_difference = 0
    games = 0
    for index, row in results.iterrows():
        if row['WTeamID'] == Team1:
            Team1_score_difference += row['WScore'] - row['LScore']
            games += 1
        if row['LTeamID'] == Team1:
            Team1_score_difference += row['LScore'] - row['WScore']
            games += 1
    return Team1_score_difference/games if games != 0 else 0


def weighted_avg_ranking(Team1, ranking_system, rankings_data):
    # Filter data once
    team_data = rankings_data[(rankings_data['TeamID'] == Team1) &
                               (rankings_data['SystemName'] == ranking_system)]
    
    if team_data.empty:
        return 0  # No data available    
    # Get unique ranking days
    unique_days = team_data['RankingDayNum'].unique()
    total_days = len(unique_days)
    
    # Initialize weights
    weights = range(total_days, 0, -1)
    
    # If there are fewer team_data rows than days, we need to adjust
    if len(team_data) < total_days:
        weights = weights[:len(team_data)]
    
    # Calculate weighted average
    weighted_sum = sum(row.OrdinalRank * weight for row, weight in tqdm(zip(team_data.itertuples(), weights), total=len(weights)))
    total_weight = sum(weights)
    
    return weighted_sum / total_weight if total_weight > 0 else 0

def team_stats_differences(Team1, data):
    Team1_FG_PCT = 0
    Team1_FG3_PCT = 0
    Team1_FT_PCT = 0
    Team1_REB = 0
    Team1_AST = 0
    Team1_TO = 0
    Team1_STL = 0
    Team1_BLK = 0
    Team1_PF = 0
    games = 0
    for index, row in data.iterrows():
        if row['WTeamID'] == Team1:
            Team1_FG_PCT += (row['WFGM'] / row['WFGA'] if row['WFGA'] != 0 else 0) - (row['LFGM'] / row['LFGA'] if row['LFGA'] != 0 else 0)
            Team1_FG3_PCT += (row['WFGM3'] / row['WFGA3'] if row['WFGA3'] != 0 else 0) - (row['LFGM3'] / row['LFGA3'] if row['LFGA3'] != 0 else 0)
            Team1_FT_PCT += (row['WFTM'] / row['WFTA'] if row['WFTA'] != 0 else 0) - (row['LFTM'] / row['LFTA'] if row['LFTA'] != 0 else 0)
            Team1_REB += (row['WOR'] + row['WDR']) - (row['LOR'] + row['LDR'])
            Team1_AST += row['WAst'] - row['LAst']
            Team1_TO += row['WTO'] - row['LTO']
            Team1_STL += row['WStl'] - row['LStl']
            Team1_BLK += row['WBlk'] - row['LBlk']
            Team1_PF += row['WPF'] - row['LPF']
            games += 1
        if row['LTeamID'] == Team1:
            Team1_FG_PCT += (row['LFGM'] / row['LFGA'] if row['LFGA'] != 0 else 0) - (row['WFGM'] / row['WFGA'] if row['WFGA'] != 0 else 0)
            Team1_FG3_PCT += (row['LFGM3'] / row['LFGA3'] if row['LFGA3'] != 0 else 0) - (row['WFGM3'] / row['WFGA3'] if row['WFGA3'] != 0 else 0)
            Team1_FT_PCT += (row['LFTM'] / row['LFTA'] if row['LFTA'] != 0 else 0) - (row['WFTM'] / row['WFTA'] if row['WFTA'] != 0 else 0)
            Team1_REB += (row['LOR'] + row['LDR']) - (row['WOR'] + row['WDR'])
            Team1_AST += row['LAst'] - row['WAst']
            Team1_TO += row['LTO'] - row['WTO']
            Team1_STL += row['LStl'] - row['WStl']
            Team1_BLK += row['LBlk'] - row['WBlk']
            Team1_PF += row['LPF'] - row['WPF']
            games += 1
    return (Team1_FG_PCT/games if games != 0 else 0, 
            Team1_FG3_PCT/games if games != 0 else 0, 
            Team1_FT_PCT/games if games != 0 else 0, 
            Team1_REB/games if games != 0 else 0, 
            Team1_AST/games if games != 0 else 0, 
            Team1_TO/games if games != 0 else 0, 
            Team1_STL/games if games != 0 else 0, 
            Team1_BLK/games if games != 0 else 0, 
            Team1_PF/games if games != 0 else 0)

In [11]:
teams_21['Wins'] = teams_21['TeamID'].apply(lambda x: record_wins(x, results_21))
teams_22['Wins'] = teams_22['TeamID'].apply(lambda x: record_wins(x, results_22))
teams_23['Wins'] = teams_23['TeamID'].apply(lambda x: record_wins(x, results_23))
teams_24['Wins'] = teams_24['TeamID'].apply(lambda x: record_wins(x, results_24))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  teams_21['Wins'] = teams_21['TeamID'].apply(lambda x: record_wins(x, results_21))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  teams_22['Wins'] = teams_22['TeamID'].apply(lambda x: record_wins(x, results_22))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  teams_23['Wins'] = teams_23['TeamID'].app

In [12]:
teams_21['AvgScoreDifference'] = teams_21['TeamID'].apply(lambda x: avg_score_difference(x, results_21))
teams_22['AvgScoreDifference'] = teams_22['TeamID'].apply(lambda x: avg_score_difference(x, results_22))
teams_23['AvgScoreDifference'] = teams_23['TeamID'].apply(lambda x: avg_score_difference(x, results_23))
teams_24['AvgScoreDifference'] = teams_24['TeamID'].apply(lambda x: avg_score_difference(x, results_24))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  teams_21['AvgScoreDifference'] = teams_21['TeamID'].apply(lambda x: avg_score_difference(x, results_21))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  teams_22['AvgScoreDifference'] = teams_22['TeamID'].apply(lambda x: avg_score_difference(x, results_22))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-co

In [13]:
teams_21['POM_avg'] = teams_21['TeamID'].apply(lambda x: avg_score_difference(x, results_21))
teams_22['POM_avg'] = teams_22['TeamID'].apply(lambda x: avg_score_difference(x, results_22))
teams_23['POM_avg'] = teams_23['TeamID'].apply(lambda x: avg_score_difference(x, results_23))
teams_24['POM_avg'] = teams_24['TeamID'].apply(lambda x: avg_score_difference(x, results_24))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  teams_21['POM_avg'] = teams_21['TeamID'].apply(lambda x: avg_score_difference(x, results_21))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  teams_22['POM_avg'] = teams_22['TeamID'].apply(lambda x: avg_score_difference(x, results_22))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  teams_23['POM_avg

In [14]:
team_stats = teams_21['TeamID'].apply(lambda x: team_stats_differences(x, detailed_21))
team_stats_df = pd.DataFrame(team_stats.tolist(), columns=['FG_PCT', 'FG3_PCT', 'FT_PCT', 'REB', 'AST', 'TO', 'STL', 'BLK', 'PF'])
teams_21 = pd.concat([teams_21, team_stats_df], axis=1)

team_stats = teams_22['TeamID'].apply(lambda x: team_stats_differences(x, detailed_22))
team_stats_df = pd.DataFrame(team_stats.tolist(), columns=['FG_PCT', 'FG3_PCT', 'FT_PCT', 'REB', 'AST', 'TO', 'STL', 'BLK', 'PF'])
teams_22 = pd.concat([teams_22, team_stats_df], axis=1)

team_stats = teams_23['TeamID'].apply(lambda x: team_stats_differences(x, detailed_23))
team_stats_df = pd.DataFrame(team_stats.tolist(), columns=['FG_PCT', 'FG3_PCT', 'FT_PCT', 'REB', 'AST', 'TO', 'STL', 'BLK', 'PF'])
teams_23 = pd.concat([teams_23, team_stats_df], axis=1)

team_stats = teams_24['TeamID'].apply(lambda x: team_stats_differences(x, detailed_24))
team_stats_df = pd.DataFrame(team_stats.tolist(), columns=['FG_PCT', 'FG3_PCT', 'FT_PCT', 'REB', 'AST', 'TO', 'STL', 'BLK', 'PF'])
teams_24 = pd.concat([teams_24, team_stats_df], axis=1)

In [51]:
# Merge sample_21 with teams_21 for Team1 & Team2 details
merged_sample_21 = sample_21.merge(teams_21, left_on='Team1', right_on='TeamID', suffixes=('', '_Team1'))
merged_sample_21 = merged_sample_21.merge(teams_21, left_on='Team2', right_on='TeamID', suffixes=('_Team1', '_Team2'))
merged_sample_21 = merged_sample_21.drop(columns=['TeamID_Team1', 'TeamID_Team2'])

# Merge sample_22 with teams_22 for Team1 & Team2 details
merged_sample_22 = sample_22.merge(teams_22, left_on='Team1', right_on='TeamID', suffixes=('', '_Team1'))
merged_sample_22 = merged_sample_22.merge(teams_22, left_on='Team2', right_on='TeamID', suffixes=('_Team1', '_Team2'))
merged_sample_22 = merged_sample_22.drop(columns=['TeamID_Team1', 'TeamID_Team2'])

# Merge sample_23 with teams_23 for Team1 & Team2 details
merged_sample_23 = sample_23.merge(teams_23, left_on='Team1', right_on='TeamID', suffixes=('', '_Team1'))
merged_sample_23 = merged_sample_23.merge(teams_23, left_on='Team2', right_on='TeamID', suffixes=('_Team1', '_Team2'))
merged_sample_23 = merged_sample_23.drop(columns=['TeamID_Team1', 'TeamID_Team2'])

# Merge sample_24 with teams_24 for Team1 & Team2 details
merged_sample_24 = sample_24.merge(teams_24, left_on='Team1', right_on='TeamID', suffixes=('', '_Team1'))
merged_sample_24 = merged_sample_24.merge(teams_24, left_on='Team2', right_on='TeamID', suffixes=('_Team1', '_Team2'))
merged_sample_24 = merged_sample_24.drop(columns=['TeamID_Team1', 'TeamID_Team2'])


In [90]:
# Merge all merged samples into a single DataFrame
merged_all_samples = pd.concat([merged_sample_21, merged_sample_22, merged_sample_23, merged_sample_24], ignore_index=True)

In [111]:
results_21

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
166880,2021,23,1101,70,1190,47,N,0
166881,2021,23,1104,81,1240,57,H,0
166882,2021,23,1111,81,1354,61,A,0
166883,2021,23,1113,94,1348,88,N,0
166884,2021,23,1114,71,1341,66,N,0
...,...,...,...,...,...,...,...,...
170730,2021,132,1104,80,1261,79,N,0
170731,2021,132,1159,85,1259,72,H,0
170732,2021,132,1222,91,1153,54,N,0
170733,2021,132,1228,91,1326,88,N,1


In [114]:
data_21 = pd.concat([
    merged_sample_21.merge(results_21, how='inner', left_on=['Team1', 'Team2'], right_on=['WTeamID', 'LTeamID']),
    merged_sample_21.merge(results_21, how='inner', left_on=['Team1', 'Team2'], right_on=['LTeamID', 'WTeamID'])
], ignore_index=True)

data_22 = pd.concat([
    merged_sample_22.merge(results_22, how='inner', left_on=['Team1', 'Team2'], right_on=['WTeamID', 'LTeamID']),
    merged_sample_22.merge(results_22, how='inner', left_on=['Team1', 'Team2'], right_on=['LTeamID', 'WTeamID'])
], ignore_index=True)

data_23 = pd.concat([
    merged_sample_23.merge(results_23, how='inner', left_on=['Team1', 'Team2'], right_on=['WTeamID', 'LTeamID']),
    merged_sample_23.merge(results_23, how='inner', left_on=['Team1', 'Team2'], right_on=['LTeamID', 'WTeamID'])
], ignore_index=True)

data_24 = pd.concat([
    merged_sample_24.merge(results_24, how='inner', left_on=['Team1', 'Team2'], right_on=['WTeamID', 'LTeamID']),
    merged_sample_24.merge(results_24, how='inner', left_on=['Team1', 'Team2'], right_on=['LTeamID', 'WTeamID'])
], ignore_index=True)

In [115]:
data_21.tail()

Unnamed: 0,ID,Pred,Season_x,Team1,Team2,TeamName_Team1,FirstD1Season_Team1,LastD1Season_Team1,Wins_Team1,AvgScoreDifference_Team1,POM_avg_Team1,FG_PCT_Team1,FG3_PCT_Team1,FT_PCT_Team1,REB_Team1,AST_Team1,TO_Team1,STL_Team1,BLK_Team1,PF_Team1,TeamName_Team2,FirstD1Season_Team2,LastD1Season_Team2,Wins_Team2,AvgScoreDifference_Team2,POM_avg_Team2,FG_PCT_Team2,FG3_PCT_Team2,FT_PCT_Team2,REB_Team2,AST_Team2,TO_Team2,STL_Team2,BLK_Team2,PF_Team2,Season_y,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
3850,2021_1460_1464,0.5,2021,1460,1464,Wright St,1988.0,2025.0,18.0,14.541667,14.541667,,,,,,,,,,Youngstown St,1985.0,2025.0,11.0,-1.608696,-1.608696,,,,,,,,,,2021,67,1464,74,1460,72,A,0
3851,2021_1465_1469,0.5,2021,1465,1469,Cal Baptist,2019.0,2025.0,9.0,-0.210526,-0.210526,,,,,,,,,,Utah Tech,2021.0,2025.0,6.0,-10.578947,-10.578947,,,,,,,,,,2021,89,1469,79,1465,75,A,0
3852,2021_1466_1468,0.5,2021,1466,1468,North Alabama,2019.0,2025.0,9.0,-3.75,-3.75,,,,,,,,,,Bellarmine,2021.0,2025.0,11.0,4.277778,4.277778,,,,,,,,,,2021,102,1468,66,1466,64,A,0
3853,2021_1466_1468,0.5,2021,1466,1468,North Alabama,2019.0,2025.0,9.0,-3.75,-3.75,,,,,,,,,,Bellarmine,2021.0,2025.0,11.0,4.277778,4.277778,,,,,,,,,,2021,103,1468,87,1466,63,A,0
3854,2021_1469_1470,0.5,2021,1469,1470,Utah Tech,2021.0,2025.0,6.0,-10.578947,-10.578947,,,,,,,,,,Tarleton St,2021.0,2025.0,5.0,-4.266667,-4.266667,,,,,,,,,,2021,109,1470,77,1469,59,A,0


Extract matchups with regular season results to train with

In [116]:
def prepare_training_data(sample_submission):
    train_data = sample_submission[sample_submission['DayNum'].notnull()]

    def classify(row):
        return 1 if row['Team1'] == row['WTeamID'] else 0

    train_data['Outcome'] = train_data.apply(classify, axis=1)
    return train_data


In [117]:
train_21 = prepare_training_data(data_21)
train_22 = prepare_training_data(data_22)
train_23 = prepare_training_data(data_23)
train_24 = prepare_training_data(data_24)

In [118]:
train_21['Outcome'].value_counts()

Outcome
0    1943
1    1912
Name: count, dtype: int64

# Model Training

In [119]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import brier_score_loss
from sklearn.metrics import f1_score

# Define features and target
features = ['Wins_Team1', 'AvgScoreDifference_Team1', 'POM_avg_Team1', 'FG_PCT_Team1', 'FG3_PCT_Team1', 'FT_PCT_Team1', 
            'REB_Team1', 'AST_Team1', 'TO_Team1', 'STL_Team1', 'BLK_Team1', 'PF_Team1', 
            'Wins_Team2', 'AvgScoreDifference_Team2', 'POM_avg_Team2', 'FG_PCT_Team2', 'FG3_PCT_Team2', 'FT_PCT_Team2', 
            'REB_Team2', 'AST_Team2', 'TO_Team2', 'STL_Team2', 'BLK_Team2', 'PF_Team2']

# Split the data into training and testing sets
train_data_21, test_data_21 = train_test_split(train_21, test_size=0.2, random_state=42)

X_train_21 = train_data_21[features]
y_train_21 = train_data_21['Outcome']

train_data_22, test_data_22 = train_test_split(train_22, test_size=0.2, random_state=42)

X_train_22 = train_data_22[features]
y_train_22 = train_data_22['Outcome']

# Split the 2023 training data into training and testing sets
train_data_23, test_data_23 = train_test_split(train_23, test_size=0.2, random_state=42)

X_train_23 = train_data_23[features]
y_train_23 = train_data_23['Outcome']

# Split the 2024 training data into training and testing sets
train_data_24, test_data_24 = train_test_split(train_24, test_size=0.2, random_state=42)

X_train_24 = train_data_24[features]
y_train_24 = train_data_24['Outcome']

Random Forest

In [120]:
# Train the Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_21, y_train_21)

print("Model trained successfully.")

# Predict on training data
y_train_pred_21 = rf_model.predict(X_train_21)

# Calculate accuracy
train_accuracy = accuracy_score(y_train_21, y_train_pred_21)
print(f"Random Forest Model accuracy on train_data: {train_accuracy:.2f}")

Model trained successfully.
Random Forest Model accuracy on train_data: 0.89


In [121]:
#RF model 22 Training
# set warm_start and increase num of estimators
rf_model.set_params(warm_start=True, n_estimators=rf_model.n_estimators + 50)  # Add 50 more trees
rf_model.fit(X_train_22, y_train_22) # fit additional 10 trees
print(len(rf_model.estimators_)) # Print the number of trees

# Predict on the train set
y_train_pred_22 = rf_model.predict(X_train_22)

# Calculate accuracy on the train set
train_accuracy_22 = accuracy_score(y_train_22, y_train_pred_22)
print(f"Train set accuracy: {train_accuracy_22:.2f}")

150
Train set accuracy: 0.84


In [122]:
# RF model 23 Training
# Set warm_start and increase the number of estimators
rf_model.set_params(warm_start=True, n_estimators=rf_model.n_estimators + 50)  # Add 50 more trees
rf_model.fit(X_train_23, y_train_23)  # Fit additional trees
print(len(rf_model.estimators_))  # Print the number of trees

# Predict on the train set
y_train_pred_23 = rf_model.predict(X_train_23)

# Calculate accuracy on the train set
train_accuracy_23 = accuracy_score(y_train_23, y_train_pred_23)
print(f"Train set accuracy for 2023: {train_accuracy_23:.2f}")

200
Train set accuracy for 2023: 0.81


In [123]:
# RF model 24 Training
# Set warm_start and increase the number of estimators
rf_model.set_params(warm_start=True, n_estimators=rf_model.n_estimators + 50)  # Add 50 more trees
rf_model.fit(X_train_24, y_train_24)  # Fit additional trees
print(len(rf_model.estimators_))  # Print the number of trees

# Predict on the train set
y_train_pred_24 = rf_model.predict(X_train_24)

# Calculate accuracy on the train set
train_accuracy_24 = accuracy_score(y_train_24, y_train_pred_24)
print(f"Train set accuracy for 2024: {train_accuracy_24:.2f}")

250
Train set accuracy for 2024: 0.79


In [124]:
# Predict on 2021 test data
y_test_pred_rf_21 = rf_model.predict(test_data_21[features])

# Calculate accuracy on 2021 test data
test_accuracy = brier_score_loss(test_data_21['Outcome'], y_test_pred_rf_21) #Assess RF
f1_score_rf = f1_score(test_data_21['Outcome'], y_test_pred_rf_21)
print(f"RF model brier score loss on 2021 test_data: {test_accuracy:.2f}")
print(f"F1 Score for RF model: {f1_score_rf:.2f}")

# Predict on 2022 test data
y_test_pred_rf_22 = rf_model.predict(test_data_22[features])

# Calculate accuracy on 2022 test data
test_accuracy_22 = brier_score_loss(test_data_22['Outcome'], y_test_pred_rf_22) #Assess RF
f1_score_rf_22 = f1_score(test_data_22['Outcome'], y_test_pred_rf_22)
print(f"RF model brier score loss on 2022 test_data: {test_accuracy_22:.2f}")
print(f"F1 Score for RF model: {f1_score_rf_22:.2f}")

# Predict on 2023 test data
y_test_pred_rf_23 = rf_model.predict(test_data_23[features])

# Calculate accuracy on 2023 test data
test_accuracy_23 = brier_score_loss(test_data_23['Outcome'], y_test_pred_rf_23) #Assess RF
f1_score_rf_23 = f1_score(test_data_23['Outcome'], y_test_pred_rf_23)
print(f"RF model brier score loss on 2023 test_data: {test_accuracy_23:.2f}")
print(f"F1 Score for RF model: {f1_score_rf_23:.2f}")

# Predict on 2024 test data
y_test_pred_rf_24 = rf_model.predict(test_data_24[features])

# Calculate accuracy on 2024 test data
test_accuracy_24 = brier_score_loss(test_data_24['Outcome'], y_test_pred_rf_24) #Assess RF
f1_score_rf_24 = f1_score(test_data_24['Outcome'], y_test_pred_rf_24)
print(f"RF model brier score loss on 2024 test_data: {test_accuracy_24:.2f}")
print(f"F1 Score for RF model: {f1_score_rf_24:.2f}")

RF model brier score loss on 2021 test_data: 0.29
F1 Score for RF model: 0.72
RF model brier score loss on 2022 test_data: 0.26
F1 Score for RF model: 0.72
RF model brier score loss on 2023 test_data: 0.29
F1 Score for RF model: 0.71
RF model brier score loss on 2024 test_data: 0.30
F1 Score for RF model: 0.70


In [None]:
# Predict on merged_all_samples
merged_all_samples['Predicted_Outcome'] = rf_model.predict(merged_all_samples_features)

# Display the first few rows of the predictions
merged_all_samples.fillna(0, inplace=True)

# Update 'Pred' feature with probabilities
merged_all_samples['Pred'] = rf_model.predict_proba(merged_all_samples[features])[:, 1]

# Display the first few rows with updated probabilities
merged_all_samples.head()

Unnamed: 0,ID,Pred,Season,Team1,Team2,TeamName_Team1,FirstD1Season_Team1,LastD1Season_Team1,Wins_Team1,AvgScoreDifference_Team1,POM_avg_Team1,FG_PCT_Team1,FG3_PCT_Team1,FT_PCT_Team1,REB_Team1,AST_Team1,TO_Team1,STL_Team1,BLK_Team1,PF_Team1,TeamName_Team2,FirstD1Season_Team2,LastD1Season_Team2,Wins_Team2,AvgScoreDifference_Team2,POM_avg_Team2,FG_PCT_Team2,FG3_PCT_Team2,FT_PCT_Team2,REB_Team2,AST_Team2,TO_Team2,STL_Team2,BLK_Team2,PF_Team2,Predicted_Outcome
0,2021_1101_1102,0.8536,2021,1101,1102,Abilene Chr,2014.0,2025.0,19.0,14.565217,14.565217,0.05646,0.092365,-0.005793,2.0,8.130435,-4.869565,2.26087,-0.782609,0.086957,Air Force,1985.0,2025.0,5.0,-13.2,-13.2,-0.054251,-0.032093,0.002815,-11.36,-2.4,-0.2,1.08,0.36,-0.12,1
1,2021_1101_1103,0.678629,2021,1101,1103,Abilene Chr,2014.0,2025.0,19.0,14.565217,14.565217,0.05646,0.092365,-0.005793,2.0,8.130435,-4.869565,2.26087,-0.782609,0.086957,Akron,1985.0,2025.0,13.0,3.809524,3.809524,0.005164,0.023954,0.037146,1.714286,0.380952,0.238095,-1.0,1.333333,0.095238,1
2,2021_1101_1104,0.4684,2021,1101,1104,Abilene Chr,2014.0,2025.0,19.0,14.565217,14.565217,0.05646,0.092365,-0.005793,2.0,8.130435,-4.869565,2.26087,-0.782609,0.086957,Alabama,1985.0,2025.0,24.0,9.8,9.8,0.024123,0.068166,0.025954,1.766667,3.1,-1.2,0.633333,-0.5,0.166667,0
3,2021_1101_1105,0.799133,2021,1101,1105,Abilene Chr,2014.0,2025.0,19.0,14.565217,14.565217,0.05646,0.092365,-0.005793,2.0,8.130435,-4.869565,2.26087,-0.782609,0.086957,Alabama A&M,2000.0,2025.0,6.0,-3.866667,-3.866667,0.002321,-0.012357,-0.047485,-1.0,-1.533333,2.2,-2.6,0.933333,1.466667,1
4,2021_1101_1106,0.8396,2021,1101,1106,Abilene Chr,2014.0,2025.0,19.0,14.565217,14.565217,0.05646,0.092365,-0.005793,2.0,8.130435,-4.869565,2.26087,-0.782609,0.086957,Alabama St,1985.0,2025.0,4.0,-7.333333,-7.333333,-0.055723,-0.009602,-0.043857,-1.777778,-2.888889,1.277778,-1.611111,-1.777778,-2.5,1


In [127]:
merged_all_samples[['ID', 'Pred']].to_csv('submission.csv', index=False)

# Predict 2025

In [133]:
SampleSubmissionStage2['Season'] = SampleSubmissionStage2['ID'].apply(lambda x: int(x.split('_')[0]))
SampleSubmissionStage2['Team1'] = SampleSubmissionStage2['ID'].apply(lambda x: int(x.split('_')[1]))
SampleSubmissionStage2['Team2'] = SampleSubmissionStage2['ID'].apply(lambda x: int(x.split('_')[2]))

In [134]:
# Prepare data for the 2025 season
teams_25, results_25, rankings_25, sample_25, detailed_25 = regular_season_data(
    2025, MTeams, MRegularSeasonCompactResults, MMasseyOrdinals, SampleSubmissionStage2, MRegularSeasonDetailedResults
)

# Add calculated features for the 2025 season
teams_25['Wins'] = teams_25['TeamID'].apply(lambda x: record_wins(x, results_25))
teams_25['AvgScoreDifference'] = teams_25['TeamID'].apply(lambda x: avg_score_difference(x, results_25))
teams_25['POM_avg'] = teams_25['TeamID'].apply(lambda x: avg_score_difference(x, results_25))

team_stats = teams_25['TeamID'].apply(lambda x: team_stats_differences(x, detailed_25))
team_stats_df = pd.DataFrame(team_stats.tolist(), columns=['FG_PCT', 'FG3_PCT', 'FT_PCT', 'REB', 'AST', 'TO', 'STL', 'BLK', 'PF'])
teams_25 = pd.concat([teams_25, team_stats_df], axis=1)

# Merge sample_25 with teams_25 for Team1 & Team2 details
merged_sample_25 = sample_25.merge(teams_25, left_on='Team1', right_on='TeamID', suffixes=('', '_Team1'))
merged_sample_25 = merged_sample_25.merge(teams_25, left_on='Team2', right_on='TeamID', suffixes=('_Team1', '_Team2'))
merged_sample_25 = merged_sample_25.drop(columns=['TeamID_Team1', 'TeamID_Team2'])

# Extract matchups with regular season results for training
data_25 = pd.concat([
    merged_sample_25.merge(results_25, how='inner', left_on=['Team1', 'Team2'], right_on=['WTeamID', 'LTeamID']),
    merged_sample_25.merge(results_25, how='inner', left_on=['Team1', 'Team2'], right_on=['LTeamID', 'WTeamID'])
], ignore_index=True)

# Prepare training data for the 2025 season
train_25 = prepare_training_data(data_25)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  teams_25['Wins'] = teams_25['TeamID'].apply(lambda x: record_wins(x, results_25))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  teams_25['AvgScoreDifference'] = teams_25['TeamID'].apply(lambda x: avg_score_difference(x, results_25))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  teams_25['POM_avg'

In [135]:
# Define features and target for train_25
X_train_25 = train_25[features]
y_train_25 = train_25['Outcome']

# Set warm_start and increase the number of estimators
rf_model.set_params(warm_start=True, n_estimators=rf_model.n_estimators + 50)  # Add 50 more trees
rf_model.fit(X_train_25, y_train_25)  # Fit additional trees
print(len(rf_model.estimators_))  # Print the number of trees

# Predict on the train set
y_train_pred_25 = rf_model.predict(X_train_25)

# Calculate accuracy on the train set
train_accuracy_25 = accuracy_score(y_train_25, y_train_pred_25)
print(f"Train set accuracy for 2025: {train_accuracy_25:.2f}")

300
Train set accuracy for 2025: 0.79


In [138]:
# Define the features for prediction
merged_sample_25_features = merged_sample_25[features]

# Make predictions
merged_sample_25['Predicted_Outcome'] = rf_model.predict(merged_sample_25_features)

# Add probabilities for the predictions
merged_sample_25['Pred'] = rf_model.predict_proba(merged_sample_25_features)[:, 1]

# Display the first few rows of the predictions
merged_sample_25.head()

Unnamed: 0,ID,Pred,Season,Team1,Team2,TeamName_Team1,FirstD1Season_Team1,LastD1Season_Team1,Wins_Team1,AvgScoreDifference_Team1,POM_avg_Team1,FG_PCT_Team1,FG3_PCT_Team1,FT_PCT_Team1,REB_Team1,AST_Team1,TO_Team1,STL_Team1,BLK_Team1,PF_Team1,TeamName_Team2,FirstD1Season_Team2,LastD1Season_Team2,Wins_Team2,AvgScoreDifference_Team2,POM_avg_Team2,FG_PCT_Team2,FG3_PCT_Team2,FT_PCT_Team2,REB_Team2,AST_Team2,TO_Team2,STL_Team2,BLK_Team2,PF_Team2,Predicted_Outcome,Predicted_Probability
0,2025_1101_1102,0.692151,2025,1101,1102,Abilene Chr,2014.0,2025.0,12.0,-3.153846,-3.153846,-0.027903,-0.037382,0.005856,-3.363636,-0.409091,-1.5,1.909091,-1.727273,3.181818,Air Force,1985.0,2025.0,4.0,-10.866667,-10.866667,-0.053142,-0.031143,-0.066576,-4.423077,0.576923,2.076923,-0.923077,-0.153846,-0.230769,1,0.692151
1,2025_1101_1103,0.127448,2025,1101,1103,Abilene Chr,2014.0,2025.0,12.0,-3.153846,-3.153846,-0.027903,-0.037382,0.005856,-3.363636,-0.409091,-1.5,1.909091,-1.727273,3.181818,Akron,1985.0,2025.0,22.0,7.107143,7.107143,0.039659,0.050411,0.020265,3.958333,6.166667,-0.041667,0.125,1.0,2.083333,0,0.127448
2,2025_1101_1104,0.144611,2025,1101,1104,Abilene Chr,2014.0,2025.0,12.0,-3.153846,-3.153846,-0.027903,-0.037382,0.005856,-3.363636,-0.409091,-1.5,1.909091,-1.727273,3.181818,Alabama,1985.0,2025.0,23.0,10.896552,10.896552,0.065714,0.0465,0.003326,7.84,5.0,2.6,-1.76,0.8,-2.76,0,0.144611
3,2025_1101_1105,0.586688,2025,1101,1105,Abilene Chr,2014.0,2025.0,12.0,-3.153846,-3.153846,-0.027903,-0.037382,0.005856,-3.363636,-0.409091,-1.5,1.909091,-1.727273,3.181818,Alabama A&M,2000.0,2025.0,7.0,-10.0,-10.0,-0.075168,-0.06407,-0.0442,-2.304348,-3.347826,1.043478,-0.913043,0.869565,3.086957,1,0.586688
4,2025_1101_1106,0.398028,2025,1101,1106,Abilene Chr,2014.0,2025.0,12.0,-3.153846,-3.153846,-0.027903,-0.037382,0.005856,-3.363636,-0.409091,-1.5,1.909091,-1.727273,3.181818,Alabama St,1985.0,2025.0,13.0,-1.214286,-1.214286,-0.040304,-0.016624,-0.0114,-5.56,-1.72,-2.68,1.52,-0.28,2.2,0,0.398028


In [139]:
merged_sample_25[['ID', 'Pred']].to_csv('submission_2025.csv', index=False)