In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Data Load

In [None]:
WTeams = pd.read_csv('WTeams.csv')
WSeasons = pd.read_csv('WSeasons.csv')
WRegularSeasonCompactResults = pd.read_csv('WRegularSeasonCompactResults.csv')
WRegularSeasonDetailedResults = pd.read_csv('WRegularSeasonDetailedResults.csv')
SampleSubmissionStage1 = pd.read_csv('SampleSubmissionStage1.csv')

In [None]:
SampleSubmissionStage1['Season'] = SampleSubmissionStage1['ID'].apply(lambda x: int(x.split('_')[0]))
SampleSubmissionStage1['Team1'] = SampleSubmissionStage1['ID'].apply(lambda x: int(x.split('_')[1]))
SampleSubmissionStage1['Team2'] = SampleSubmissionStage1['ID'].apply(lambda x: int(x.split('_')[2]))
SampleSubmissionStage1 = SampleSubmissionStage1[SampleSubmissionStage1['Team1'].astype(str).str.startswith('3') & SampleSubmissionStage1['Team2'].astype(str).str.startswith('3')]

# Prepare Data for Training

In [None]:
from tqdm import tqdm

def record_record(Team1, data):
    Team1_wins = 0
    for index, row in data.iterrows():
        if row['WTeamID'] == Team1:
            Team1_wins += 1
    return Team1_wins

def avg_score_difference(Team1, data):
    Team1_score_difference = 0
    games = 0
    for index, row in data.iterrows():
        if row['WTeamID'] == Team1:
            Team1_score_difference += row['WScore'] - row['LScore']
            games += 1
        if row['LTeamID'] == Team1:
            Team1_score_difference += row['LScore'] - row['WScore']
            games += 1
    return Team1_score_difference/games if games != 0 else 0

def team_stats_differences(Team1, data):
    Team1_FG_PCT = 0
    Team1_FG3_PCT = 0
    Team1_FT_PCT = 0
    Team1_REB = 0
    Team1_AST = 0
    Team1_TO = 0
    Team1_STL = 0
    Team1_BLK = 0
    Team1_PF = 0
    games = 0
    for index, row in data.iterrows():
        if row['WTeamID'] == Team1:
            Team1_FG_PCT += (row['WFGM'] / row['WFGA'] if row['WFGA'] != 0 else 0) - (row['LFGM'] / row['LFGA'] if row['LFGA'] != 0 else 0)
            Team1_FG3_PCT += (row['WFGM3'] / row['WFGA3'] if row['WFGA3'] != 0 else 0) - (row['LFGM3'] / row['LFGA3'] if row['LFGA3'] != 0 else 0)
            Team1_FT_PCT += (row['WFTM'] / row['WFTA'] if row['WFTA'] != 0 else 0) - (row['LFTM'] / row['LFTA'] if row['LFTA'] != 0 else 0)
            Team1_REB += (row['WOR'] + row['WDR']) - (row['LOR'] + row['LDR'])
            Team1_AST += row['WAst'] - row['LAst']
            Team1_TO += row['WTO'] - row['LTO']
            Team1_STL += row['WStl'] - row['LStl']
            Team1_BLK += row['WBlk'] - row['LBlk']
            Team1_PF += row['WPF'] - row['LPF']
            games += 1
        if row['LTeamID'] == Team1:
            Team1_FG_PCT += (row['LFGM'] / row['LFGA'] if row['LFGA'] != 0 else 0) - (row['WFGM'] / row['WFGA'] if row['WFGA'] != 0 else 0)
            Team1_FG3_PCT += (row['LFGM3'] / row['LFGA3'] if row['LFGA3'] != 0 else 0) - (row['WFGM3'] / row['WFGA3'] if row['WFGA3'] != 0 else 0)
            Team1_FT_PCT += (row['LFTM'] / row['LFTA'] if row['LFTA'] != 0 else 0) - (row['WFTM'] / row['WFTA'] if row['WFTA'] != 0 else 0)
            Team1_REB += (row['LOR'] + row['LDR']) - (row['WOR'] + row['WDR'])
            Team1_AST += row['LAst'] - row['WAst']
            Team1_TO += row['LTO'] - row['WTO']
            Team1_STL += row['LStl'] - row['WStl']
            Team1_BLK += row['LBlk'] - row['WBlk']
            Team1_PF += row['LPF'] - row['WPF']
            games += 1
    return (Team1_FG_PCT/games if games != 0 else 0, 
            Team1_FG3_PCT/games if games != 0 else 0, 
            Team1_FT_PCT/games if games != 0 else 0, 
            Team1_REB/games if games != 0 else 0, 
            Team1_AST/games if games != 0 else 0, 
            Team1_TO/games if games != 0 else 0, 
            Team1_STL/games if games != 0 else 0, 
            Team1_BLK/games if games != 0 else 0, 
            Team1_PF/games if games != 0 else 0)

In [None]:
ModelInput21 = pd.DataFrame(WTeams['TeamID'])

In [None]:
ModelInput21['Wins'] = ModelInput21['TeamID'].apply(lambda x: record_record(x, WRegularSeasonCompactResults[WRegularSeasonCompactResults['Season'] == 2021]))

In [None]:
ModelInput21['avg_score_diff'] = ModelInput21['TeamID'].apply(lambda x: avg_score_difference(x, WRegularSeasonCompactResults[WRegularSeasonCompactResults['Season'] == 2021]))

In [None]:
team_stats = ModelInput21['TeamID'].apply(lambda x: team_stats_differences(x, WRegularSeasonDetailedResults[WRegularSeasonDetailedResults['Season'] == 2021]))
team_stats_df = pd.DataFrame(team_stats.tolist(), columns=['FG_PCT', 'FG3_PCT', 'FT_PCT', 'REB', 'AST', 'TO', 'STL', 'BLK', 'PF'])
ModelInput21 = pd.concat([ModelInput21, team_stats_df], axis=1)

In [None]:
WRegularSeasonCompactResults['Outcome'] = np.where(
    WRegularSeasonCompactResults['WTeamID'] > WRegularSeasonCompactResults['LTeamID'], 
    1, 
    0
)

In [None]:
ModelInput22 = pd.DataFrame(WTeams['TeamID'])
ModelInput23 = pd.DataFrame(WTeams['TeamID'])
ModelInput24 = pd.DataFrame(WTeams['TeamID'])

In [None]:
ModelInput22['Wins'] = ModelInput22['TeamID'].apply(lambda x: record_record(x, WRegularSeasonCompactResults[WRegularSeasonCompactResults['Season'] == 2022]))
ModelInput23['Wins'] = ModelInput23['TeamID'].apply(lambda x: record_record(x, WRegularSeasonCompactResults[WRegularSeasonCompactResults['Season'] == 2023]))
ModelInput24['Wins'] = ModelInput24['TeamID'].apply(lambda x: record_record(x, WRegularSeasonCompactResults[WRegularSeasonCompactResults['Season'] == 2024]))


In [None]:
ModelInput22['avg_score_diff'] = ModelInput22['TeamID'].apply(lambda x: avg_score_difference(x, WRegularSeasonCompactResults[WRegularSeasonCompactResults['Season'] == 2022]))
ModelInput23['avg_score_diff'] = ModelInput23['TeamID'].apply(lambda x: avg_score_difference(x, WRegularSeasonCompactResults[WRegularSeasonCompactResults['Season'] == 2023]))
ModelInput24['avg_score_diff'] = ModelInput24['TeamID'].apply(lambda x: avg_score_difference(x, WRegularSeasonCompactResults[WRegularSeasonCompactResults['Season'] == 2024]))


In [None]:
team_stats = ModelInput22['TeamID'].apply(lambda x: team_stats_differences(x, WRegularSeasonDetailedResults[WRegularSeasonDetailedResults['Season'] == 2022]))
team_stats_df = pd.DataFrame(team_stats.tolist(), columns=['FG_PCT', 'FG3_PCT', 'FT_PCT', 'REB', 'AST', 'TO', 'STL', 'BLK', 'PF'])
ModelInput22 = pd.concat([ModelInput22, team_stats_df], axis=1)
team_stats = ModelInput23['TeamID'].apply(lambda x: team_stats_differences(x, WRegularSeasonDetailedResults[WRegularSeasonDetailedResults['Season'] == 2023]))
team_stats_df = pd.DataFrame(team_stats.tolist(), columns=['FG_PCT', 'FG3_PCT', 'FT_PCT', 'REB', 'AST', 'TO', 'STL', 'BLK', 'PF'])
ModelInput23 = pd.concat([ModelInput23, team_stats_df], axis=1)
team_stats = ModelInput22['TeamID'].apply(lambda x: team_stats_differences(x, WRegularSeasonDetailedResults[WRegularSeasonDetailedResults['Season'] == 2024]))
team_stats_df = pd.DataFrame(team_stats.tolist(), columns=['FG_PCT', 'FG3_PCT', 'FT_PCT', 'REB', 'AST', 'TO', 'STL', 'BLK', 'PF'])
ModelInput24 = pd.concat([ModelInput24, team_stats_df], axis=1)

In [None]:
# Prepare training data for the year 2021
train_data = SampleSubmissionStage1[SampleSubmissionStage1['Season'] == 2021]
train_data = train_data.merge(ModelInput21, left_on='Team1', right_on='TeamID')
train_data = train_data.merge(ModelInput21, left_on='Team2', right_on='TeamID', suffixes=('_Team1', '_Team2'))
train_data = train_data.merge(
    WRegularSeasonCompactResults[['Season', 'WTeamID', 'LTeamID', 'Outcome']],
    how='inner',
    left_on=['Season', 'Team1', 'Team2'],
    right_on=['Season', 'WTeamID', 'LTeamID']
).merge(
    WRegularSeasonCompactResults[['Season', 'WTeamID', 'LTeamID', 'Outcome']],
    how='left',
    left_on=['Season', 'Team2', 'Team1'],
    right_on=['Season', 'WTeamID', 'LTeamID'],
    suffixes=('_Team1_Win', '_Team2_Win')
)

In [None]:
train_data['Outcome_Team2_Win'].fillna(0, inplace=True)
train_data.loc[train_data['Outcome_Team2_Win'] == 0, 'Outcome_Team1_Win'] = 1
train_data['Outcome'] = np.where(
    train_data['Outcome_Team1_Win'] == 1,
    1,
    0
)
train_data.drop(columns=['WTeamID_Team2_Win', 'LTeamID_Team2_Win'], inplace=True)

In [None]:
# Prepare training data for 2022
train_data_2022 = SampleSubmissionStage1[SampleSubmissionStage1['Season'] == 2022]
train_data_2022 = train_data_2022.merge(ModelInput22, left_on='Team1', right_on='TeamID')
train_data_2022 = train_data_2022.merge(ModelInput22, left_on='Team2', right_on='TeamID', suffixes=('_Team1', '_Team2'))
train_data_2022 = train_data_2022.merge(
    WRegularSeasonCompactResults[['Season', 'WTeamID', 'LTeamID', 'Outcome']],
    how='inner',
    left_on=['Season', 'Team1', 'Team2'],
    right_on=['Season', 'WTeamID', 'LTeamID']
).merge(
    WRegularSeasonCompactResults[['Season', 'WTeamID', 'LTeamID', 'Outcome']],
    how='left',
    left_on=['Season', 'Team2', 'Team1'],
    right_on=['Season', 'WTeamID', 'LTeamID'],
    suffixes=('_Team1_Win', '_Team2_Win')
)

train_data_2022['Outcome_Team2_Win'].fillna(0, inplace=True)
train_data_2022.loc[train_data_2022['Outcome_Team2_Win'] == 0, 'Outcome_Team1_Win'] = 1
train_data_2022['Outcome'] = np.where(
    train_data_2022['Outcome_Team1_Win'] == 1,
    1,
    0
)
train_data_2022.drop(columns=['WTeamID_Team2_Win', 'LTeamID_Team2_Win'], inplace=True)

In [32]:
# Prepare training data for 2023
train_data_2023 = SampleSubmissionStage1[SampleSubmissionStage1['Season'] == 2023]
train_data_2023 = train_data_2023.merge(ModelInput23, left_on='Team1', right_on='TeamID')
train_data_2023 = train_data_2023.merge(ModelInput23, left_on='Team2', right_on='TeamID', suffixes=('_Team1', '_Team2'))
train_data_2023 = train_data_2023.merge(
    WRegularSeasonCompactResults[['Season', 'WTeamID', 'LTeamID', 'Outcome']],
    how='inner',
    left_on=['Season', 'Team1', 'Team2'],
    right_on=['Season', 'WTeamID', 'LTeamID']
).merge(
    WRegularSeasonCompactResults[['Season', 'WTeamID', 'LTeamID', 'Outcome']],
    how='left',
    left_on=['Season', 'Team2', 'Team1'],
    right_on=['Season', 'WTeamID', 'LTeamID'],
    suffixes=('_Team1_Win', '_Team2_Win')
)

train_data_2023['Outcome_Team2_Win'].fillna(0, inplace=True)
train_data_2023.loc[train_data_2023['Outcome_Team2_Win'] == 0, 'Outcome_Team1_Win'] = 1
train_data_2023['Outcome'] = np.where(
    train_data_2023['Outcome_Team1_Win'] == 1,
    1,
    0
)
train_data_2023.drop(columns=['WTeamID_Team2_Win', 'LTeamID_Team2_Win'], inplace=True)

print("Data formatting for 2023 model input completed.")

Data formatting for 2023 model input completed.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data_2023['Outcome_Team2_Win'].fillna(0, inplace=True)


In [33]:
# Prepare training data for 2024
train_data_2024 = SampleSubmissionStage1[SampleSubmissionStage1['Season'] == 2024]
train_data_2024 = train_data_2024.merge(ModelInput24, left_on='Team1', right_on='TeamID')
train_data_2024 = train_data_2024.merge(ModelInput24, left_on='Team2', right_on='TeamID', suffixes=('_Team1', '_Team2'))
train_data_2024 = train_data_2024.merge(
    WRegularSeasonCompactResults[['Season', 'WTeamID', 'LTeamID', 'Outcome']],
    how='inner',
    left_on=['Season', 'Team1', 'Team2'],
    right_on=['Season', 'WTeamID', 'LTeamID']
).merge(
    WRegularSeasonCompactResults[['Season', 'WTeamID', 'LTeamID', 'Outcome']],
    how='left',
    left_on=['Season', 'Team2', 'Team1'],
    right_on=['Season', 'WTeamID', 'LTeamID'],
    suffixes=('_Team1_Win', '_Team2_Win')
)

train_data_2024['Outcome_Team2_Win'].fillna(0, inplace=True)
train_data_2024.loc[train_data_2024['Outcome_Team2_Win'] == 0, 'Outcome_Team1_Win'] = 1
train_data_2024['Outcome'] = np.where(
    train_data_2024['Outcome_Team1_Win'] == 1,
    1,
    0
)
train_data_2024.drop(columns=['WTeamID_Team2_Win', 'LTeamID_Team2_Win'], inplace=True)

print("Data formatting for 2024 model input completed.")

Data formatting for 2024 model input completed.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data_2024['Outcome_Team2_Win'].fillna(0, inplace=True)


# Train Models

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import brier_score_loss
from sklearn.metrics import f1_score

# Define features and target
features = ['Wins_Team1', 'avg_score_diff_Team1', 'FG_PCT_Team1', 'FG3_PCT_Team1', 'FT_PCT_Team1', 
            'REB_Team1', 'AST_Team1', 'TO_Team1', 'STL_Team1', 'BLK_Team1', 'PF_Team1', 
            'Wins_Team2', 'avg_score_diff_Team2', 'FG_PCT_Team2', 'FG3_PCT_Team2', 'FT_PCT_Team2', 
            'REB_Team2', 'AST_Team2', 'TO_Team2', 'STL_Team2', 'BLK_Team2', 'PF_Team2']

# Split the data into training and testing sets
train_data_21, test_data_21 = train_test_split(train_data, test_size=0.2, random_state=42)

X_train_21 = train_data[features]
y_train_21 = train_data['Outcome']

train_data_22, test_data_22 = train_test_split(train_data_2022, test_size=0.2, random_state=42)

X_train_22 = train_data_22[features]
y_train_22 = train_data_22['Outcome']

# Split the 2023 training data into training and testing sets
train_data_23, test_data_23 = train_test_split(train_data_2023, test_size=0.2, random_state=42)

X_train_23 = train_data_23[features]
y_train_23 = train_data_23['Outcome']

# Split the 2024 training data into training and testing sets
train_data_24, test_data_24 = train_test_split(train_data_2024, test_size=0.2, random_state=42)

X_train_24 = train_data_24[features]
y_train_24 = train_data_24['Outcome']

Random Forest Classifier

In [37]:
# Train the Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_21, y_train_21)

print("Model trained successfully.")

# Predict on training data
y_train_pred_21 = rf_model.predict(X_train_21)

# Calculate accuracy
train_accuracy = accuracy_score(y_train_21, y_train_pred_21)
print(f"Random Forest Model accuracy on train_data: {train_accuracy:.2f}")

Model trained successfully.
Random Forest Model accuracy on train_data: 1.00


In [38]:
#RF model 22 Training
# set warm_start and increase num of estimators
rf_model.set_params(warm_start=True, n_estimators=rf_model.n_estimators + 50)  # Add 50 more trees
rf_model.fit(X_train_22, y_train_22) # fit additional 10 trees
print(len(rf_model.estimators_)) # Print the number of trees

# Predict on the train set
y_train_pred_22 = rf_model.predict(X_train_22)

# Calculate accuracy on the train set
train_accuracy_22 = accuracy_score(y_train_22, y_train_pred_22)
print(f"Train set accuracy: {train_accuracy_22:.2f}")

150
Train set accuracy: 0.92


In [39]:
#RF model 23 Training
# Update the Random Forest model for warm start
rf_model.set_params(warm_start=True, n_estimators=rf_model.n_estimators + 50)  # Add 50 more trees
rf_model.fit(X_train_23, y_train_23)
print(len(rf_model.estimators_)) # Print the number of trees

# Predict on the train set
y_train_pred_23 = rf_model.predict(X_train_23)

# Calculate accuracy on the train set
train_accuracy_2023 = accuracy_score(y_train_23, y_train_pred_23)
print(f"Train set accuracy after retraining with 2023 data: {train_accuracy_2023:.2f}")

200
Train set accuracy after retraining with 2023 data: 0.89


In [41]:
#RF model 24 Training
# Update the Random Forest model for warm start
rf_model.set_params(warm_start=True, n_estimators=rf_model.n_estimators + 50)  # Add 50 more trees
rf_model.fit(X_train_24, y_train_24)
print(len(rf_model.estimators_)) # Print the number of trees

# Predict on the train set
y_train_pred_24 = rf_model.predict(X_train_24)

# Calculate accuracy on the train set
train_accuracy_2024 = accuracy_score(y_train_24, y_train_pred_24)
print(f"Train set accuracy after retraining with 2024 data: {train_accuracy_2024:.2f}")

300
Train set accuracy after retraining with 2024 data: 0.94


Logistic Regression Classifier

In [None]:
from sklearn.linear_model import LogisticRegression
log_reg_model = LogisticRegression(random_state=42)
log_reg_model.fit(X_train_21, y_train_21)

print("Model trained successfully.")

# Predict on training data
y_train_pred_21 = log_reg_model.predict(X_train_21)

# Calculate accuracy
train_accuracy = accuracy_score(y_train_21, y_train_pred_21)
print(f"Random Forest Model accuracy on train_data: {train_accuracy:.2f}")

In [44]:
# Logistic Regression model for 2022 Training with warm start
log_reg_model.set_params(max_iter=200, warm_start=True)  # Enable warm start and update max_iter if needed
log_reg_model.fit(X_train_22, y_train_22)

# Predict on the train set
y_train_pred_22 = log_reg_model.predict(X_train_22)

# Calculate accuracy on the train set
train_accuracy_22 = accuracy_score(y_train_22, y_train_pred_22)
print(f"Train set accuracy for Logistic Regression (2022): {train_accuracy_22:.2f}")


Train set accuracy for Logistic Regression (2022): 0.81


In [46]:
# Logistic Regression model for 2022 Training with warm start
log_reg_model.set_params(max_iter=200, warm_start=True)  # Enable warm start and update max_iter if needed
log_reg_model.fit(X_train_23, y_train_23)

# Predict on the train set
y_train_pred_23 = log_reg_model.predict(X_train_23)

# Calculate accuracy on the train set
train_accuracy_23 = accuracy_score(y_train_23, y_train_pred_23)
print(f"Train set accuracy for Logistic Regression (2023): {train_accuracy_23:.2f}")


Train set accuracy for Logistic Regression (2023): 0.79


In [47]:
# Logistic Regression model for 2022 Training with warm start
log_reg_model.set_params(max_iter=200, warm_start=True)  # Enable warm start and update max_iter if needed
log_reg_model.fit(X_train_24, y_train_24)

# Predict on the train set
y_train_pred_24 = log_reg_model.predict(X_train_24)

# Calculate accuracy on the train set
train_accuracy_24 = accuracy_score(y_train_24, y_train_pred_24)
print(f"Train set accuracy for Logistic Regression (2024): {train_accuracy_22:.2f}")


Train set accuracy for Logistic Regression (2024): 0.81


Histogram Gradient Boosting Classifier

In [61]:
from sklearn.ensemble import HistGradientBoostingClassifier
model = HistGradientBoostingClassifier(
    max_iter=100,  # Number of trees
    learning_rate=0.1,
    max_depth=10,
    min_samples_leaf=20,
    random_state=42
)
model.fit(X_train_21, y_train_21)

# Predict on the train set
y_train_pred_21 = model.predict(X_train_21)

# Calculate accuracy on the train set
train_accuracy_21 = accuracy_score(y_train_21, y_train_pred_21)
print(f"Train set accuracy for HGB Classifier (2021): {train_accuracy_22:.2f}")

Train set accuracy for HGB Classifier (2021): 0.78


In [62]:
# Histogram Gradient Boosting model for 2022 Training with warm start
model.set_params(warm_start=True, max_iter=model.max_iter + 100)  # Increment iterations
model.fit(X_train_22, y_train_22)

# Predict on the train set
y_train_pred_22 = model.predict(X_train_22)

# Calculate accuracy on the train set
train_accuracy_22 = accuracy_score(y_train_22, y_train_pred_22)
print(f"Train set accuracy for HGB Classifier (2022): {train_accuracy_22:.2f}")

Train set accuracy for HGB Classifier (2022): 0.99


In [63]:
# Histogram Gradient Boosting model for 2023 Training with warm start
model.set_params(warm_start=True, max_iter=model.max_iter + 100)  # Increment iterations
model.fit(X_train_23, y_train_23)

# Predict on the train set
y_train_pred_23 = model.predict(X_train_23)

# Calculate accuracy on the train set
train_accuracy_23 = accuracy_score(y_train_23, y_train_pred_23)
print(f"Train set accuracy for HGB Classifier (2023): {train_accuracy_23:.2f}")

Train set accuracy for HGB Classifier (2023): 0.97


In [64]:
# Histogram Gradient Boosting model for 2024 Training with warm start
model.set_params(warm_start=True, max_iter=model.max_iter + 100)  # Increment iterations
model.fit(X_train_24, y_train_24)

# Predict on the train set
y_train_pred_24 = model.predict(X_train_24)

# Calculate accuracy on the train set
train_accuracy_24 = accuracy_score(y_train_24, y_train_pred_24)
print(f"Train set accuracy for HGB Classifier (2024): {train_accuracy_24:.2f}")

Train set accuracy for HGB Classifier (2024): 0.96


Passive Aggressive Classifier

In [59]:
from sklearn.linear_model import PassiveAggressiveClassifier
pa_classifier = PassiveAggressiveClassifier(C=1.0, max_iter=1000, random_state=42)
pa_classifier.fit(X_train_21, y_train_21)

# Predict on the train set
y_train_pred_21 = model.predict(X_train_21)

# Calculate accuracy on the train set
train_accuracy_21 = accuracy_score(y_train_21, y_train_pred_21)
print(f"Train set accuracy for passive aggressive classifier (2021): {train_accuracy_21:.2f}")

Train set accuracy for passive aggressive classifier (2021): 0.83


In [60]:
# Passive Aggressive Classifier for 2022 Training with warm start
pa_classifier.set_params(max_iter=pa_classifier.max_iter + 100, warm_start=True)  # Enable warm start and add iterations
pa_classifier.fit(X_train_22, y_train_22)

# Predict on the train set
y_train_pred_22 = pa_classifier.predict(X_train_22)

# Calculate accuracy on the train set
train_accuracy_22 = accuracy_score(y_train_22, y_train_pred_22)
print(f"Train set accuracy for Passive Aggressive Classifier (2022): {train_accuracy_22:.2f}")

Train set accuracy for Passive Aggressive Classifier (2022): 0.78


In [65]:
# Passive Aggressive Classifier for 2023 Training with warm start
pa_classifier.set_params(max_iter=pa_classifier.max_iter + 100, warm_start=True)  # Enable warm start and add iterations
pa_classifier.fit(X_train_23, y_train_23)

# Predict on the train set
y_train_pred_23 = pa_classifier.predict(X_train_23)

# Calculate accuracy on the train set
train_accuracy_23 = accuracy_score(y_train_23, y_train_pred_23)
print(f"Train set accuracy for Passive Aggressive Classifier (2023): {train_accuracy_23:.2f}")

Train set accuracy for Passive Aggressive Classifier (2023): 0.75


In [66]:
# Passive Aggressive Classifier for 2024 Training with warm start
pa_classifier.set_params(max_iter=pa_classifier.max_iter + 100, warm_start=True)  # Enable warm start and add iterations
pa_classifier.fit(X_train_24, y_train_24)

# Predict on the train set
y_train_pred_24 = pa_classifier.predict(X_train_24)

# Calculate accuracy on the train set
train_accuracy_24 = accuracy_score(y_train_24, y_train_pred_24)
print(f"Train set accuracy for Passive Aggressive Classifier (2024): {train_accuracy_24:.2f}")

Train set accuracy for Passive Aggressive Classifier (2024): 0.54


Assess The Models

2021

In [69]:
# Predict on 2021 test data
y_test_pred_rf_21 = rf_model.predict(test_data_21[features])
y_test_pred_log_reg_21 = log_reg_model.predict(test_data_21[features])
y_test_pred_hgb_21 = model.predict(test_data_21[features])
y_test_pred_pa_21 = pa_classifier.predict(test_data_21[features])

# Calculate accuracy on 2021 test data
test_accuracy = brier_score_loss(test_data_21['Outcome'], y_test_pred_rf_21) #Assess RF
f1_score_rf = f1_score(test_data_21['Outcome'], y_test_pred_rf_21)
print(f"RF model brier score loss on 2021 test_data: {test_accuracy:.2f}")
print(f"F1 Score for RF model: {f1_score_rf:.2f}")
test_accuracy = brier_score_loss(test_data_21['Outcome'], y_test_pred_log_reg_21) #Assess Logistic Regression
f1_score_log_reg = f1_score(test_data_21['Outcome'], y_test_pred_log_reg_21)
print(f"Logistic Regression model brier score loss on 2021 test_data: {test_accuracy:.2f}")
print(f"F1 Score for Logistic Regression model: {f1_score_log_reg:.2f}")
test_accuracy = brier_score_loss(test_data_21['Outcome'], y_test_pred_hgb_21) #Assess HGB
f1_score_hgb = f1_score(test_data_21['Outcome'], y_test_pred_hgb_21)
print(f"HGB model brier score loss on 2021 test_data: {test_accuracy:.2f}")
print(f"F1 Score for HGB model: {f1_score_hgb:.2f}")
test_accuracy = brier_score_loss(test_data_21['Outcome'], y_test_pred_pa_21) #Assess Passive Aggressive
f1_score_pa = f1_score(test_data_21['Outcome'], y_test_pred_pa_21)
print(f"Passive Aggressive model brier score loss on 2021 test_data: {test_accuracy:.2f}")
print(f"F1 Score for Passive Aggressive model: {f1_score_pa:.2f}")


RF model brier score loss on 2021 test_data: 0.11
F1 Score for RF model: 0.93
Logistic Regression model brier score loss on 2021 test_data: 0.25
F1 Score for Logistic Regression model: 0.85
HGB model brier score loss on 2021 test_data: 0.15
F1 Score for HGB model: 0.90
Passive Aggressive model brier score loss on 2021 test_data: 0.41
F1 Score for Passive Aggressive model: 0.67


2022

In [70]:
# Predict on 2022 test data
y_test_pred_rf_22 = rf_model.predict(test_data_22[features])
y_test_pred_log_reg_22 = log_reg_model.predict(test_data_22[features])
y_test_pred_hgb_22 = model.predict(test_data_22[features])
y_test_pred_pa_22 = pa_classifier.predict(test_data_22[features])

# Calculate accuracy on 2022 test data
test_accuracy = brier_score_loss(test_data_22['Outcome'], y_test_pred_rf_22)  # Assess RF
f1_score_rf = f1_score(test_data_22['Outcome'], y_test_pred_rf_22)
print(f"RF model brier score loss on 2022 test_data: {test_accuracy:.2f}")
print(f"F1 Score for RF model: {f1_score_rf:.2f}")

test_accuracy = brier_score_loss(test_data_22['Outcome'], y_test_pred_log_reg_22)  # Assess Logistic Regression
f1_score_log_reg = f1_score(test_data_22['Outcome'], y_test_pred_log_reg_22)
print(f"Logistic Regression model brier score loss on 2022 test_data: {test_accuracy:.2f}")
print(f"F1 Score for Logistic Regression model: {f1_score_log_reg:.2f}")

test_accuracy = brier_score_loss(test_data_22['Outcome'], y_test_pred_hgb_22)  # Assess HGB
f1_score_hgb = f1_score(test_data_22['Outcome'], y_test_pred_hgb_22)
print(f"HGB model brier score loss on 2022 test_data: {test_accuracy:.2f}")
print(f"F1 Score for HGB model: {f1_score_hgb:.2f}")

test_accuracy = brier_score_loss(test_data_22['Outcome'], y_test_pred_pa_22)  # Assess Passive Aggressive
f1_score_pa = f1_score(test_data_22['Outcome'], y_test_pred_pa_22)
print(f"Passive Aggressive model brier score loss on 2022 test_data: {test_accuracy:.2f}")
print(f"F1 Score for Passive Aggressive model: {f1_score_pa:.2f}")

RF model brier score loss on 2022 test_data: 0.20
F1 Score for RF model: 0.88
Logistic Regression model brier score loss on 2022 test_data: 0.23
F1 Score for Logistic Regression model: 0.87
HGB model brier score loss on 2022 test_data: 0.22
F1 Score for HGB model: 0.86
Passive Aggressive model brier score loss on 2022 test_data: 0.49
F1 Score for Passive Aggressive model: 0.57


2023

In [71]:
# Predict on 2023 test data
y_test_pred_rf_23 = rf_model.predict(test_data_23[features])
y_test_pred_log_reg_23 = log_reg_model.predict(test_data_23[features])
y_test_pred_hgb_23 = model.predict(test_data_23[features])
y_test_pred_pa_23 = pa_classifier.predict(test_data_23[features])

# Calculate accuracy on 2023 test data
test_accuracy = brier_score_loss(test_data_23['Outcome'], y_test_pred_rf_23)  # Assess RF
f1_score_rf = f1_score(test_data_23['Outcome'], y_test_pred_rf_23)
print(f"RF model brier score loss on 2023 test_data: {test_accuracy:.2f}")
print(f"F1 Score for RF model: {f1_score_rf:.2f}")

test_accuracy = brier_score_loss(test_data_23['Outcome'], y_test_pred_log_reg_23)  # Assess Logistic Regression
f1_score_log_reg = f1_score(test_data_23['Outcome'], y_test_pred_log_reg_23)
print(f"Logistic Regression model brier score loss on 2023 test_data: {test_accuracy:.2f}")
print(f"F1 Score for Logistic Regression model: {f1_score_log_reg:.2f}")

test_accuracy = brier_score_loss(test_data_23['Outcome'], y_test_pred_hgb_23)  # Assess HGB
f1_score_hgb = f1_score(test_data_23['Outcome'], y_test_pred_hgb_23)
print(f"HGB model brier score loss on 2023 test_data: {test_accuracy:.2f}")
print(f"F1 Score for HGB model: {f1_score_hgb:.2f}")

test_accuracy = brier_score_loss(test_data_23['Outcome'], y_test_pred_pa_23)  # Assess Passive Aggressive
f1_score_pa = f1_score(test_data_23['Outcome'], y_test_pred_pa_23)
print(f"Passive Aggressive model brier score loss on 2023 test_data: {test_accuracy:.2f}")
print(f"F1 Score for Passive Aggressive model: {f1_score_pa:.2f}")

RF model brier score loss on 2023 test_data: 0.20
F1 Score for RF model: 0.88
Logistic Regression model brier score loss on 2023 test_data: 0.22
F1 Score for Logistic Regression model: 0.87
HGB model brier score loss on 2023 test_data: 0.19
F1 Score for HGB model: 0.88
Passive Aggressive model brier score loss on 2023 test_data: 0.50
F1 Score for Passive Aggressive model: 0.57


In [72]:
# Predict on 2024 test data
y_test_pred_rf_24 = rf_model.predict(test_data_24[features])
y_test_pred_log_reg_24 = log_reg_model.predict(test_data_24[features])
y_test_pred_hgb_24 = model.predict(test_data_24[features])
y_test_pred_pa_24 = pa_classifier.predict(test_data_24[features])

# Calculate accuracy on 2024 test data
test_accuracy = brier_score_loss(test_data_24['Outcome'], y_test_pred_rf_24)  # Assess RF
f1_score_rf = f1_score(test_data_24['Outcome'], y_test_pred_rf_24)
print(f"RF model brier score loss on 2024 test_data: {test_accuracy:.2f}")
print(f"F1 Score for RF model: {f1_score_rf:.2f}")

test_accuracy = brier_score_loss(test_data_24['Outcome'], y_test_pred_log_reg_24)  # Assess Logistic Regression
f1_score_log_reg = f1_score(test_data_24['Outcome'], y_test_pred_log_reg_24)
print(f"Logistic Regression model brier score loss on 2024 test_data: {test_accuracy:.2f}")
print(f"F1 Score for Logistic Regression model: {f1_score_log_reg:.2f}")

test_accuracy = brier_score_loss(test_data_24['Outcome'], y_test_pred_hgb_24)  # Assess HGB
f1_score_hgb = f1_score(test_data_24['Outcome'], y_test_pred_hgb_24)
print(f"HGB model brier score loss on 2024 test_data: {test_accuracy:.2f}")
print(f"F1 Score for HGB model: {f1_score_hgb:.2f}")

test_accuracy = brier_score_loss(test_data_24['Outcome'], y_test_pred_pa_24)  # Assess Passive Aggressive
f1_score_pa = f1_score(test_data_24['Outcome'], y_test_pred_pa_24)
print(f"Passive Aggressive model brier score loss on 2024 test_data: {test_accuracy:.2f}")
print(f"F1 Score for Passive Aggressive model: {f1_score_pa:.2f}")

RF model brier score loss on 2024 test_data: 0.19
F1 Score for RF model: 0.89
Logistic Regression model brier score loss on 2024 test_data: 0.23
F1 Score for Logistic Regression model: 0.87
HGB model brier score loss on 2024 test_data: 0.22
F1 Score for HGB model: 0.86
Passive Aggressive model brier score loss on 2024 test_data: 0.45
F1 Score for Passive Aggressive model: 0.62


# Prepare Data for Stage 1 Predictions

In [73]:
# Merge ModelInput tables with SampleSubmissionStage1 based on Team1 and Team2
merged_data_2021 = SampleSubmissionStage1[SampleSubmissionStage1['Season'] == 2021]
merged_data_2021 = merged_data_2021.merge(ModelInput21.add_suffix('_Team1'), left_on='Team1', right_on='TeamID_Team1', how='left')

In [74]:
merged_data_2021 = merged_data_2021.merge(ModelInput21.add_suffix('_Team2'), left_on='Team2', right_on='TeamID_Team2', how='left')

In [75]:
merged_data_2022 = SampleSubmissionStage1[SampleSubmissionStage1['Season'] == 2022]
merged_data_2022 = merged_data_2022.merge(ModelInput22.add_suffix('_Team1'), left_on='Team1', right_on='TeamID_Team1', how='left')

In [76]:
merged_data_2022 = merged_data_2022.merge(ModelInput22.add_suffix('_Team2'), left_on='Team2', right_on='TeamID_Team2', how='left')

In [78]:
merged_data_2023 = SampleSubmissionStage1[SampleSubmissionStage1['Season'] == 2023]
merged_data_2023 = merged_data_2023.merge(ModelInput23.add_suffix('_Team1'), left_on='Team1', right_on='TeamID_Team1', how='left')


In [79]:
merged_data_2023 = merged_data_2023.merge(ModelInput23.add_suffix('_Team2'), left_on='Team2', right_on='TeamID_Team2', how='left')


In [80]:
merged_data_2024 = SampleSubmissionStage1[SampleSubmissionStage1['Season'] == 2024]
merged_data_2024 = merged_data_2024.merge(ModelInput24.add_suffix('_Team1'), left_on='Team1', right_on='TeamID_Team1', how='left')


In [81]:
merged_data_2024 = merged_data_2024.merge(ModelInput24.add_suffix('_Team2'), left_on='Team2', right_on='TeamID_Team2', how='left')

In [82]:
# Merge ModelInput tables with SampleSubmissionStage1 based on Team1 and Team2
merged_data_2021 = SampleSubmissionStage1[SampleSubmissionStage1['Season'] == 2021]
merged_data_2021 = merged_data_2021.merge(ModelInput21.add_suffix('_Team1'), left_on='Team1', right_on='TeamID_Team1', how='left')
merged_data_2021 = merged_data_2021.merge(ModelInput21.add_suffix('_Team2'), left_on='Team2', right_on='TeamID_Team2', how='left')

merged_data_2022 = SampleSubmissionStage1[SampleSubmissionStage1['Season'] == 2022]
merged_data_2022 = merged_data_2022.merge(ModelInput22.add_suffix('_Team1'), left_on='Team1', right_on='TeamID_Team1', how='left')
merged_data_2022 = merged_data_2022.merge(ModelInput22.add_suffix('_Team2'), left_on='Team2', right_on='TeamID_Team2', how='left')

merged_data_2023 = SampleSubmissionStage1[SampleSubmissionStage1['Season'] == 2023]
merged_data_2023 = merged_data_2023.merge(ModelInput23.add_suffix('_Team1'), left_on='Team1', right_on='TeamID_Team1', how='left')
merged_data_2023 = merged_data_2023.merge(ModelInput23.add_suffix('_Team2'), left_on='Team2', right_on='TeamID_Team2', how='left')

merged_data_2024 = SampleSubmissionStage1[SampleSubmissionStage1['Season'] == 2024]
merged_data_2024 = merged_data_2024.merge(ModelInput24.add_suffix('_Team1'), left_on='Team1', right_on='TeamID_Team1', how='left')
merged_data_2024 = merged_data_2024.merge(ModelInput24.add_suffix('_Team2'), left_on='Team2', right_on='TeamID_Team2', how='left')

In [83]:
merged_data_all = pd.concat([merged_data_2021, merged_data_2022, merged_data_2023, merged_data_2024], ignore_index=True)

# Stage 1 Predictions

In [84]:
# Ensure the features used for training are present in merged_data_all
features = ['Wins_Team1', 'avg_score_diff_Team1', 'FG_PCT_Team1', 'FG3_PCT_Team1', 'FT_PCT_Team1', 
            'REB_Team1', 'AST_Team1', 'TO_Team1', 'STL_Team1', 'BLK_Team1', 'PF_Team1', 
            'Wins_Team2', 'avg_score_diff_Team2', 'FG_PCT_Team2', 'FG3_PCT_Team2', 'FT_PCT_Team2', 
            'REB_Team2', 'AST_Team2', 'TO_Team2', 'STL_Team2', 'BLK_Team2', 'PF_Team2']

# Make predictions
merged_data_all['Pred'] = rf_model.predict_proba(merged_data_all[features])[:, 1]

# Display the first few rows of the predictions
merged_data_all[['ID', 'Pred']].head()

Unnamed: 0,ID,Pred
0,2021_3101_3102,0.656667
1,2021_3101_3103,0.703333
2,2021_3101_3104,0.52
3,2021_3101_3105,0.536667
4,2021_3101_3106,0.48


In [85]:
merged_data_all[['ID', 'Pred']].to_csv('womens_submission.csv', index=False)

In [None]:
merged_data_all.to_csv("WPred_Data.csv", index=False)

# Predict on 2025

In [86]:
ModelInput25 = pd.DataFrame(WTeams['TeamID'])

In [87]:
ModelInput25['Wins'] = ModelInput25['TeamID'].apply(lambda x: record_record(x, WRegularSeasonCompactResults[WRegularSeasonCompactResults['Season'] == 2025]))
ModelInput25['avg_score_diff'] = ModelInput25['TeamID'].apply(lambda x: avg_score_difference(x, WRegularSeasonCompactResults[WRegularSeasonCompactResults['Season'] == 2025]))

In [88]:
team_stats = ModelInput25['TeamID'].apply(lambda x: team_stats_differences(x, WRegularSeasonDetailedResults[WRegularSeasonDetailedResults['Season'] == 2025]))
team_stats_df = pd.DataFrame(team_stats.tolist(), columns=['FG_PCT', 'FG3_PCT', 'FT_PCT', 'REB', 'AST', 'TO', 'STL', 'BLK', 'PF'])
ModelInput25 = pd.concat([ModelInput25, team_stats_df], axis=1)

In [90]:
SampleSubmissionStage2 = pd.read_csv('SampleSubmissionStage2.csv')

In [91]:
# Prepare data for Stage 2
SampleSubmissionStage2['Season'] = SampleSubmissionStage2['ID'].apply(lambda x: int(x.split('_')[0]))
SampleSubmissionStage2['Team1'] = SampleSubmissionStage2['ID'].apply(lambda x: int(x.split('_')[1]))
SampleSubmissionStage2['Team2'] = SampleSubmissionStage2['ID'].apply(lambda x: int(x.split('_')[2]))
SampleSubmissionStage2 = SampleSubmissionStage2[SampleSubmissionStage2['Team1'].astype(str).str.startswith('3') & SampleSubmissionStage2['Team2'].astype(str).str.startswith('3')]


In [92]:
# Merge with ModelInput25 for Team1 and Team2
SampleSubmissionStage2 = SampleSubmissionStage2.merge(ModelInput25.add_suffix('_Team1'), left_on='Team1', right_on='TeamID_Team1', how='left')
SampleSubmissionStage2 = SampleSubmissionStage2.merge(ModelInput25.add_suffix('_Team2'), left_on='Team2', right_on='TeamID_Team2', how='left')

In [94]:
SampleSubmissionStage2.tail()

Unnamed: 0,ID,Pred,Season,Team1,Team2,TeamID_Team1,Wins_Team1,avg_score_diff_Team1,FG_PCT_Team1,FG3_PCT_Team1,...,avg_score_diff_Team2,FG_PCT_Team2,FG3_PCT_Team2,FT_PCT_Team2,REB_Team2,AST_Team2,TO_Team2,STL_Team2,BLK_Team2,PF_Team2
65336,2025_3477_3479,0.5,2025,3477,3479,3477,5,-11.296296,-0.04314,0.031054,...,-7.708333,-0.03862,0.041574,0.023294,-5.083333,-1.916667,2.458333,-3.208333,-1.166667,-0.958333
65337,2025_3477_3480,0.5,2025,3477,3480,3477,5,-11.296296,-0.04314,0.031054,...,-5.285714,-0.029374,-0.015804,-0.040822,1.214286,-0.321429,2.392857,-0.357143,-0.714286,-0.928571
65338,2025_3478_3479,0.5,2025,3478,3479,3478,7,-15.096774,-0.080031,-0.019995,...,-7.708333,-0.03862,0.041574,0.023294,-5.083333,-1.916667,2.458333,-3.208333,-1.166667,-0.958333
65339,2025_3478_3480,0.5,2025,3478,3480,3478,7,-15.096774,-0.080031,-0.019995,...,-5.285714,-0.029374,-0.015804,-0.040822,1.214286,-0.321429,2.392857,-0.357143,-0.714286,-0.928571
65340,2025_3479_3480,0.5,2025,3479,3480,3479,6,-7.708333,-0.03862,0.041574,...,-5.285714,-0.029374,-0.015804,-0.040822,1.214286,-0.321429,2.392857,-0.357143,-0.714286,-0.928571


In [None]:
# Make predictions
SampleSubmissionStage2['Pred'] = rf_model.predict_proba(SampleSubmissionStage2[features])[:, 1]

# Save predictions to a CSV file
SampleSubmissionStage2[['ID', 'Pred']].to_csv('womens_submission_stage2.csv', index=False)

# Display the first few rows of the predictions
SampleSubmissionStage2[['ID', 'Pred']].head()

In [96]:
# Add predictions from SampleSubmissionStage2 to the main dataset
SampleResults = SampleSubmissionStage2.merge(WTeams[['TeamID', 'TeamName']], left_on='Team1', right_on='TeamID', how='left').rename(columns={'TeamName': 'Team1_Name'})
SampleResults = SampleResults.merge(WTeams[['TeamID', 'TeamName']], left_on='Team2', right_on='TeamID', how='left').rename(columns={'TeamName': 'Team2_Name'})

# Select relevant columns and display the predictions
SampleResults[['ID', 'Team1_Name', 'Team2_Name', 'Pred']].head()

Unnamed: 0,ID,Team1_Name,Team2_Name,Pred
0,2025_3101_3102,Abilene Chr,Air Force,0.64
1,2025_3101_3103,Abilene Chr,Akron,0.91
2,2025_3101_3104,Abilene Chr,Alabama,0.473333
3,2025_3101_3105,Abilene Chr,Alabama A&M,0.516667
4,2025_3101_3106,Abilene Chr,Alabama St,0.933333


In [99]:
SampleSubmissionStage2.head()

Unnamed: 0,ID,Pred,Season,Team1,Team2,TeamID_Team1,Wins_Team1,avg_score_diff_Team1,FG_PCT_Team1,FG3_PCT_Team1,...,avg_score_diff_Team2,FG_PCT_Team2,FG3_PCT_Team2,FT_PCT_Team2,REB_Team2,AST_Team2,TO_Team2,STL_Team2,BLK_Team2,PF_Team2
0,2025_3101_3102,0.64,2025,3101,3102,3101,17,4.310345,-0.004245,-0.022957,...,1.533333,-0.043636,-0.004991,0.021159,-4.733333,0.233333,-5.3,3.2,-1.333333,1.1
1,2025_3101_3103,0.91,2025,3101,3103,3101,17,4.310345,-0.004245,-0.022957,...,-7.689655,-0.045665,-0.066817,-0.040376,3.448276,-2.275862,3.448276,-0.965517,-0.413793,0.482759
2,2025_3101_3104,0.473333,2025,3101,3104,3101,17,4.310345,-0.004245,-0.022957,...,15.645161,0.089239,0.119739,-0.028259,3.258065,3.387097,-2.322581,0.870968,1.903226,-1.225806
3,2025_3101_3105,0.516667,2025,3101,3105,3101,17,4.310345,-0.004245,-0.022957,...,3.2,-0.011725,0.00215,0.020367,-3.3,-2.533333,-3.666667,2.933333,-0.566667,-4.2
4,2025_3101_3106,0.933333,2025,3101,3106,3101,17,4.310345,-0.004245,-0.022957,...,-23.642857,-0.121476,-0.086632,0.025751,-5.357143,-7.285714,4.571429,-4.071429,-0.928571,2.142857
