Import libraries and Data

In [131]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [132]:
# Reference tables: team lookup, season calendar and tournament seeds.
MTeams, MSeasons, MNCAATourneySeeds = map(
    pd.read_csv, ('MTeams.csv', 'MSeasons.csv', 'MNCAATourneySeeds.csv')
)

# Game-level results for the regular season and the NCAA tournament.
MRegularSeasonCompactResults, MNCAATourneyCompactResults = map(
    pd.read_csv, ('MRegularSeasonCompactResults.csv', 'MNCAATourneyCompactResults.csv')
)

# Submission templates for both competition stages.
SampleSubmissionStage1, SampleSubmissionStage2 = map(
    pd.read_csv, ('SampleSubmissionStage1.csv', 'SampleSubmissionStage2.csv')
)

# Ranking-system ordinals.
MMasseyOrdinals = pd.read_csv('MMasseyOrdinals.csv')

Data Exploration

In [133]:
# First five rows of the team lookup table.
MTeams.head(n=5)

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2025
1,1102,Air Force,1985,2025
2,1103,Akron,1985,2025
3,1104,Alabama,1985,2025
4,1105,Alabama A&M,2000,2025


In [134]:
# First five rows of the season calendar table.
MSeasons.head(n=5)

Unnamed: 0,Season,DayZero,RegionW,RegionX,RegionY,RegionZ
0,1985,10/29/1984,East,West,Midwest,Southeast
1,1986,10/28/1985,East,Midwest,Southeast,West
2,1987,10/27/1986,East,Southeast,Midwest,West
3,1988,11/02/1987,East,Midwest,Southeast,West
4,1989,10/31/1988,East,West,Midwest,Southeast


In [135]:
# First five rows of the tournament seed table.
MNCAATourneySeeds.head(n=5)

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


In [136]:
# Last five regular-season games in the file.
MRegularSeasonCompactResults.tail(n=5)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
192492,2025,120,1433,71,1182,62,A,0
192493,2025,120,1436,79,1107,71,H,0
192494,2025,120,1438,60,1199,57,H,0
192495,2025,120,1452,71,1428,69,A,0
192496,2025,120,1460,98,1237,85,H,0


In [137]:
# Last five tournament games in the file.
MNCAATourneyCompactResults.tail(n=5)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
2513,2024,146,1301,76,1181,64,N,0
2514,2024,146,1345,72,1397,66,N,0
2515,2024,152,1163,86,1104,72,N,0
2516,2024,152,1345,63,1301,50,N,0
2517,2024,154,1163,75,1345,60,N,0


In [138]:
# First five rows of the stage-1 submission template.
SampleSubmissionStage1.head(n=5)

Unnamed: 0,ID,Pred
0,2021_1101_1102,0.5
1,2021_1101_1103,0.5
2,2021_1101_1104,0.5
3,2021_1101_1105,0.5
4,2021_1101_1106,0.5


In [139]:
# First five rows of the stage-2 submission template.
SampleSubmissionStage2.head(n=5)

Unnamed: 0,ID,Pred
0,2025_1101_1102,0.5
1,2025_1101_1103,0.5
2,2025_1101_1104,0.5
3,2025_1101_1105,0.5
4,2025_1101_1106,0.5


In [140]:
# IDs look like "<Season>_<Team1>_<Team2>". Split the column once with the
# vectorised string accessor instead of re-splitting every ID in three
# separate row-wise lambdas.
id_parts = SampleSubmissionStage1['ID'].str.split('_', expand=True)
SampleSubmissionStage1['Season'] = id_parts[0].astype('int64')
SampleSubmissionStage1['Team1'] = id_parts[1].astype('int64')
SampleSubmissionStage1['Team2'] = id_parts[2].astype('int64')
SampleSubmissionStage1.head()

Unnamed: 0,ID,Pred,Season,Team1,Team2
0,2021_1101_1102,0.5,2021,1101,1102
1,2021_1101_1103,0.5,2021,1101,1103
2,2021_1101_1104,0.5,2021,1101,1104
3,2021_1101_1105,0.5,2021,1101,1105
4,2021_1101_1106,0.5,2021,1101,1106


# Develop Model based on 2021 season

Explore 2021 Data

In [None]:
# Regular-season games played in 2021.
MRegularSeason2021 = MRegularSeasonCompactResults[MRegularSeasonCompactResults['Season'] == 2021]

# Teams whose first D1 season is 2021 or earlier. `.copy()` makes this an
# independent frame: later cells add feature columns to it, and assigning onto
# a boolean-mask slice of MTeams raises SettingWithCopyWarning.
# NOTE(review): teams whose LastD1Season is before 2021 are still included;
# confirm whether they should be filtered out as well.
Model_Input_21 = MTeams[MTeams['FirstD1Season'] <= 2021].copy()

# Shown last so the preview is actually rendered as the cell output.
MRegularSeason2021.head()

In [None]:
# First five rows of the ranking ordinals table.
MMasseyOrdinals.head(n=5)

In [None]:
# Distinct ranking systems that have ordinals for the 2021 season.
in_2021 = MMasseyOrdinals['Season'] == 2021
MMasseyOrdinals.loc[in_2021, 'SystemName'].unique()

In [None]:
# Number of 2021 ordinal rows published on each ranking day.
season_mask = MMasseyOrdinals['Season'] == 2021
MMasseyOrdinals.loc[season_mask, 'RankingDayNum'].value_counts()

Create Functions for Key Indicators

In [None]:
from tqdm import tqdm

def record_record(Team1, results=None):
    """Return the ``(wins, losses)`` record of one team.

    Parameters
    ----------
    Team1 : int
        TeamID to look up.
    results : pandas.DataFrame, optional
        Game table with ``WTeamID`` and ``LTeamID`` columns. Defaults to the
        notebook-level ``MRegularSeason2021`` frame, which keeps existing
        one-argument calls working.

    Returns
    -------
    tuple of (int, int)
        Number of rows where the team is the winner, and number of rows
        where it is the loser.
    """
    if results is None:
        results = MRegularSeason2021
    # Boolean-mask sums replace the per-row Python loop (iterrows), which was
    # re-scanning the whole season once for every team.
    wins = int((results['WTeamID'] == Team1).sum())
    losses = int((results['LTeamID'] == Team1).sum())
    return wins, losses

def avg_score_difference(Team1, results=None):
    """Return the mean point margin of one team over all its games.

    Wins contribute ``WScore - LScore`` and losses contribute
    ``LScore - WScore`` (a negative number).

    Parameters
    ----------
    Team1 : int
        TeamID to look up.
    results : pandas.DataFrame, optional
        Game table with ``WTeamID``, ``LTeamID``, ``WScore`` and ``LScore``
        columns. Defaults to the notebook-level ``MRegularSeason2021`` frame.

    Returns
    -------
    float or int
        Average margin per game, or ``0`` when the team has no games.
    """
    if results is None:
        results = MRegularSeason2021
    winner_margin = results['WScore'] - results['LScore']
    won = results['WTeamID'] == Team1
    lost = results['LTeamID'] == Team1
    games = int(won.sum()) + int(lost.sum())
    if games == 0:
        return 0
    # For a loss the team's own margin is the negated winner margin.
    total_margin = winner_margin[won].sum() - winner_margin[lost].sum()
    return float(total_margin) / games


from tqdm import tqdm
def weighted_avg_ranking(Team1, ranking_system, season, ordinals=None):
    """Return a linearly weighted average of a team's ordinal ranks.

    The team's snapshots for ``ranking_system`` in ``season`` are ordered by
    ``RankingDayNum``; with ``n`` distinct days the earliest snapshot gets
    weight ``n`` and the latest gets weight ``1``.

    Parameters
    ----------
    Team1 : int
        TeamID to look up.
    ranking_system : str
        Value to match in the ``SystemName`` column.
    season : int
        Value to match in the ``Season`` column.
    ordinals : pandas.DataFrame, optional
        Table with ``Season``, ``TeamID``, ``SystemName``, ``RankingDayNum``
        and ``OrdinalRank`` columns. Defaults to the notebook-level
        ``MMasseyOrdinals`` frame.

    Returns
    -------
    float or int
        Weighted mean rank, or ``0`` when no matching rows exist.
    """
    if ordinals is None:
        ordinals = MMasseyOrdinals

    team_data = ordinals[(ordinals['Season'] == season) &
                         (ordinals['TeamID'] == Team1) &
                         (ordinals['SystemName'] == ranking_system)]

    if team_data.empty:
        # NOTE(review): 0 sorts as "better than rank 1", so a team the system
        # never ranked looks like the best team to a model. Kept for backward
        # compatibility; consider a large sentinel or NaN plus imputation.
        return 0

    # Order snapshots explicitly instead of relying on the file's row order,
    # and keep one row per ranking day so ranks and weights stay aligned.
    snapshots = (team_data
                 .sort_values('RankingDayNum', kind='stable')
                 .drop_duplicates(subset='RankingDayNum', keep='first'))

    ranks = snapshots['OrdinalRank'].to_numpy(dtype=float)
    # NOTE(review): weights decrease over time, so the oldest snapshot counts
    # most. Confirm this is intended; weighting recent ranks more heavily
    # would need np.arange(1, len(ranks) + 1).
    weights = np.arange(len(ranks), 0, -1, dtype=float)

    return float(np.dot(ranks, weights) / weights.sum())

Apply Functions

In [None]:

# Regular-season win count per team: first element of the (wins, losses) pair.
Model_Input_21['Wins'] = Model_Input_21['TeamID'].map(lambda team_id: record_record(team_id)[0])

In [None]:
# Mean point margin per team; the function is passed directly, no wrapper lambda needed.
Model_Input_21['avg_score_difference'] = Model_Input_21['TeamID'].apply(avg_score_difference)

In [None]:
# Only the POM ranking system is used as a feature for now.
Model_Input_21['POM_avg'] = [
    weighted_avg_ranking(team_id, 'POM', 2021)
    for team_id in Model_Input_21['TeamID']
]

Merge the new 2021 team features into the submission table

In [None]:
# Start from the 2021 rows of the submission template.
SampleSubmissionStage1_2021 = SampleSubmissionStage1[SampleSubmissionStage1['Season'] == 2021]

# Per-team features to attach to each side of the matchup.
team_features = Model_Input_21[['TeamID', 'Wins', 'avg_score_difference', 'POM_avg']]

# The Team1 and Team2 merges are identical apart from the join key and the
# column prefix, so run one loop instead of two copy-pasted blocks.
for side in ('Team1', 'Team2'):
    SampleSubmissionStage1_2021 = SampleSubmissionStage1_2021.merge(
        team_features,
        left_on=side, right_on='TeamID', how='left'
    ).rename(columns={
        'Wins': f'{side}_wins',
        'avg_score_difference': f'{side}_avg_score_difference',
        'POM_avg': f'{side}_POM_avg'
    }).drop(columns=['TeamID'])

SampleSubmissionStage1_2021.head()

In [None]:
# Two teams can meet more than once with the same winner. Merging the raw
# game table would then duplicate submission rows (and later duplicate
# predictions and evaluation rows), so keep only the latest game for each
# ordered (Season, winner, loser) pair before joining.
latest_games = (
    MRegularSeasonCompactResults[['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore']]
    .sort_values('DayNum')
    .drop_duplicates(subset=['Season', 'WTeamID', 'LTeamID'], keep='last')
)

rows_before_merge = len(SampleSubmissionStage1_2021)

# Game1_* = a game Team1 won against Team2; Game2_* = a game Team2 won against Team1.
for prefix, winner_col, loser_col in (('Game1', 'Team1', 'Team2'),
                                      ('Game2', 'Team2', 'Team1')):
    SampleSubmissionStage1_2021 = SampleSubmissionStage1_2021.merge(
        latest_games,
        left_on=['Season', winner_col, loser_col],
        right_on=['Season', 'WTeamID', 'LTeamID'],
        how='left',
        validate='many_to_one',  # fail loudly if the right side is not unique per key
    ).rename(columns={
        'DayNum': f'{prefix}_DayNum',
        'WScore': f'{prefix}_WScore',
        'LScore': f'{prefix}_LScore',
        'WTeamID': f'{prefix}_WTeamID',
    }).drop(columns=['LTeamID'])

# A left join against unique keys must leave exactly one row per submission ID.
assert len(SampleSubmissionStage1_2021) == rows_before_merge

SampleSubmissionStage1_2021.head()

Extract matchups with regular season results to train with

In [None]:
# Keep every matchup that was played in the regular season, whichever side
# won. Filtering on Game1_DayNum alone kept only pairs where Team1 won at
# least once, which drops all Team2-only wins and biases the labels towards 1.
was_played = (
    SampleSubmissionStage1_2021['Game1_DayNum'].notnull()
    | SampleSubmissionStage1_2021['Game2_DayNum'].notnull()
)
# .copy() so that adding the label column later does not write onto a slice.
train_21 = SampleSubmissionStage1_2021[was_played].copy()
train_21.head()

In [None]:
def classify(row):
    """Return 1 if Team1 is the recorded winner of the matchup row, else 0.

    The winner is read from ``Game2_WTeamID`` when that field is present and
    from ``Game1_WTeamID`` otherwise.
    """
    game2_winner = row['Game2_WTeamID']
    winner = row['Game1_WTeamID'] if pd.isna(game2_winner) else game2_winner
    return int(winner == row['Team1'])

# assign() returns a new frame, so the label column is never written onto a
# filtered slice of the submission table (avoids SettingWithCopyWarning).
train_21 = train_21.assign(Classification=train_21.apply(classify, axis=1))
train_21.head()

Train the Model

In [None]:
import patsy as pt

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.metrics import brier_score_loss

# Model inputs: season summaries for both sides of the matchup; the label is
# whether Team1 won.
features = ['Team1_wins', 'Team1_avg_score_difference', 'Team1_POM_avg',
            'Team2_wins', 'Team2_avg_score_difference', 'Team2_POM_avg']
target = 'Classification'

# Hold out 20% of the played matchups for evaluation.
X_all = train_21[features]
y_all = train_21[target]
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42
)

# Fit a random forest (fit() returns the estimator itself).
clf = RandomForestClassifier(random_state=40).fit(X_train, y_train)

# Probability of the second class column for each held-out matchup.
y_pred_proba = clf.predict_proba(X_test)[:, 1]

# Brier score of those probabilities against the held-out labels.
brier_score = brier_score_loss(y_test, y_pred_proba)
print(f'Brier Score: {brier_score}')

In [None]:
# Probabilities and hard class predictions on the held-out split.
y_pred_proba = clf.predict_proba(X_test)[:, 1]
y_pred = clf.predict(X_test)

# Share of held-out matchups classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Side-by-side view of label, prediction and probability.
predicted_probabilities = pd.DataFrame(
    {'Actual': y_test, 'Predicted': y_pred, 'Probability': y_pred_proba}
)
predicted_probabilities.head()

In [None]:
import joblib

# Persist the fitted classifier to disk.
model_path = 'random_forest_model.pkl'
joblib.dump(clf, model_path)

Predict the entire sample set

In [None]:
# Reuse the `features` list defined for training, so the prediction columns
# (and their order) cannot drift from what the model was fitted on.
features_2021 = SampleSubmissionStage1_2021[features]

# Predict probabilities for the entire 2021 sample set
SampleSubmissionStage1_2021['Pred'] = clf.predict_proba(features_2021)[:, 1]

# Use a 0.5 cutoff to create a 0/1 classification
SampleSubmissionStage1_2021['Classification'] = (SampleSubmissionStage1_2021['Pred'] > 0.5).astype(int)

# Display the updated dataframe with predictions
SampleSubmissionStage1_2021.head()

Check Predictions against extracted 2021 tourney results

In [None]:
# Build the 2021 tournament table in one chain. The result is a fresh frame,
# so no columns are assigned onto (or dropped in place from) a filtered slice
# of MNCAATourneyCompactResults.
is_2021_tourney = MNCAATourneyCompactResults['Season'] == 2021
tourney_2021_results = (
    MNCAATourneyCompactResults
    .loc[is_2021_tourney, ['WTeamID', 'LTeamID', 'DayNum']]
    .assign(
        # Team1 is the lower TeamID and Team2 the higher, whichever side won.
        Team1=lambda games: games[['WTeamID', 'LTeamID']].min(axis=1),
        Team2=lambda games: games[['WTeamID', 'LTeamID']].max(axis=1),
    )
    .drop(columns=['LTeamID'])
)
tourney_2021_results.head()

In [None]:
# Keep only the predicted matchups that were actually played in the tournament.
merged_results = SampleSubmissionStage1_2021.merge(
    tourney_2021_results, on=['Team1', 'Team2'], how='inner'
)

# 1 when Team1 won the game, 0 otherwise.
merged_results['Result'] = (merged_results['Team1'] == merged_results['WTeamID']).astype(int)

# Columns to report, ordered chronologically by game day.
report_columns = ['Team1', 'Team2', 'WTeamID', 'Pred', 'Classification', 'Result', 'DayNum']
final_results = (
    merged_results[report_columns]
    .sort_values(by='DayNum')
    .reset_index(drop=True)
)

final_results.head(50)

In [None]:
from sklearn.metrics import accuracy_score, brier_score_loss

# Ground truth, predicted probability and hard prediction for each tournament game.
y_true, y_pred_proba, y_pred = (
    final_results[column] for column in ('Result', 'Pred', 'Classification')
)

# Share of tournament games classified correctly.
accuracy = accuracy_score(y_true, y_pred)
print(f'Accuracy: {accuracy}')

# Brier score of the predicted probabilities against the actual results.
brier_score = brier_score_loss(y_true, y_pred_proba)
print(f'Brier Score: {brier_score}')