## Imports

In [16]:
import pandas as pd
import patsy
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
bracket_training = pd.read_csv('bracket_training_with_names.csv')
bracket_training.head()

Unnamed: 0.1,Unnamed: 0,CustomerID,CustomerAreaCode,CustomerPostalCode,CustomerPostalCodeLatitude,CustomerPostalCodeLongitude,CustomerDMACode,CustomerDMADescription,NCAACustomerRecordCreated,BracketEntryId,BracketEntryCreatedDate,RegionWinner_East,RegionWinner_West,RegionWinner_South,RegionWinner_Midwest,SemifinalWinner_East_West,SemifinalWinner_South_Midwest,NationalChampion
0,0,47028,,36093,32.5622,-86.0994,698.0,MONTGOMERY (SELMA),2021-12-25,1723503,2024-03-19 10:27:15 -0400,UConn,Arizona,James Madison,Tennessee,UConn,Tennessee,Tennessee
1,1,3511,616.0,49464,42.8256,-86.0104,563.0,GRAND RAPIDS - KALMZOO - B. CRK,2021-04-02,963479,2024-03-18 10:16:39 -0400,UConn,Baylor,Kentucky,Kansas,UConn,Kentucky,UConn
2,2,58445,703.0,22210,38.8808,-77.1129,511.0,"WASHINGTON, DC (HAGRSTWN)",2021-04-02,810038,2024-03-18 00:21:47 -0400,UConn,Baylor,Houston,Purdue,UConn,Purdue,Purdue
3,3,28833,,78218,29.4969,-98.4032,641.0,SAN ANTONIO,2023-11-16,3384825,2024-03-21 10:28:56 -0400,Iowa St.,Saint Mary's,NC State,Purdue,Iowa St.,NC State,NC State
4,4,37899,,14212,42.8946,-78.8245,514.0,BUFFALO,2022-03-16,2828017,2024-03-20 20:14:52 -0400,Auburn,North Carolina,Marquette,Creighton,North Carolina,Marquette,North Carolina


In [11]:
bracket_test = pd.read_csv('bracket_test_with_names.csv')
bracket_test['SemifinalWinner_East_West'] = ""
bracket_test.head()

Unnamed: 0.1,Unnamed: 0,CustomerID,CustomerAreaCode,CustomerPostalCode,CustomerPostalCodeLatitude,CustomerPostalCodeLongitude,CustomerDMACode,CustomerDMADescription,NCAACustomerRecordCreated,BracketEntryId,BracketEntryCreatedDate,RegionWinner_East,RegionWinner_West,RegionWinner_South,RegionWinner_Midwest,SemifinalWinner_East_West
0,0,73662,919.0,27539,35.7225,-78.8408,560.0,RALEIGH - DURHAM (FAYETVLLE),3/29/20,2074118,3/19/24 18:50,UConn,North Carolina,Houston,Purdue,
1,1,6679,360.0,97206,45.484,-122.5973,820.0,"PORTLAND, OR",4/2/24,2692634,3/20/24 16:56,UConn,North Carolina,Duke,Kansas,
2,2,63024,270.0,42754,37.4603,-86.3249,529.0,LOUISVILLE,12/8/21,1252684,3/18/24 15:13,Iowa St.,Arizona,Kentucky,Creighton,
3,3,60371,206.0,98178,47.4924,-122.2359,819.0,SEATTLE - TACOMA,3/22/23,1950205,3/19/24 15:21,UConn,North Carolina,Houston,Purdue,
4,4,18415,717.0,19038,40.1096,-75.155,504.0,PHILADELPHIA,2/20/24,2756293,3/20/24 18:40,UConn,North Carolina,Marquette,Creighton,


In [18]:
bracket_training.shape

(130002, 18)

In [19]:
bracket_test.shape

(14445, 16)

## Formulating SMF

In [4]:
semifinal1_formula = "C(SemifinalWinner_East_West) ~ C(CustomerDMACode) + C(RegionWinner_East) + C(RegionWinner_West) + C(RegionWinner_South) + C(RegionWinner_Midwest)"
semifinal2_formula = "C(SemifinalWinner_South_Midwest) ~ C(CustomerDMACode) + C(RegionWinner_East) + C(RegionWinner_West) + C(RegionWinner_South) + C(RegionWinner_Midwest)"

## GBDT

In [14]:
# Prepare Full Dataset
y, X = patsy.dmatrices(semifinal1_formula, data=bracket_training, return_type='dataframe')
y = y.iloc[:, 0]  # Convert target to 1D

# Split into Training (70%) and Testing (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train CatBoost Model
model = CatBoostClassifier(iterations=500, depth=6, learning_rate=0.1, loss_function='Logloss', verbose=0)
model.fit(X_train, y_train)

# Predict on Test Data
y_pred = model.predict(X_test)

# Store Predictions in DataFrame
test_results = X_test.copy()


In [17]:
# Compute evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Use 'weighted' for multi-class
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

Accuracy: 0.9826
Precision: 0.9790
Recall: 0.9826
F1-Score: 0.9797


In [25]:
import pandas as pd
import matplotlib.pyplot as plt

# Extract feature importance from the trained model
feature_importance = model.feature_importances_

# Create a DataFrame with feature names and importance scores
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importance
})

# Sort by importance score (descending)
importance_df = importance_df.sort_values(by="Importance", ascending=False)

importance_df

Unnamed: 0,Feature,Importance
224,C(RegionWinner_West)[T.Arizona],36.051250
236,C(RegionWinner_West)[T.North Carolina],33.658897
225,C(RegionWinner_West)[T.Baylor],13.252173
237,C(RegionWinner_West)[T.Saint Mary's],4.216350
232,C(RegionWinner_West)[T.Michigan St.],2.055929
...,...,...
69,C(CustomerDMACode)[T.571.0],0.000000
73,C(CustomerDMACode)[T.576.0],0.000000
125,C(CustomerDMACode)[T.651.0],0.000000
77,C(CustomerDMACode)[T.583.0],0.000000
