## Imports

In [3]:
import pandas as pd
import numpy as np
import catboost
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [4]:
bracket_training = pd.read_csv('bracket_training_with_names.csv')
bracket_training.head()

Unnamed: 0.1,Unnamed: 0,CustomerID,CustomerAreaCode,CustomerPostalCode,CustomerPostalCodeLatitude,CustomerPostalCodeLongitude,CustomerDMACode,CustomerDMADescription,NCAACustomerRecordCreated,BracketEntryId,BracketEntryCreatedDate,RegionWinner_East,RegionWinner_West,RegionWinner_South,RegionWinner_Midwest,SemifinalWinner_East_West,SemifinalWinner_South_Midwest,NationalChampion
0,0,47028,,36093,32.5622,-86.0994,698.0,MONTGOMERY (SELMA),2021-12-25,1723503,2024-03-19 10:27:15 -0400,UConn,Arizona,James Madison,Tennessee,UConn,Tennessee,Tennessee
1,1,3511,616.0,49464,42.8256,-86.0104,563.0,GRAND RAPIDS - KALMZOO - B. CRK,2021-04-02,963479,2024-03-18 10:16:39 -0400,UConn,Baylor,Kentucky,Kansas,UConn,Kentucky,UConn
2,2,58445,703.0,22210,38.8808,-77.1129,511.0,"WASHINGTON, DC (HAGRSTWN)",2021-04-02,810038,2024-03-18 00:21:47 -0400,UConn,Baylor,Houston,Purdue,UConn,Purdue,Purdue
3,3,28833,,78218,29.4969,-98.4032,641.0,SAN ANTONIO,2023-11-16,3384825,2024-03-21 10:28:56 -0400,Iowa St.,Saint Mary's,NC State,Purdue,Iowa St.,NC State,NC State
4,4,37899,,14212,42.8946,-78.8245,514.0,BUFFALO,2022-03-16,2828017,2024-03-20 20:14:52 -0400,Auburn,North Carolina,Marquette,Creighton,North Carolina,Marquette,North Carolina


In [5]:
bracket_test = pd.read_csv('bracket_test_with_names.csv')
bracket_test['SemifinalWinner_East_West'] = ""
bracket_test.head()

Unnamed: 0.1,Unnamed: 0,CustomerID,CustomerAreaCode,CustomerPostalCode,CustomerPostalCodeLatitude,CustomerPostalCodeLongitude,CustomerDMACode,CustomerDMADescription,NCAACustomerRecordCreated,BracketEntryId,BracketEntryCreatedDate,RegionWinner_East,RegionWinner_West,RegionWinner_South,RegionWinner_Midwest,SemifinalWinner_East_West
0,0,73662,919.0,27539,35.7225,-78.8408,560.0,RALEIGH - DURHAM (FAYETVLLE),3/29/20,2074118,3/19/24 18:50,UConn,North Carolina,Houston,Purdue,
1,1,6679,360.0,97206,45.484,-122.5973,820.0,"PORTLAND, OR",4/2/24,2692634,3/20/24 16:56,UConn,North Carolina,Duke,Kansas,
2,2,63024,270.0,42754,37.4603,-86.3249,529.0,LOUISVILLE,12/8/21,1252684,3/18/24 15:13,Iowa St.,Arizona,Kentucky,Creighton,
3,3,60371,206.0,98178,47.4924,-122.2359,819.0,SEATTLE - TACOMA,3/22/23,1950205,3/19/24 15:21,UConn,North Carolina,Houston,Purdue,
4,4,18415,717.0,19038,40.1096,-75.155,504.0,PHILADELPHIA,2/20/24,2756293,3/20/24 18:40,UConn,North Carolina,Marquette,Creighton,


In [6]:
bracket_training.shape

(130002, 18)

In [7]:
bracket_test.shape

(14445, 16)

## Convert Dataset into CatBoost compatible format

In [8]:
bracket_training_simple = bracket_training[['CustomerDMACode', 'RegionWinner_East', 'RegionWinner_West', 'SemifinalWinner_East_West']]
bracket_training_simple['CustomerDMACode'] = bracket_training_simple['CustomerDMACode'].astype(str).str[:-2]
bracket_training_simple.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bracket_training_simple['CustomerDMACode'] = bracket_training_simple['CustomerDMACode'].astype(str).str[:-2]


Unnamed: 0,CustomerDMACode,RegionWinner_East,RegionWinner_West,SemifinalWinner_East_West
0,698,UConn,Arizona,UConn
1,563,UConn,Baylor,UConn
2,511,UConn,Baylor,UConn
3,641,Iowa St.,Saint Mary's,Iowa St.
4,514,Auburn,North Carolina,North Carolina


In [11]:
bracket_test_simple = bracket_test[['CustomerDMACode', 'RegionWinner_East', 'RegionWinner_West', 'SemifinalWinner_East_West']]
bracket_test_simple['CustomerDMACode'] = bracket_test_simple['CustomerDMACode'].astype(str).str[:-2]
bracket_test_simple.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bracket_test_simple['CustomerDMACode'] = bracket_test_simple['CustomerDMACode'].astype(str).str[:-2]


Unnamed: 0,CustomerDMACode,RegionWinner_East,RegionWinner_West,SemifinalWinner_East_West
0,560,UConn,North Carolina,
1,820,UConn,North Carolina,
2,529,Iowa St.,Arizona,
3,819,UConn,North Carolina,
4,504,UConn,North Carolina,


In [12]:
# Define training data (from bracket_training_simple)
X_train = bracket_training_simple[['CustomerDMACode', 'RegionWinner_East', 'RegionWinner_West']]
y_train = bracket_training_simple['SemifinalWinner_East_West']

# Define test data (from bracket_test_simple)
X_test = bracket_test_simple[['CustomerDMACode', 'RegionWinner_East', 'RegionWinner_West']]

# Define categorical features
cat_features = ['CustomerDMACode', 'RegionWinner_East', 'RegionWinner_West']

# Initialize and train the model
model = CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, cat_features=cat_features, verbose=2)
model.fit(X_train, y_train)

# Make predictions on bracket_test_simple
bracket_test_simple['SemifinalWinner_East_West'] = model.predict(X_test)

0:	learn: 2.0179613	total: 2.1s	remaining: 3m 28s
2:	learn: 1.6024645	total: 6s	remaining: 3m 14s
4:	learn: 1.3746278	total: 9.68s	remaining: 3m 3s
6:	learn: 1.2203340	total: 13.9s	remaining: 3m 4s
8:	learn: 1.1084844	total: 17.9s	remaining: 3m 1s
10:	learn: 1.0203695	total: 22s	remaining: 2m 57s
12:	learn: 0.9518755	total: 26.2s	remaining: 2m 55s
14:	learn: 0.8995257	total: 30.2s	remaining: 2m 51s
16:	learn: 0.8552939	total: 34.4s	remaining: 2m 48s
18:	learn: 0.8186093	total: 38.7s	remaining: 2m 44s
20:	learn: 0.7901622	total: 42.9s	remaining: 2m 41s
22:	learn: 0.7654370	total: 47.3s	remaining: 2m 38s
24:	learn: 0.7491074	total: 52s	remaining: 2m 35s
26:	learn: 0.7331035	total: 56.1s	remaining: 2m 31s
28:	learn: 0.7182396	total: 1m	remaining: 2m 28s
30:	learn: 0.7042140	total: 1m 5s	remaining: 2m 25s
32:	learn: 0.6934283	total: 1m 10s	remaining: 2m 22s
34:	learn: 0.6854112	total: 1m 15s	remaining: 2m 19s
36:	learn: 0.6790353	total: 1m 20s	remaining: 2m 17s
38:	learn: 0.6729365	total: 