In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset stored there can be read from this runtime.
drive.mount('/content/drive')

Mounted at /content/drive


Utils

In [None]:
import pandas as pd
def load (path):
  """Read the CSV at `path` and return its contents as a pandas DataFrame."""
  return pd.read_csv(path)

In [None]:
def features_type (df) :
  """Split the columns of `df` into numerical and non-numerical names.

  A column counts as numerical when its dtype is anything other than "object".

  Returns:
    (numerical, non_num): `numerical` is a pandas Index of column names,
    `non_num` is a plain list of the remaining column names in column order.
  """
  column_dtypes = df.dtypes
  numerical = column_dtypes[column_dtypes != "object"].index
  non_num = [name for name in list(df.columns) if name not in numerical]
  return numerical , non_num

In [None]:
def splitting_data (df, train_size=2700, random_state=None) :
  """Shuffle `df` and cut it into a train part and a test part.

  Args:
    df: DataFrame to split.
    train_size: number of shuffled rows that go to the train part; the rest
      form the test part. Defaults to 2700, the value previously hard-coded.
    random_state: seed forwarded to `DataFrame.sample`. The default `None`
      keeps the original unseeded (non-reproducible) shuffle; pass an int to
      get the same split on every run.

  Returns:
    (train, test): both are slices of the shuffled frame, whose index has
    been reset to 0..len(df)-1.
  """
  shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
  # Positional slicing: if df has fewer than train_size rows, test is simply empty.
  train, test = shuffled.iloc[:train_size], shuffled.iloc[train_size:]
  return train,test


Live data

In [None]:
# Read the match dataset from the mounted Drive into `data`.
data = load ('/content/drive/MyDrive/Tennis.csv')

In [None]:
# Shuffle the rows and split them into train / test frames.
# NOTE(review): the shuffle inside splitting_data is unseeded, so the split differs between runs.
train,test = splitting_data (data)

In [None]:
# Show one training row to check the column layout.
train.head(1)

Unnamed: 0.1,Unnamed: 0,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,best_of,round,Match,Player_Side1,Player_Side2,PS1_age,PS1_country,PS1_hand,PS1_height,PS1_points,PS1_rank,PS2_age,PS2_country,PS2_hand,PS2_height,PS2_points,PS2_rank,winner
0,811,Us Open,Hard,128,G,20200831,218,5,R16,Alex De Minaur vs Vasek Pospisil,Alex De Minaur,Vasek Pospisil,20.884326,AUS,R,183.0,1775.0,18.0,29.557837,CAN,R,193.0,360.0,146.0,1.0


In [None]:
# Collect the names of the non-numeric columns (later passed to CatBoost as categorical
# features), after dropping the target and the columns excluded from modelling.
_, non_num = features_type(
    data.drop(columns=['winner','Unnamed: 0','tourney_level','draw_size'])
)

In [None]:
# Store the target column as int64.
# NOTE(review): `train`/`test` were created from `data` earlier, so this cast appears to
# affect `data` only — confirm whether the split frames should be cast as well.
data['winner'] = data['winner'].astype('int64')

Adding features

In [None]:
def _add_difference_features(df):
  """Return a copy of `df` with the squared rank / points gap columns appended.

  rank_difference   = PS1_rank**2   - PS2_rank**2
  points_difference = PS1_points**2 - PS2_points**2
  """
  return df.assign(
      rank_difference=df['PS1_rank']**2 - df['PS2_rank']**2,
      points_difference=df['PS1_points']**2 - df['PS2_points']**2,
  )

# `train` and `test` were split off from `data` before this cell runs, so adding the
# columns to `data` alone never reaches the model. Apply the same transformation to
# all three frames so the engineered features are actually used downstream.
data = _add_difference_features(data)
train = _add_difference_features(train)
test = _add_difference_features(test)

Model

In [None]:

class CFG_Catboost :
  """Settings shared by the cross-validation and CatBoost training cells."""

  # Cross-validation: random_state and fold count handed to StratifiedKFold.
  SEED = 42
  n_splits = 5

  # Name of the label column.
  TARGET_COL = 'winner'

  # Columns left out of the feature matrix (the target plus unused columns).
  remove_features = ['winner','Unnamed: 0','tourney_level','draw_size']

  # Columns passed to CatBoost as categorical features (the non-numeric columns found earlier).
  categ_features = non_num

  # Keyword arguments forwarded verbatim to CatBoostClassifier.
  # NOTE(review): 'random_seed' is 0 while SEED above is 42 — confirm this is intentional.
  catboost_params = {
      'learning_rate': 0.05,
      'iterations': 10000,
      'eval_metric': 'Accuracy',
      'use_best_model': True,
      'verbose': 100,
      'random_seed': 0,
      'max_depth': 3,
  }

In [None]:
def features_utils (train):
  """Return the column names of `train` that are used as model features.

  A column is kept unless it is listed in `CFG_Catboost.remove_features`;
  the original column order is preserved.
  """
  excluded = CFG_Catboost.remove_features
  return [name for name in train.columns if name not in excluded]

In [None]:
# Feature column names, derived from the training frame; read as a global by divide_train and predict.
features_columns = features_utils (train)

In [None]:
# Display the selected feature names.
features_columns

['tourney_name',
 'surface',
 'tourney_date',
 'match_num',
 'best_of',
 'round',
 'Match',
 'Player_Side1',
 'Player_Side2',
 'PS1_age',
 'PS1_country',
 'PS1_hand',
 'PS1_height',
 'PS1_points',
 'PS1_rank',
 'PS2_age',
 'PS2_country',
 'PS2_hand',
 'PS2_height',
 'PS2_points',
 'PS2_rank']

In [None]:
def divide_train (train):
  """Build the feature matrix, target vector and fold splitter for training.

  Reads the global `features_columns` and the settings in `CFG_Catboost`.

  Returns:
    (X, y, skf): the feature columns of `train`, its target column, and a
    shuffled StratifiedKFold configured with CFG_Catboost.n_splits / SEED.
  """
  splitter = StratifiedKFold(
      n_splits=CFG_Catboost.n_splits,
      shuffle=True,
      random_state=CFG_Catboost.SEED,
  )
  features = train[features_columns]
  target = train[CFG_Catboost.TARGET_COL]
  return features, target, splitter

In [None]:
from sklearn.model_selection import StratifiedKFold
# Feature matrix, target and fold splitter; `skf` is read as a global by StratifiedKFold_Train.
X,y,skf= divide_train (train)

In [None]:
# Install the pinned CatBoost release; --quiet keeps pip's log out of the cell output.
# NOTE(review): consider `%pip install` so the package is installed into the kernel's environment.
!pip install catboost==0.22 --quiet


In [None]:
from catboost import CatBoostClassifier , Pool
def StratifiedKFold_Train(X,y):
  """Train one CatBoost classifier per stratified fold and return all of them.

  Uses the global `skf` splitter and the settings in `CFG_Catboost`. In each
  fold the held-out part is passed as the evaluation set, which drives early
  stopping (100 rounds) and, through `use_best_model`, best-iteration selection.

  Args:
    X: feature DataFrame.
    y: target aligned row-for-row with X (pandas Series, or an array-like
       that supports integer-array indexing).

  Returns:
    List of fitted CatBoostClassifier objects, one per fold, in fold order.
  """
  estimators = []
  # skf.split yields POSITIONAL indices. Plain `y[idx]` on a Series is a label lookup and
  # is only correct when the index is a fresh 0..n-1 range, so select by position instead.
  y_at = y.iloc if hasattr(y, 'iloc') else y
  for fold_, (trn_idx, val_idx) in enumerate(skf.split(X,y)):
      print(50*'-')
      print('Fold:',fold_+1)
      X_train, y_train = X.iloc[trn_idx,:], y_at[trn_idx]
      X_val, y_val = X.iloc[val_idx,:], y_at[val_idx]

      estimator = CatBoostClassifier(**CFG_Catboost.catboost_params)
      estimator.fit(Pool(X_train,y_train,cat_features = CFG_Catboost.categ_features),
                    eval_set = Pool(X_val,y_val,cat_features = CFG_Catboost.categ_features),
                    early_stopping_rounds=100)
      estimators.append(estimator)
      print(50*'-')
      print()
  return estimators


In [None]:
# Fit one CatBoost model per fold; the training log for each fold is printed below.
estimators = StratifiedKFold_Train(X,y)

--------------------------------------------------
Fold: 1
0:	learn: 0.5949074	test: 0.5444444	best: 0.5444444 (0)	total: 6.47ms	remaining: 1m 4s
100:	learn: 0.7541667	test: 0.6537037	best: 0.6629630 (86)	total: 551ms	remaining: 54s
200:	learn: 0.7824074	test: 0.6703704	best: 0.6740741 (198)	total: 1.15s	remaining: 56.1s
300:	learn: 0.7990741	test: 0.6870370	best: 0.6888889 (262)	total: 1.73s	remaining: 55.8s
400:	learn: 0.8087963	test: 0.6851852	best: 0.6925926 (367)	total: 2.3s	remaining: 55.1s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6925925926
bestIteration = 367

Shrink model to first 368 iterations.
--------------------------------------------------

--------------------------------------------------
Fold: 2
0:	learn: 0.6134259	test: 0.6203704	best: 0.6203704 (0)	total: 6.1ms	remaining: 1m
100:	learn: 0.7666667	test: 0.7166667	best: 0.7222222 (75)	total: 564ms	remaining: 55.3s
200:	learn: 0.7861111	test: 0.7296296	best: 0.7370370 (178)	total: 1.13s	rem

In [None]:
def predict (estimators):
  """Average the predictions of all fold models on the global `test` frame.

  Each estimator predicts on `test[features_columns]`; the outputs are then
  averaged element-wise across estimators.
  NOTE(review): `estimator.predict` presumably returns class labels here, so the
  mean is the fraction of models voting for class 1 — confirm.
  """
  per_model_preds = [model.predict(test[features_columns]) for model in estimators]
  return np.mean(per_model_preds,axis=0)


In [None]:
import numpy as np
# Averaged predictions of the fold models on the held-out test frame.
catboost_preds = predict (estimators)


In [None]:
import catboost
# Persist every fold model to the working directory as catboost1, catboost2, ... (1-based).
for idx, estimator in enumerate(estimators) :
  estimator.save_model( 'catboost{}'.format(idx + 1))

In [None]:
from sklearn.metrics import accuracy_score
# `catboost_preds` is the mean of the fold models' predictions, so (assuming the models emit
# 0/1 labels) it can be fractional, e.g. 0.6. Casting straight to int64 truncates toward zero
# and yields 1 only when every model predicts 1; thresholding at 0.5 gives a majority vote.
# Arguments are passed in the documented (y_true, y_pred) order.
accuracy_score(test.winner, (catboost_preds >= 0.5).astype('int64'))

0.7229508196721312

In [None]:
# List the columns of the training feature matrix.
X.columns

Index(['tourney_name', 'surface', 'tourney_date', 'match_num', 'best_of',
       'round', 'Match', 'Player_Side1', 'Player_Side2', 'PS1_age',
       'PS1_country', 'PS1_hand', 'PS1_height', 'PS1_points', 'PS1_rank',
       'PS2_age', 'PS2_country', 'PS2_hand', 'PS2_height', 'PS2_points',
       'PS2_rank'],
      dtype='object')

In [None]:
# Summarise dtypes and non-null counts of the training feature matrix.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2700 entries, 0 to 2699
Data columns (total 21 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   tourney_name  2700 non-null   object 
 1   surface       2700 non-null   object 
 2   tourney_date  2700 non-null   int64  
 3   match_num     2700 non-null   int64  
 4   best_of       2700 non-null   int64  
 5   round         2700 non-null   object 
 6   Match         2700 non-null   object 
 7   Player_Side1  2700 non-null   object 
 8   Player_Side2  2700 non-null   object 
 9   PS1_age       2700 non-null   float64
 10  PS1_country   2700 non-null   object 
 11  PS1_hand      2700 non-null   object 
 12  PS1_height    2700 non-null   float64
 13  PS1_points    2700 non-null   float64
 14  PS1_rank      2700 non-null   float64
 15  PS2_age       2700 non-null   float64
 16  PS2_country   2700 non-null   object 
 17  PS2_hand      2700 non-null   object 
 18  PS2_height    2700 non-null 

In [None]:
# Show one row of the training feature matrix.
X.head(1)

Unnamed: 0,tourney_name,surface,tourney_date,match_num,best_of,round,Match,Player_Side1,Player_Side2,PS1_age,PS1_country,PS1_hand,PS1_height,PS1_points,PS1_rank,PS2_age,PS2_country,PS2_hand,PS2_height,PS2_points,PS2_rank
0,Us Open,Hard,20200831,218,5,R16,Alex De Minaur vs Vasek Pospisil,Alex De Minaur,Vasek Pospisil,20.884326,AUS,R,183.0,1775.0,18.0,29.557837,CAN,R,193.0,360.0,146.0
