In [55]:
import pandas as pd
import numpy as np
import openpyxl
import random
import copy

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from xgboost import XGBClassifier

df = pd.read_csv("atp_tennis.csv")


objs = df.select_dtypes("object").columns
print(", ".join(objs))

Tournament, Date, Series, Court, Surface, Round, Player_1, Player_2, Winner, Score


# Data Frame Corrections

In [56]:
df = df[~((df["Odd_1"] == -1) | (df["Odd_2"] == -1))]
df = df.drop(columns=["Pts_1", "Pts_2", "Score"])
df.head()

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,Rank_1,Rank_2,Odd_1,Odd_2
2876,AAPT Championships,1/1/2001,International,Outdoor,Hard,1st Round,3,Haas T.,Smith L.,Haas T.,23,485,1.11,3.75
2878,AAPT Championships,1/1/2001,International,Outdoor,Hard,1st Round,3,Hewitt L.,Arthurs W.,Hewitt L.,7,83,1.2,3.55
2890,AAPT Championships,1/1/2001,International,Outdoor,Hard,2nd Round,3,Haas T.,Malisse X.,Haas T.,23,127,1.2825,2.925
2892,AAPT Championships,1/1/2001,International,Outdoor,Hard,2nd Round,3,Hewitt L.,Phau B.,Hewitt L.,7,208,1.13,5.5
2896,AAPT Championships,1/1/2001,International,Outdoor,Hard,2nd Round,3,Massu N.,Clement A.,Massu N.,87,18,2.4075,1.43


In [57]:

df["player1SurfaceProp"] = 0.0
df["player2SurfaceProp"] = 0.0
df["player1H2H"] = 0.0
df["player2H2H"] = 0.0
df["player1RoundProp"] = 0.0
df["player2RoundProp"] = 0.0
df["player1BestOfProp"] = 0.0
df["player2BestOfProp"] = 0.0
df["Target"] = 0.0

def create_tracker_base(col):
    unique_stuff = df[col].unique().tolist()
    base = {}
    for i in unique_stuff:
        base[i] = [0, 0]
    return base

surface_tracker = create_tracker_base("Surface")
round_tracker = create_tracker_base("Round")
bestof_tracker = create_tracker_base("Best of")
# first index - total number of games won
# second index - total number of games played
tracker_list = [0, 0, surface_tracker, round_tracker, bestof_tracker]
print(tracker_list)

[0, 0, {'Hard': [0, 0], 'Carpet': [0, 0], 'Clay': [0, 0], 'Grass': [0, 0]}, {'1st Round': [0, 0], '2nd Round': [0, 0], 'Quarterfinals': [0, 0], 'The Final': [0, 0], 'Semifinals': [0, 0], '3rd Round': [0, 0], '4th Round': [0, 0], 'Round Robin': [0, 0]}, {3: [0, 0], 5: [0, 0]}]


In [58]:
tracker = {}
h2h = {}

def calc_prop(w, g):
    return (w + 1)/(g + 2)

def update_df(i, df, player, surface, bestof, rounds, player_num, tracker, tracking_stuff):
    if player not in tracker:
        tracker[player] = copy.deepcopy(tracker_list)
    df.at[df.index[i], "player" + str(player_num) + "SurfaceProp"] = calc_prop(tracker[player][2][surface][0], tracker[player][2][surface][1])
    df.at[df.index[i], "player" + str(player_num) + "RoundProp"] = calc_prop(tracker[player][3][rounds][0], tracker[player][3][rounds][1])
    df.at[df.index[i], "player" + str(player_num) + "BestOfProp"] = calc_prop(tracker[player][4][bestof][0], tracker[player][4][bestof][1])
    for j in range(len(tracking_stuff)):
        tracker[player][j + 2][tracking_stuff[j]][1] += 1
    return df, tracker

for i in range(len(df)):
    p1 = df.iloc[i, df.columns.get_loc("Player_1")]
    p2 = df.iloc[i, df.columns.get_loc("Player_2")]
    surface = df.iloc[i, df.columns.get_loc("Surface")]
    bestof = df.iloc[i, df.columns.get_loc("Best of")]
    rounds = df.iloc[i, df.columns.get_loc("Round")]
    w = df.iloc[i, df.columns.get_loc("Winner")]
    players = frozenset([p1, p2])
    tracking_stuff = [surface, rounds, bestof]

    if players not in h2h:
        h2h[players] = {p1: 0, p2: 0}


    df, tracker = update_df(i, df, p1, surface, bestof, rounds, 1, tracker, tracking_stuff)
    df, tracker = update_df(i, df, p2, surface, bestof, rounds, 2, tracker, tracking_stuff)



    for j in range(len(tracking_stuff)):
        tracker[w][j + 2][tracking_stuff[j]][0] += 1


    df.at[df.index[i], "player1H2H"] = calc_prop(h2h[players][p1], (h2h[players][p1] + h2h[players][p2]))
    df.at[df.index[i], "player2H2H"] = calc_prop(h2h[players][p2], (h2h[players][p1] + h2h[players][p2]))

    h2h[players][w] += 1

    if df.iloc[i, df.columns.get_loc("Winner")] == p1:
        df.at[df.index[i], "Target"] = 1
    else:
        df.at[df.index[i], "Target"] = 0

df.head()


Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,...,Odd_2,player1SurfaceProp,player2SurfaceProp,player1H2H,player2H2H,player1RoundProp,player2RoundProp,player1BestOfProp,player2BestOfProp,Target
2876,AAPT Championships,1/1/2001,International,Outdoor,Hard,1st Round,3,Haas T.,Smith L.,Haas T.,...,3.75,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,1.0
2878,AAPT Championships,1/1/2001,International,Outdoor,Hard,1st Round,3,Hewitt L.,Arthurs W.,Hewitt L.,...,3.55,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,1.0
2890,AAPT Championships,1/1/2001,International,Outdoor,Hard,2nd Round,3,Haas T.,Malisse X.,Haas T.,...,2.925,0.666667,0.5,0.5,0.5,0.5,0.5,0.666667,0.5,1.0
2892,AAPT Championships,1/1/2001,International,Outdoor,Hard,2nd Round,3,Hewitt L.,Phau B.,Hewitt L.,...,5.5,0.666667,0.5,0.5,0.5,0.5,0.5,0.666667,0.5,1.0
2896,AAPT Championships,1/1/2001,International,Outdoor,Hard,2nd Round,3,Massu N.,Clement A.,Massu N.,...,1.43,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,1.0


In [63]:
df[["Day", "Month", "Year"]] = df["Date"].str.split("/", expand=True).astype(int)

actualDf = df[["Target", "Series", "Surface", "Round", "Best of",
                "player1SurfaceProp", "player2SurfaceProp",
                "player1H2H", "player2H2H", "player1RoundProp",
                "player2RoundProp", "player1BestOfProp", "player2BestOfProp",
                "Rank_1", "Rank_2", "Odd_1", "Odd_2", "Tournament"]].copy()

actualDf = pd.get_dummies(actualDf, columns=["Series", "Surface", "Round", "Tournament"])


actualDf.head()

Unnamed: 0,Target,Best of,player1SurfaceProp,player2SurfaceProp,player1H2H,player2H2H,player1RoundProp,player2RoundProp,player1BestOfProp,player2BestOfProp,...,Tournament_Viking International,Tournament_Western & Southern Financial Group Masters,Tournament_Wimbledon,Tournament_Winston-Salem Open at Wake Forest University,Tournament_Zhuhai Championships,Tournament_Zhuhai Open,Tournament_adidas International,Tournament_bet-at-home Open,Tournament_bett1HULKS Championship,Tournament_bett1HULKS Indoors
2876,1.0,3,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,False,False,False,False,False,False,False,False,False,False
2878,1.0,3,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,False,False,False,False,False,False,False,False,False,False
2890,1.0,3,0.666667,0.5,0.5,0.5,0.5,0.5,0.666667,0.5,...,False,False,False,False,False,False,False,False,False,False
2892,1.0,3,0.666667,0.5,0.5,0.5,0.5,0.5,0.666667,0.5,...,False,False,False,False,False,False,False,False,False,False
2896,1.0,3,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,False,False,False,False,False,False,False,False,False,False


In [72]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from xgboost import XGBClassifier
import xgboost as xgb
from packaging import version
import numpy as np

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score, make_scorer
import numpy as np

from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import xgboost as xgb
import numpy as np
from packaging import version

import numpy as np, xgboost as xgb, sklearn
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from xgboost import XGBRegressor

X = actualDf.drop(columns=["Target"])
y = actualDf["Target"].astype(int)

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

param_grid = {
    'max_depth': [2],
    'learning_rate': [0.01],
    'n_estimators': [480,490,500]
}

xgb = XGBRegressor(n_estimators=1000, objective='binary:logistic', random_state=42)

grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

best_model = grid_search.best_estimator_


y_proba = best_model.predict(X_test)[:, 1]

from sklearn.metrics import roc_auc_score
test_auc = roc_auc_score(y_test, y_proba)
print("Test AUC:", test_auc)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] END ..learning_rate=0.01, max_depth=2, n_estimators=480; total time=  50.0s
[CV] END ..learning_rate=0.01, max_depth=2, n_estimators=480; total time=  50.2s
[CV] END ..learning_rate=0.01, max_depth=2, n_estimators=480; total time=  50.4s
[CV] END ..learning_rate=0.01, max_depth=2, n_estimators=490; total time=  51.2s
[CV] END ..learning_rate=0.01, max_depth=2, n_estimators=490; total time=  51.2s
[CV] END ..learning_rate=0.01, max_depth=2, n_estimators=490; total time=  51.3s
[CV] END ..learning_rate=0.01, max_depth=2, n_estimators=500; total time=  51.9s
[CV] END ..learning_rate=0.01, max_depth=2, n_estimators=500; total time=  52.0s
[CV] END ..learning_rate=0.01, max_depth=2, n_estimators=500; total time=  52.1s
Best parameters: {'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 500}
Best score: 0.21852145592371622


IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [70]:
y_proba = best_model.predict(X_test)

from sklearn.metrics import roc_auc_score
test_auc = roc_auc_score(y_test, y_proba)
print("Test AUC:", test_auc)

Test AUC: 0.767449841932701
