In [1]:
import warnings
warnings.simplefilter(action='ignore')

from utils import data_handler,features_handler

import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression,LogisticRegression

np.random.seed(100)

TEST_SIZE = 0.25
SWITCH_SIZE = 0.50
START_YEAR = 2014
BASE_PATH = "./tennis"

# HANDLING DATA

In [2]:
# in 2019 I deleted the data after Australian Open and Australian Open data is used for production
# 2014 dataset contains data from 2013 too becouse of ATP500
DATASET_PATHS=[]
for i in range(START_YEAR,2020):
    if i > 2003:
        if i < 2013:
            DATASET_PATHS.append(f"{BASE_PATH}/{i}.xls")
        else:
            DATASET_PATHS.append(f"{BASE_PATH}/{i}.xlsx")

In [3]:
relevant_columns =[
        "Date","Tournament","Court","Surface",
        "Round","Best of","Winner","Loser",
        "WRank","LRank","LPts","WPts","Series",
        "AvgL","AvgW"
]   

In [4]:
# Read all the data from the specified years
matches_data = pd.concat([pd.read_excel(f) for f in DATASET_PATHS], ignore_index=True,sort=True)
matches_data.head()

Unnamed: 0,ATP,AvgL,AvgW,B365L,B365W,Best of,Comment,Court,Date,EXL,...,Tournament,W1,W2,W3,W4,W5,WPts,WRank,Winner,Wsets
0,1,1.94,1.82,2.0,1.72,3,Completed,Outdoor,2013-12-30,2.0,...,Brisbane International,5.0,6.0,7.0,,,774.0,61.0,Matosevic M.,2.0
1,1,3.32,1.32,3.5,1.28,3,Completed,Outdoor,2013-12-30,3.2,...,Brisbane International,6.0,6.0,,,,1090.0,39.0,Nieminen J.,2.0
2,1,2.93,1.39,3.0,1.36,3,Completed,Outdoor,2013-12-30,2.9,...,Brisbane International,6.0,7.0,6.0,,,1130.0,37.0,Cilic M.,2.0
3,1,1.83,1.92,1.8,1.9,3,Completed,Outdoor,2013-12-30,1.85,...,Brisbane International,7.0,6.0,,,,960.0,46.0,Querrey S.,2.0
4,1,3.53,1.29,3.75,1.25,3,Completed,Outdoor,2013-12-30,3.4,...,Brisbane International,6.0,6.0,,,,1520.0,23.0,Dimitrov G.,2.0


In [5]:
# This takes some time (5 6 min)
matches_data = data_handler(matches_data,relevant_columns)
matches_data = features_handler(matches_data,matches_data)

Finished renaming columns!
Finish switching columns!
Finish dealing with unwanted values!
Finish dealing with non numerical values!
Finish dealing with experience feature!
Finish dealing with W/L feature!


### Deal with the train/test data

In [6]:
matches_data.head()

Unnamed: 0,Date,Tournament,Court,Surface,Round,Best of,Series,P1_won,P1,P2,P1Rank,P2Rank,P1Pts,P2Pts,AvgP2,AvgP1,P1_Experince,P2_Experince,P1_W/L,P2_W/L
0,2013-12-30,0,0,0,0,3,0,False,Benneteau J.,Matosevic M.,35.0,61.0,1160.0,774.0,1.94,1.82,0,0,100.0,100.0
1,2013-12-30,0,0,0,0,3,0,True,Nieminen J.,Duckworth J.,39.0,136.0,1090.0,425.0,3.32,1.32,0,0,100.0,100.0
2,2013-12-30,0,0,0,0,3,0,True,Cilic M.,Istomin D.,37.0,45.0,1130.0,965.0,2.93,1.39,0,0,100.0,100.0
3,2013-12-30,0,0,0,0,3,0,False,Tursunov D.,Querrey S.,29.0,46.0,1244.0,960.0,1.83,1.92,0,0,100.0,100.0
4,2013-12-30,0,0,0,0,3,0,True,Dimitrov G.,Haase R.,23.0,43.0,1520.0,977.0,3.53,1.29,0,0,100.0,100.0


In [7]:
features = [
    "Tournament","Court",
    "Surface","Round",
    "Best of","Series",
    "P1Rank","P2Rank",
    "P1_Experince","P2_Experince",
    "P1_W/L","P2_W/L",
    "P1Pts","P2Pts",
    "AvgP1","AvgP2",
]
# mark the relevant features for train

In [8]:
# Split the test and train data 
X_train, X_test, y_train, y_test = train_test_split(matches_data[features], matches_data["P1_won"], test_size = TEST_SIZE) 
print(f"Nr of training data:{len(X_train)}")
print(f"Nr of testing data:{len(X_test)}")

Nr of training data:9948
Nr of testing data:3317


# RANDOM FOREST

In [9]:
# Create the RandomForest Classifier with gini and fit the training data
forest_gini = RandomForestClassifier(n_jobs=2,n_estimators=100,criterion = "gini")
forest_gini.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [10]:
# Create the RandomForest Classifier with entropy and fit the training data
forest_entropy = RandomForestClassifier(n_jobs=2,n_estimators=100,criterion = "entropy")
forest_entropy.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [11]:
# Compares the 2 RandomForestClassifier and choses the best

score_gini = forest_gini.score(X_test, y_test)
score_entropy = forest_entropy.score(X_test, y_test)

print(f"The score for the RandomForestClassifier with gini: {score_gini}")
print(f"The score for the RandomForestClassifier with entropy: {score_entropy}")

forest = forest_gini  if score_gini > score_entropy else forest_entropy

print(forest.score(X_test, y_test))

The score for the RandomForestClassifier with gini: 0.82363581549593
The score for the RandomForestClassifier with entropy: 0.8203195658727767
0.82363581549593


In [12]:
# Draw a little table to see the actual results
preds = forest.predict(X_test)
pd.crosstab(y_test, preds, rownames=['Actial Wins'],colnames=['Predicted Wins'])

Predicted Wins,False,True
Actial Wins,Unnamed: 1_level_1,Unnamed: 2_level_1
False,1351,288
True,297,1381


# REGRESION

In [13]:
# Adding regresion
regr = LogisticRegression(max_iter=2000)

regr.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=2000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
regr.score(X_test, y_test)

0.6499849261380766

In [15]:
preds = regr.predict(X_test)
pd.crosstab(y_test, preds, rownames=['Actial Wins'],colnames=['Predicted Wins'])

Predicted Wins,False,True
Actial Wins,Unnamed: 1_level_1,Unnamed: 2_level_1
False,1089,550
True,611,1067


# k-NN

In [16]:
import math

#  Chosing the k with the thumb rule

neigh = KNeighborsClassifier(n_neighbors= int(math.sqrt(len(X_test))))
neigh.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=57, p=2,
                     weights='uniform')

In [17]:
neigh.score(X_test, y_test)

0.6421465179378957

In [18]:
preds = neigh.predict(X_test)
pd.crosstab(y_test, preds, rownames=['Actial Wins'],colnames=['Predicted Wins'])

Predicted Wins,False,True
Actial Wins,Unnamed: 1_level_1,Unnamed: 2_level_1
False,1065,574
True,613,1065


## Trying the classifiers score on the 2019 data (cheating)

In [19]:
TEST_PATH = f"{BASE_PATH}/FullAO.xlsx"
test_df = pd.concat([pd.read_excel(TEST_PATH)], ignore_index=True,sort=True)

In [20]:
test_df = data_handler(test_df,relevant_columns)
test_df = features_handler(test_df,matches_data)

Finished renaming columns!
Finish switching columns!
Finish dealing with unwanted values!
Finish dealing with non numerical values!
Finish dealing with experience feature!
Finish dealing with W/L feature!


In [21]:
features = [
    "Tournament","Court",
    "Surface","Round",
    "Best of","Series",
    "P1Rank","P2Rank",
    "P1_Experince","P2_Experince",
    "P1_W/L","P2_W/L",
    "P1Pts","P2Pts",
    "AvgP1","AvgP2",
]

In [22]:
print(f"The score for testing on the 2019 AO with LogisticRegression is: { regr.score(test_df[features], test_df['P1_won']) } ")
print(f"The score for testing on the 2019 AO with RandomForestClassifier is: { forest.score(test_df[features], test_df['P1_won']) } ")
print(f"The score for testing on the 2019 AO with k-NN is: { neigh.score(test_df[features], test_df['P1_won']) } ")

The score for testing on the 2019 AO with LogisticRegression is: 0.7007874015748031 
The score for testing on the 2019 AO with RandomForestClassifier is: 0.8110236220472441 
The score for testing on the 2019 AO with k-NN is: 0.6692913385826772 


## Chosing the best method

In [23]:
# We have the RandomForestClassifier the LogisticRegression and k NN
score_list = [regr.score(X_test, y_test),forest.score(X_test, y_test),neigh.score(X_test, y_test)]

pd.DataFrame.from_dict(
    {
        "Name":["LogisticalRegression","RandomForestClassifier","k-NN"],
        "Score":score_list
    }
)

Unnamed: 0,Name,Score
0,LogisticalRegression,0.649985
1,RandomForestClassifier,0.823636
2,k-NN,0.642147


The choice is RandomForestClassifier with the best score

# PREDICTION WITH BEST CLASSIFIER

In [24]:
PROD_PATH = f"{BASE_PATH}/Prod.xlsx"
prod_df = pd.concat([pd.read_excel(PROD_PATH)], ignore_index=True,sort=True)

In [25]:
from utils import rename_columns

In [26]:
# We apply the same as before but know P1_won will only be a switch criteria
prod_df = data_handler(prod_df,relevant_columns)
prod_df = features_handler(prod_df,matches_data)

Finished renaming columns!
Finish switching columns!
Finish dealing with unwanted values!
Finish dealing with non numerical values!
Finish dealing with experience feature!
Finish dealing with W/L feature!


In [27]:
#  Put P1_won back to 0 since it was only a switch criteria
prod_df["P1_won"] = [0 for i in range(len(prod_df))] 
prod_df.head()

Unnamed: 0,Date,Tournament,Court,Surface,Round,Best of,Series,P1_won,P1,P2,P1Rank,P2Rank,P1Pts,P2Pts,AvgP2,AvgP1,P1_Experince,P2_Experince,P1_W/L,P2_W/L
0,2019-01-14,0,0,0,0,5,0,0,Anderson K.,Mannarino A.,6.0,42.0,4810.0,1045.0,9.96,1.06,295,267,58.602151,108.59375
1,2019-01-14,0,0,0,0,5,0,0,Basilashvili N.,Eubanks C.,20.0,170.0,1820.0,312.0,4.28,1.23,133,8,114.516129,300.0
2,2019-01-14,0,0,0,0,5,0,0,Albot R.,Mmoh M.,98.0,107.0,585.0,526.0,1.72,2.14,85,21,226.923077,200.0
3,2019-01-14,0,0,0,0,5,0,0,Andreozzi G.,Travaglia S.,77.0,137.0,708.0,401.0,2.82,1.44,17,15,183.333333,275.0
4,2019-01-14,0,0,0,0,5,0,0,Verdasco F.,Kecmanovic M.,28.0,125.0,1410.0,458.0,2.72,1.47,268,4,88.732394,300.0


#### The following Australian Open Structure will be considerd
- 1st Round: 64 matches
- 2nd Round: 32 matches
- 3rd Round: 16 matches
- 4th Round: 8 matches
- Quarterfinals: 4 matches
- Semifinals: 2 matches
- Final: 1 matches

In [28]:
features = [
    "Tournament","Court",
    "Surface","Round",
    "Best of","Series",
    "P1Rank","P2Rank",
    "P1_Experince","P2_Experince",
    "P1_W/L","P2_W/L",
    "P1Pts","P2Pts",
    "AvgP1","AvgP2",
]

results = [prod_df]
results[0].head()

Unnamed: 0,Date,Tournament,Court,Surface,Round,Best of,Series,P1_won,P1,P2,P1Rank,P2Rank,P1Pts,P2Pts,AvgP2,AvgP1,P1_Experince,P2_Experince,P1_W/L,P2_W/L
0,2019-01-14,0,0,0,0,5,0,0,Anderson K.,Mannarino A.,6.0,42.0,4810.0,1045.0,9.96,1.06,295,267,58.602151,108.59375
1,2019-01-14,0,0,0,0,5,0,0,Basilashvili N.,Eubanks C.,20.0,170.0,1820.0,312.0,4.28,1.23,133,8,114.516129,300.0
2,2019-01-14,0,0,0,0,5,0,0,Albot R.,Mmoh M.,98.0,107.0,585.0,526.0,1.72,2.14,85,21,226.923077,200.0
3,2019-01-14,0,0,0,0,5,0,0,Andreozzi G.,Travaglia S.,77.0,137.0,708.0,401.0,2.82,1.44,17,15,183.333333,275.0
4,2019-01-14,0,0,0,0,5,0,0,Verdasco F.,Kecmanovic M.,28.0,125.0,1410.0,458.0,2.72,1.47,268,4,88.732394,300.0


In [29]:
def merge_rows(df,i,j):
    new_row = {
        "Best of" : [df.iloc[i]["Best of"]],
        "Court" : [df.iloc[i]["Court"]],
        "Date" : [df.iloc[i]["Date"]],
        "Round" : [df.iloc[i]["Round"]],
        "Series" : [df.iloc[i]["Series"]],
        "Surface" : [df.iloc[i]["Surface"]],
        "Tournament" : [df.iloc[i]["Tournament"]]
    }
    
    if df.iloc[i]["P1_won"]:
        winner = "P1"
    else:
        winner = "P2"
    
    new_row["P1"] = [df.iloc[i][winner]]
    new_row["P1Pts"] = [df.iloc[i][winner+"Pts"]]
    new_row["P1Rank"] = [df.iloc[i][winner+"Rank"]]
    new_row["P1_Experince"] = [df.iloc[i][winner+"_Experince"]]
    new_row["P1_W/L"] = [df.iloc[i][winner+"_W/L"]]
    new_row["AvgP1"] = [df.iloc[i]["Avg"+winner]]
    new_row["P1_won"] = 0
    
    if df.iloc[j]["P1_won"]:
        winner = "P1"
    else:
        winner = "P2"
    
    new_row["P2"] = [df.iloc[j][winner]]
    new_row["P2Pts"] = [df.iloc[j][winner+"Pts"]]
    new_row["P2Rank"] = [df.iloc[j][winner+"Rank"]]
    new_row["P2_Experince"] = [df.iloc[j][winner+"_Experince"]]
    new_row["P2_W/L"] = [df.iloc[j][winner+"_W/L"]]
    new_row["AvgP2"] = [df.iloc[i]["Avg"+winner]]
    return new_row
        



In [30]:
def simulate_round(df):
    i=0
    new_df = pd.DataFrame(columns=df.columns)
    while i<len(df)-1:
        new_df = new_df.append(pd.DataFrame(merge_rows(df,i,i+1)))
        i += 2
        
    return new_df


In [31]:
results = [prod_df]
for i in range(6):
    round_sim = simulate_round(results[-1])
    round_sim["P1_won"] = forest.predict(round_sim[features])
    results.append(round_sim)
    
        

### Matches


### The indexes for the matches:
- 1st Round: 0
- 2nd Round: 1
- 3rd Round: 2
- 4th Round: 3
- Quarterfinals: 4
- Semifinals: 5
- Final: 6

In [34]:
results[6]

Unnamed: 0,AvgP1,AvgP2,Best of,Court,Date,P1,P1Pts,P1Rank,P1_Experince,P1_W/L,P1_won,P2,P2Pts,P2Rank,P2_Experince,P2_W/L,Round,Series,Surface,Tournament
0,9.96,9.96,5,0,2019-01-14,Dimitrov G.,1790.0,21.0,301,52.791878,True,Zverev A.,6385.0,4.0,255,49.122807,0,0,0,0
