In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from datetime import datetime,timedelta

import os

os.chdir('Python')
from past_features import *
from elo_features import *
from categorical_features import *
from utilities import *
from additional_stuff import *
os.chdir('../')
import numpy as np
import pandas as pd

# Preprocessing

## Load data
The initial data was downloaded from [http://tennis-data.co.uk/alldata.php](http://tennis-data.co.uk/alldata.php) and stored in the `Data` folder. (Automating this download is possible future work.)

First, load the data from the files and combine them into a single dataframe.

In [2]:
import glob

# Collect the yearly workbooks from the Data folder.
# The list is sorted so that the load order is reproducible: glob returns files
# in filesystem order, and the column order of the concatenated frame (which the
# "first 13 columns" selection below relies on) follows the first file loaded.
#filenames=list(glob.glob("../Data/20*.xls*"))
filenames = sorted(glob.glob("Data/20*.xls*"))
if not filenames:
    # Fail early with an actionable message instead of pd.concat's
    # "No objects to concatenate".
    raise FileNotFoundError(
        "No files matching 'Data/20*.xls*' were found; "
        "check the working directory and the Data folder."
    )

# read_excel has no `encoding` parameter (Excel files are not plain text), and
# pd.concat only accepts `axis` as a keyword in current pandas versions.
l = [pd.read_excel(filename) for filename in filenames]
data = pd.concat(l, axis=0, sort=False)

# Keep the first 13 columns plus the set counts, the comment and the
# PSW/PSL and B365W/B365L odds columns.
data = data[list(data.columns)[:13]+["Wsets","Lsets","Comment"] +["PSW","PSL","B365W","B365L"]]

## Data Cleaning (#1)

In [3]:
### Data cleaning
data = data.sort_values("Date")

# Ranking columns: a missing rank becomes 0 and the "NR" marker becomes 2000,
# after which the column can be stored as plain integers.
for rank_col in ("WRank", "LRank"):
    data[rank_col] = data[rank_col].replace(np.nan, 0).replace("NR", 2000).astype(int)

# Set counts are stored as floats; the stray "`1" entry in Lsets is mapped to 1 first.
data["Wsets"] = data["Wsets"].astype(float)
data["Lsets"] = data["Lsets"].replace("`1", 1).astype(float)

# Remove leading/trailing whitespace from the player names.
for name_col in ("Winner", "Loser"):
    data[name_col] = data[name_col].str.strip()

# the dict 'names_to_correct' is in Python/additional_stuff.py
# (it is applied to the whole frame, then the rows are renumbered from 0)
data = data.replace(names_to_correct).reset_index(drop=True)

### Storage of the raw dataset
# (alternative targets used previously: a CSV export, or the same dump
#  under "../Generated Data/" when running from a sub-folder)
dump(data, "Generated Data/atp_data_raw")

## Preproc. Elo-Rating

The elo score (or elo rating) is a value that describes a player's past performance. It takes into account past wins and losses as well as the strength of the opponents, so a win against a strong competitor yields a larger gain than a win against a weak opponent. Further information can be found on [Wikipedia](https://en.wikipedia.org/wiki/Elo_rating_system).

The calculation method of the elo score is not unique; the algorithm is empirical and has several variants. This is a rather simple implementation. The *K-factor*, for example, is fixed at 32 and can be regarded as a *hyper-parameter*. (Tuning it is left for future work.)

The function is stored in [Python/elo_features.py](Python/elo_features.py)

It turned out that this feature is among the most important ones in this project.

(TODO: Also create surface-dependent elo scores, one for each court surface.)

In [3]:
### Elo rankings data

# Compute the elo rating of each player at the beginning of each match.
elo_rankings = compute_elo_rankings(data)

# Append the elo columns to the match table. `axis` is passed by keyword:
# current pandas versions no longer accept it positionally in pd.concat.
# NOTE: re-running this cell without reloading `data` appends the elo columns again.
data = pd.concat([data, elo_rankings], axis=1)

### Storage of the dataset with elo
#data.to_csv("../Generated Data/atp_data.csv",index=False)
#dump(data,"../Generated Data/atp_data")
dump(data,"Generated Data/atp_data")

NameError: name 'data' is not defined

In [6]:
#######################################################
# Until here basic with elo                ############
#######################################################

In [4]:
# Loading data if necessary
# Restores `data` from "Generated Data/atp_data" (the path the elo cell dumps to),
# so the cells above do not have to be re-run in a fresh kernel.
# NOTE(review): `load` is presumably the counterpart of `dump` from
# Python/utilities.py -- confirm.

# Earlier alternatives, kept for reference (CSV round-trip / other working directory):
#data = pd.read_csv("../Generated Data/atp_data.csv")
#data.Date = data.Date.apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))
#data = load("../Generated Data/atp_data")
data = load("Generated Data/atp_data")

This is just a visual check that the data distribution looks reasonable. Loading the data sometimes produces strange datasets.
A sign that something is wrong is a year with only a few matches (<1000). If something is wrong, the splitting function will raise errors at the latest.


In [None]:
# Sanity check: group the matches by calendar year of `Date` and show, for every
# column, how many non-null values each year contains.
data.groupby(pd.Grouper(key='Date',freq='Y')).agg('count')

# Feature engineering

## Selecting data to proceed
Not all of the data is needed for the following steps; how much we keep depends on how much we want to train on.
The earlier data was primarily needed to calculate the elo scores. Even if we start in 2008, more than 30k matches remain.

In [11]:
# Keep only the matches played in `beg_year` or later.
# (A previous variant selected an explicit (beg, end] date window instead.)
beg_year = 2008
#beg_year = 2017
played_since_beg_year = data.Date.dt.year >= beg_year
indices = data[played_since_beg_year].index

# Store the reduced match table, with rows renumbered from 0.
data_for_saving = data.iloc[indices].reset_index(drop=True)
dump(data_for_saving, "Generated Data/atp_data_f")

## Calculate the other features
Most features look at a player's past and his performance.
For the moment, most features analyze the past **150** and **300** days.
I don't know whether larger time spans would give better results. The time span is a kind of hyperparameter.

The functions below for the feature calculation is stored in [Python/past_features.py](Python/past_features.py)

Note: There are cases where nothing can be said about a player's past. In those cases *missing values / NaN* are produced in the features. Some classifiers can handle them (e.g. XGBoost), others cannot (e.g. AdaBoost with DecisionTree). These missing values will be filtered out later, after splitting.

(TODO: These functions take a long time. It would be nice to run them in parallel to speed up the feature building. I tried and gave up: it does not seem easy to use *multiprocessing* or similar in an IPython environment. The code would have to be moved to a .py file and run from a shell.)

In [4]:
# Build the "look into the past" feature tables with features_past_generation
# (Python/past_features.py). Each call takes a feature-creation function, a
# window size, a name prefix, the match table and the selected row indices.
# Only the 150-window "recent" features are computed in this run; the other
# calls are disabled. NOTE: later cells reference features_recent_300,
# features_general_300 and features_duo_300 or load the other tables from disk,
# so in a fresh kernel the matching lines must be enabled (or the dumps must exist).
#features_player      = features_past_generation(features_player_creation,5,"playerft",data,indices)
#features_player_20   = features_past_generation(features_player_creation,20,"playerft",data,indices)

#features_duo         = features_past_generation(features_duo_creation,150,"duoft",data,indices)
#features_duo_300     = features_past_generation(features_duo_creation,300,"duoft_300",data,indices)

#features_general     = features_past_generation(features_general_creation,150,"generalft",data,indices)
#features_general_300 = features_past_generation(features_general_creation,300,"generalft_300",data,indices)

features_recent      = features_past_generation(features_recent_creation,150,"recentft",data,indices)
#features_recent_300  = features_past_generation(features_recent_creation,300,"recentft_300",data,indices)



0/31748 matches treated. features_recent_creation150
100/31748 matches treated. features_recent_creation150
200/31748 matches treated. features_recent_creation150
300/31748 matches treated. features_recent_creation150
400/31748 matches treated. features_recent_creation150
500/31748 matches treated. features_recent_creation150
600/31748 matches treated. features_recent_creation150
700/31748 matches treated. features_recent_creation150
800/31748 matches treated. features_recent_creation150
900/31748 matches treated. features_recent_creation150
1000/31748 matches treated. features_recent_creation150
1100/31748 matches treated. features_recent_creation150
1200/31748 matches treated. features_recent_creation150
1300/31748 matches treated. features_recent_creation150
1400/31748 matches treated. features_recent_creation150
1500/31748 matches treated. features_recent_creation150
1600/31748 matches treated. features_recent_creation150
1700/31748 matches treated. features_recent_creation150
1800

14600/31748 matches treated. features_recent_creation150
14700/31748 matches treated. features_recent_creation150
14800/31748 matches treated. features_recent_creation150
14900/31748 matches treated. features_recent_creation150
15000/31748 matches treated. features_recent_creation150
15100/31748 matches treated. features_recent_creation150
15200/31748 matches treated. features_recent_creation150
15300/31748 matches treated. features_recent_creation150
15400/31748 matches treated. features_recent_creation150
15500/31748 matches treated. features_recent_creation150
15600/31748 matches treated. features_recent_creation150
15700/31748 matches treated. features_recent_creation150
15800/31748 matches treated. features_recent_creation150
15900/31748 matches treated. features_recent_creation150
16000/31748 matches treated. features_recent_creation150
16100/31748 matches treated. features_recent_creation150
16200/31748 matches treated. features_recent_creation150
16300/31748 matches treated. fe

29000/31748 matches treated. features_recent_creation150
29100/31748 matches treated. features_recent_creation150
29200/31748 matches treated. features_recent_creation150
29300/31748 matches treated. features_recent_creation150
29400/31748 matches treated. features_recent_creation150
29500/31748 matches treated. features_recent_creation150
29600/31748 matches treated. features_recent_creation150
29700/31748 matches treated. features_recent_creation150
29800/31748 matches treated. features_recent_creation150
29900/31748 matches treated. features_recent_creation150
30000/31748 matches treated. features_recent_creation150
30100/31748 matches treated. features_recent_creation150
30200/31748 matches treated. features_recent_creation150
30300/31748 matches treated. features_recent_creation150
30400/31748 matches treated. features_recent_creation150
30500/31748 matches treated. features_recent_creation150
30600/31748 matches treated. features_recent_creation150
30700/31748 matches treated. fe

In [50]:
# Encode the players of the selected matches.
# NOTE(review): this exact call is repeated in the categorical-encoding cell
# further down; confirm whether `data` is expected to be the full table or the
# one already restricted to `indices` when this runs.
players_encoded = features_players_encoding_2(data, indices)

In [23]:
# Putting together the features of 150 and 300 days
# Each pair of tables is joined column-wise (axis=1), so they must share the same row index.
# NOTE: requires the *_300 tables to exist in the kernel, and the cell is not
# idempotent -- running it twice appends the 300-day columns a second time.
features_recent = pd.concat([features_recent,features_recent_300], sort=False, axis=1)
features_general = pd.concat([features_general,features_general_300], sort=False, axis=1)
features_duo = pd.concat([features_duo,features_duo_300], sort=False, axis=1)

In [5]:
# save the features
# Only the recent features are written in this run; enable the other lines
# after recomputing the corresponding tables.
#dump(features_player,"Generated Data/player_features")
#dump(features_player_20,"Generated Data/player_features_20")
#dump(features_duo,"Generated Data/duo_features")
#dump(features_general,"Generated Data/general_features")
dump(features_recent,"Generated Data/recent_features")

In [31]:

# load features to go ahead
# Restores every feature table from "Generated Data/", replacing any table of
# the same name that is currently in the kernel.
features_player=load("Generated Data/player_features")
features_player_20=load("Generated Data/player_features_20")
features_duo=load("Generated Data/duo_features")
features_general=load("Generated Data/general_features")
features_recent=load("Generated Data/recent_features")


In [7]:
# Quick check of the table dimensions as (rows, columns).
#features_general.shape
features_recent.shape

(63496, 11)

In [12]:
# Restrict `data` to the selected matches and renumber the rows from 0.
# NOTE: this rebinds `data`, so the cell must run exactly once per loaded dataset.
data = data.iloc[indices].reset_index(drop=True)
odds = data[["PSW", "PSL"]]

# odds feature: a single "odds" column holding PSW then PSL for every match,
# i.e. two consecutive rows per match
features_odds = pd.DataFrame({"odds": odds.values.flatten()})

In [16]:
# Odds used for evaluation, one (PSW, PSL) pair per match.
eval_odds = np.array(data[["PSW","PSL"]])

# Matches without a PSW quote get 0 for both entries; for all other matches
# the PSW value is kept and the second entry is set to -1.
psw_missing = np.isnan(eval_odds[:, 0])
eval_odds[psw_missing, :] = 0
eval_odds[~psw_missing, 1] = -1

# Flatten row by row, giving two consecutive values per match.
eval_odds_new = eval_odds.flatten()

dump(eval_odds_new, "Generated Data/eval_odds")

array([ 1.212, -1.   ,  1.69 , ..., -1.   ,  2.   , -1.   ])

## Making opponents

## Encoding of categorical features

In [51]:
# Encode the categorical match attributes ("Tournament" is handled by its own
# encoder below instead of being part of this list).
#features_categorical = data[["Series","Court","Surface","Round","Best of","Tournament"]]
features_categorical = data[["Series","Court","Surface","Round","Best of"]]
features_categorical_encoded = categorical_features_encoding(features_categorical)

# Player and tournament encodings.
#players_encoded = features_players_encoding(data)
players_encoded = features_players_encoding_2(data, indices)
tournaments_encoded = features_tournaments_encoding(data)

# Join the encoded blocks column-wise; the player encoding is kept separate and
# not included here. `axis` is passed by keyword because current pandas versions
# no longer accept it positionally in pd.concat.
#features_onehot = pd.concat([features_categorical_encoded,players_encoded,tournaments_encoded],axis=1)
features_onehot = pd.concat([features_categorical_encoded, tournaments_encoded], axis=1)

## Duplication of rows
For the moment we have one row per match.
We "duplicate" each row so that there is one row for each possible outcome of each match.
Of course this is not a simple duplication of each row: some features need to be "inverted".

In [52]:
# Categorical features
# Repeat every row twice in place (row k becomes rows 2k and 2k+1), keeping the column names.
# NOTE: not idempotent -- each additional run of this cell doubles the row count again.
features_onehot = pd.DataFrame(np.repeat(features_onehot.values,2, axis=0),columns=features_onehot.columns)

In [10]:
def make_opp_ft(feats):
    '''
    Build the opponent's view of a table of per-player features.

    `feats` is expected to hold two consecutive rows per match (positions 2k
    and 2k+1). The two rows of every pair are swapped, so row i of the result
    carries the features found in the partner row of row i of `feats`.

    Rows are addressed by position, so `feats` may have any index.

    Parameters
    -----------
    feats : DataFrame with an even number of rows and string column names.

    Output:
    -----------
    Gives back a new DataFrame with the opponent features only: same shape
    and index as `feats`, every column name suffixed with '_opp'.
    It is not concatenated with the input features, and `feats` is left
    unmodified.

    Raises
    -----------
    ValueError if `feats` has an odd number of rows (a row would have no partner).
    '''
    n_rows = feats.shape[0]
    if n_rows % 2 != 0:
        raise ValueError(
            "make_opp_ft expects two rows per match, got an odd number of rows: %d" % n_rows
        )

    # XOR with 1 flips the lowest bit of each position: 0<->1, 2<->3, 4<->5, ...
    partner_positions = np.arange(n_rows) ^ 1

    feats_opp = feats.iloc[partner_positions]
    # Re-attach the original index so the result lines up with `feats` in a column-wise concat.
    feats_opp.index = feats.index
    feats_opp.columns = [x + '_opp' for x in feats.columns]

    return feats_opp


SyntaxError: invalid syntax (<ipython-input-24-90d2a65c4071>, line 9)

In [53]:
def add_opp_features(feat_ini, feat_to_opp=None):
    '''
    Append opponent features to a feature table.

    feat_ini    : table that is kept as-is on the left-hand side.
    feat_to_opp : table handed to make_opp_ft to build the '_opp' columns;
                  defaults to `feat_ini` itself.

    Returns the column-wise concatenation of `feat_ini` and the '_opp' columns.
    '''
    if feat_to_opp is None:
        feat_to_opp = feat_ini
    return pd.concat([feat_ini, make_opp_ft(feat_to_opp)], axis=1)


# ------- player_ft
features_player_opp = add_opp_features(features_player)

# ------- player_ft 20
features_player_20_opp = add_opp_features(features_player_20)

# ------- recent ft
features_recent_opp = add_opp_features(features_recent)

# ------- general ft
# Only the columns at positions 9, 10 and 11 are mirrored for the opponent.
features_general_opp = add_opp_features(
    features_general,
    features_general[list(features_general.columns)[9:12]],
)

# ------- odds ft
features_odds_opp = add_opp_features(features_odds)

## Odd features
We will include the odds for the moment. Because of their enormous influence on the results, we will drop them for some models later in order to form our own opinion.

In [40]:
# odds feature: a single "odds" column built from the row-by-row flattened
# `odds` table, i.e. the values of each row of `odds` end up on consecutive rows
features_odds = pd.DataFrame({"odds": odds.values.flatten()})

## Putting all together

In [61]:
### Building of the final dataset
# You can remove some features to see the effect on the ROI
# All blocks are joined column-wise, so they must share the same row index.
# `axis` is passed by keyword because current pandas versions no longer accept
# it positionally in pd.concat.
features = pd.concat([features_odds_opp,
                  features_onehot,
                  features_player_opp,
                  features_player_20_opp,
                  features_duo,
                  features_general_opp,
                  players_encoded,
                  features_recent_opp], axis=1)

#features.to_csv("../Generated Data/atp_data_features.csv",index=True)
dump(features,"Generated Data/atp_data_features2")

In [32]:
### Building of the final dataset
# You can remove some features to see the effect on the ROI
# Variant without the '_opp' columns and without the player encoding.
# NOTE: this rebinds `features`; nothing is written to disk here (dump disabled below).
# `axis` is passed by keyword because current pandas versions no longer accept
# it positionally in pd.concat.
features = pd.concat([features_odds,
                  features_onehot,
                  features_player,
                  features_duo,
                  features_general,
                  features_recent], axis=1)

#features.to_csv("../Generated Data/atp_data_features.csv",index=True)
#dump(data,"Generated Data/atp_data_features")

In [33]:
# - Have fun with the model building ! ;-)

In [60]:
# Dimensions of the assembled feature table as (rows, columns).
features.shape

(63496, 2068)