In [43]:
import os
import tarfile
from six.moves import urllib
import pandas as pd

TENNIS_DATA_PATH = 'data'

TEST_DATA_YEARS = [2016]
TRAIN_DATA_YEARS = [2000, 2001, 2002, 2003, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2017]

TENNIS_COLUMNS = [
    'winner_id',
    'winner_rank',
    'winner_age',
    'winner_hand',
    'loser_id',
    'loser_rank',
    'loser_age',
    'loser_hand',
    'score',
]

# tourney_id            16399 non-null object
# tourney_name          16399 non-null object
# surface               16399 non-null object
# draw_size             16399 non-null int64
# tourney_level         16399 non-null object
# tourney_date          16399 non-null int64
# match_num             16399 non-null int64
# winner_id             16399 non-null int64
# winner_seed           6458 non-null float64
# winner_entry          1953 non-null object
# winner_name           16399 non-null object
# winner_hand           16398 non-null object
# winner_ht             15891 non-null float64
# winner_ioc            16399 non-null object
# winner_age            16395 non-null float64
# winner_rank           15819 non-null float64
# winner_rank_points    15819 non-null float64
# loser_id              16399 non-null int64
# loser_seed            3508 non-null float64
# loser_entry           3182 non-null object
# loser_name            16399 non-null object
# loser_hand            16399 non-null object
# loser_ht              15437 non-null float64
# loser_ioc             16399 non-null object
# loser_age             16393 non-null float64
# loser_rank            15604 non-null float64
# loser_rank_points     15604 non-null float64
# score                 16399 non-null object
# best_of               16399 non-null int64
# round                 16399 non-null object
# minutes               14584 non-null float64

def tennis_data(years, columns=TENNIS_COLUMNS):
    data_frames = []
    for year in years:
        csv_path = os.path.join(TENNIS_DATA_PATH, 'atp_matches_' + str(year) + '.csv')
        data_frames.append(pd.read_csv(csv_path, usecols=columns))
    
    return pd.concat(data_frames)


train_data = tennis_data(TRAIN_DATA_YEARS)
test_data = tennis_data(TEST_DATA_YEARS)
train_data.score
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20331 entries, 0 to (2017-M-DC-2017-WG-M-SUI-USA-01, Davis Cup WG R1: SUI vs USA)
Data columns (total 9 columns):
winner_id      18226 non-null object
winner_hand    20323 non-null object
winner_age     20317 non-null float64
winner_rank    20181 non-null float64
loser_id       18381 non-null object
loser_hand     20316 non-null object
loser_age      20290 non-null float64
loser_rank     19964 non-null object
score          20331 non-null object
dtypes: float64(3), object(6)
memory usage: 1.6+ MB


In [44]:
from random import shuffle
from math import isnan
import re

def convert_score(score):
    winner_points = 0
    loser_points = 0
    for couple in score.split():
        points = couple.replace('(', ' ').replace(')', ' ').replace('-', ' ').split()
#         if (len(points) < 2):
#             print(score)
#             return 0
        points = list(map(int, points))

        winner_points += points[0]
        loser_points += points[1]
        if (len(points) == 3):
            if (points[2] > points[0]):
                winner_points += points[0] + 2
                loser_points += points[2]
            else:
                winner_points += points[0]
                loser_points += points[2]
    return winner_points / loser_points if loser_points > 0 else winner_points

def to_hand(hand):
    if hand == 'R': return 0
    if hand == 'L': return 1
    if hand == 'U': return 2
    return 3

def toLists(table):
    data = []
    labels = []
    
    pattern = re.compile('^[0|1|2|3|4|5|6|7|8|9|\-|\ |\(|\)]+$')
    for _, row in table.iterrows():
        row = dict(row)
        
        for key, value in row.items():
            if 'hand' in key:
                row[key] = to_hand(value)

        try:
            if (pattern.match(row['score']) is False): print(row['score'])
            if (pattern.match(row['score']) is False): continue
            score = convert_score(row['score'])
            if (score == 0): continue
        except: continue
            
        without_score = dict(row)
        without_score.pop('score')

        if (any(isnan(p) for p in without_score.values())): continue
        
        new_row = []
        winner = row['winner_id']
        loser = row['loser_id']
        
        players = [winner, loser]
        if (score < 1):
            players = [loser, winner]
        shuffle(players)
        first, second = players

        winner_props = [value for key, value in row.items() if 'winner' in key]
        loser_props = [value for key, value in row.items() if 'loser' in key]
        
        if first != winner:
            labels.append(1 / score)
            new_row += loser_props
            new_row += winner_props
        else:
            labels.append(score)
            new_row += winner_props
            new_row += loser_props
        
        
        for key, value in without_score.items():
            if 'winner' not in key and 'loser' not in key:
                new_row.append(value)
                
        data.append(new_row)
    
    return data, labels

toLists(train_data)
tuned_train_data, tuned_train_labels = toLists(train_data)
len(tuned_train_data) / len(train_data)

0.8256849146623383

In [45]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(tuned_train_data, tuned_train_labels)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [46]:
tuned_test_data, tuned_test_labels = toLists(test_data)

def scores_to_bools(scores):
    result = []
    for score in scores:
        result.append(score > 1)
    
    return result

predictions = forest_reg.predict(tuned_test_data)
bool_predictions = scores_to_bools(predictions)

bool_labels = scores_to_bools(tuned_test_labels)

number_of_good_predictions = 0
for prediction, label in zip(bool_predictions, bool_labels):
    if prediction == label:
        number_of_good_predictions += 1

number_of_good_predictions / len(tuned_test_labels)

0.6109707686755684