In [1]:
%pylab inline
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the 2018 results workbook into `df`; the cells below read from it.
df = pd.read_excel('../data/2018.xlsx')

In [3]:
!cat ../data/notes.txt

Notes for Tennis Data

All data is in csv format, ready for use within standard spreadsheet applications. 

Key to results data:

ATP = Tournament number (men)
WTA = Tournament number (women)
Location = Venue of tournament
Tournament = Name of tournament (including sponsor if relevant)
Date = Date of match (note: prior to 2003 the date shown for all matches played in a single tournament is the start date)
Series = Name of ATP tennis series (Grand Slam, Masters, International or International Gold)
Tier = Tier (tournament ranking) of WTA tennis series.
Court = Type of court (outdoors or indoors)
Surface = Type of surface (clay, hard, carpet or grass)
Round = Round of match
Best of = Maximum number of sets playable in match
Winner = Match winner
Loser = Match loser
WRank = ATP Entry ranking of the match winner as of the start of the tournament
LRank = ATP Entry ranking of the match loser as of the start of the tournament
WPts = ATP Entry points of the match winner as of the start of the tournament

In [4]:
# Show the column labels available in the loaded results.
df.columns

Index(['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface',
       'Round', 'Best of', 'Winner', 'Loser', 'WRank', 'LRank', 'WPts', 'LPts',
       'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets',
       'Lsets', 'Comment', 'B365W', 'B365L', 'EXW', 'EXL', 'LBW', 'LBL', 'PSW',
       'PSL', 'MaxW', 'MaxL', 'AvgW', 'AvgL'],
      dtype='object')

In [5]:
def get_win_loss(row, player):
    """Encode the outcome of one match from ``player``'s point of view.

    Args:
        row: Mapping-like match record exposing a ``'Winner'`` entry.
        player: Value compared against ``row['Winner']``.

    Returns:
        ``1`` when ``row['Winner']`` equals ``player``, otherwise ``0``.
    """
    return 1 if row['Winner'] == player else 0

def get_ranks(row, player):
    """Return ``(player's rank, opponent's rank)`` for one match record.

    Args:
        row: Mapping-like match record with ``'Winner'``, ``'WRank'`` and
            ``'LRank'`` entries.
        player: Value compared against ``row['Winner']``.

    Returns:
        ``(row['WRank'], row['LRank'])`` when ``player`` is the winner,
        otherwise ``(row['LRank'], row['WRank'])``. No check is made that
        ``player`` is actually the loser in the second case.
    """
    if row['Winner'] == player:
        own_key, opponent_key = 'WRank', 'LRank'
    else:
        own_key, opponent_key = 'LRank', 'WRank'
    return row[own_key], row[opponent_key]

def is_winner(row, player):
    """Tell whether ``player`` is the winner recorded in ``row``.

    NOTE(review): a later cell defines ``is_winner(player, frames)`` with a
    different signature; once that cell runs, this definition is replaced.

    Args:
        row: Mapping-like match record exposing a ``'Winner'`` entry.
        player: Value compared against ``row['Winner']``.

    Returns:
        The result of ``row['Winner'] == player``.
    """
    recorded_winner = row['Winner']
    return recorded_winner == player

def get_court(surface):
    """Map a surface name to its integer code.

    Despite the name, the argument is the playing surface, not the
    indoor/outdoor court type.

    Args:
        surface: One of ``'Clay'``, ``'Hard'``, ``'Grass'`` or ``'Carpet'``.

    Returns:
        ``1`` for clay, ``2`` for hard, ``3`` for grass, ``4`` for carpet.

    Raises:
        RuntimeError: If ``surface`` is none of the four known names.
    """
    surface_codes = (('Clay', 1), ('Hard', 2), ('Grass', 3), ('Carpet', 4))
    for known_surface, code in surface_codes:
        if surface == known_surface:
            return code
    raise RuntimeError('Unrecognised surface: ' + str(surface))

In [25]:
def get_recent_matches(df, player, date, n=3):
    """Return the ``n`` most recent matches ``player`` played before ``date``.

    Args:
        df: DataFrame with ``'Winner'``, ``'Loser'`` and ``'Date'`` columns.
        player: Name matched against both ``'Winner'`` and ``'Loser'``.
        date: Cut-off; only rows whose ``'Date'`` is strictly earlier are kept.
        n: Number of matches to return. Defaults to 3.

    Returns:
        DataFrame of exactly ``n`` rows of ``df``, sorted by ``'Date'`` with
        the newest match first.

    Raises:
        RuntimeError: If fewer than ``n`` matches involving ``player`` exist
            before ``date``.
    """
    involved = (df['Winner'] == player) | (df['Loser'] == player)
    earlier = df[involved & (df['Date'] < date)]
    recent = earlier.sort_values('Date', ascending=False).iloc[:n]
    if len(recent) != n:
        # Callers catch RuntimeError to skip matches without enough history;
        # the message makes the reason visible when it is not caught.
        raise RuntimeError(
            'Need {} matches for {!r} before {}, found {}'.format(
                n, player, date, len(recent)))
    return recent

In [26]:
def is_winner(player, frames):
    """Flag, match by match, whether ``player`` won.

    Running this cell replaces the earlier two-argument
    ``is_winner(row, player)`` defined above.

    Args:
        player: Value compared against each row's ``'Winner'`` entry.
        frames: DataFrame of matches with a ``'Winner'`` column.

    Returns:
        List with one entry per row of ``frames``, in row order: ``1`` where
        ``player`` is the winner, ``0`` otherwise.
    """
    return [
        1 if match['Winner'] == player else 0
        for _, match in frames.iterrows()
    ]

In [27]:
def get_games(player):
    """Placeholder with no implementation yet; always returns ``None``.

    Args:
        player: Currently ignored.
    """
    return None

In [32]:
def get_historical_data(player, framelist):
    """Flatten a player's past matches into a flat feature list.

    For every row of ``framelist`` three integers are appended, in order:
    the surface code from ``get_court``, ``player``'s own ranking in that
    match, and the opponent's ranking. The rankings are oriented from
    ``player``'s perspective (own first, opponent second) regardless of
    whether ``player`` won or lost the match.

    Args:
        player: Name compared against each row's ``'Winner'`` entry to decide
            which of ``'WRank'`` / ``'LRank'`` belongs to the player.
        framelist: DataFrame of matches with ``'Winner'``, ``'Surface'``,
            ``'WRank'`` and ``'LRank'`` columns.

    Returns:
        List of ``3 * len(framelist)`` integers, in row order.

    Raises:
        RuntimeError: From ``get_court`` when a surface is not recognised.
        ValueError: When a ranking cannot be converted to ``int`` (e.g. NaN).
    """
    features = []
    for _, match in framelist.iterrows():
        # Pick the rank columns so that the player's own rank comes first.
        if match['Winner'] == player:
            own_rank, opponent_rank = match['WRank'], match['LRank']
        else:
            own_rank, opponent_rank = match['LRank'], match['WRank']
        features += [get_court(match['Surface']), int(own_rank), int(opponent_rank)]
    return features

In [34]:
# Build one training example per match: both players' current ranking and
# points, the match surface, and each player's three previous matches
# (surface code, two rankings, win flag). The target is taken from the
# W1/L1/W2/L2 columns (presumably games won in the first two sets -- confirm
# against notes.txt).
X = df[['Winner', 'Loser', 'WRank', 'LRank', 'Surface', 'WPts', 'LPts', 'Date', 'W1', 'W2', 'L1', 'L2']].copy()

# Kept in the namespace for interactive inspection; not needed for the loop below.
players = unique(df[['Winner', 'Loser']])

FORM_COLUMNS = ['P1Rank', 'P2Rank', 'P1Pts', 'P2Pts', 'Surface',
                'AP1Surface', 'AP1Rank', 'AP1OppRank', 'BP1Surface', 'BP1Rank', 'BP1OppRank',
                'CP1Surface', 'CP1Rank', 'CP1OppRank', 'AP2Surface', 'AP2Rank', 'AP2OppRank',
                'BP2Surface', 'BP2Rank', 'BP2OppRank', 'CP2Surface', 'CP2Rank', 'CP2OppRank',
                'AP1Win', 'BP1Win', 'CP1Win', 'AP2Win', 'BP2Win', 'CP2Win']

# Dedicated seeded generator so the P1/P2 assignment is identical on every
# re-run and the global NumPy random state is left alone.
rng = random.RandomState(0)

rows = []
t = []

# Visit every match exactly once, oldest first. Iterating per player would
# visit each match twice (once for each participant) and duplicate rows.
for _, row in X.sort_values('Date').iterrows():
    # The try block is scoped to a single match so that one unusable match
    # (too little history, missing rank/points, unknown surface) is skipped
    # without discarding any other match.
    try:
        win_hist_frames = get_recent_matches(X, row['Winner'], row['Date'])
        loss_hist_frames = get_recent_matches(X, row['Loser'], row['Date'])

        # Randomly decide which side becomes P1 so the winner does not
        # always occupy the same feature slots.
        p1_side, p2_side = ('W', 'L') if rng.rand() < 0.5 else ('L', 'W')
        sides = {'W': ('Winner', win_hist_frames), 'L': ('Loser', loss_hist_frames)}
        p1_name_col, p1_frames = sides[p1_side]
        p2_name_col, p2_frames = sides[p2_side]

        # Target order: P1 set 1, P2 set 1, P1 set 2, P2 set 2.
        targ = [row[p1_side + '1'], row[p2_side + '1'],
                row[p1_side + '2'], row[p2_side + '2']]
        if isnan(targ).any():
            continue

        # int() raises ValueError on NaN, which drops matches with missing
        # rank or points via the except clause below.
        formrow = [
            int(row[p1_side + 'Rank']), int(row[p2_side + 'Rank']),
            int(row[p1_side + 'Pts']), int(row[p2_side + 'Pts']),
            get_court(row['Surface']),
            *get_historical_data(row[p1_name_col], p1_frames),
            *get_historical_data(row[p2_name_col], p2_frames),
            *is_winner(row[p1_name_col], p1_frames),
            *is_winner(row[p2_name_col], p2_frames),
        ]
    except (RuntimeError, ValueError):
        continue

    rows.append(formrow)
    t.append(targ)

# Build the frame once from the collected rows (row labels 0..n-1) instead of
# growing it one row at a time.
form = pd.DataFrame(rows, columns=FORM_COLUMNS)

In [35]:
# Persist the feature table; the DataFrame index is written as the first CSV column.
form.to_csv('../data/reg_inputs.csv')

In [36]:
# Turn the list of per-match targets into a NumPy array so it can be saved with savetxt.
t = array(t)

In [37]:
# Write the targets as comma-separated text, one line per entry of `t`.
savetxt('../data/reg_targets.csv', t, delimiter=',')