In [1]:
import random
import os
import pandas as pd
import re

In [2]:
homedir = os.path.expanduser("~")
tennis_atp = os.path.join(homedir, "Documents", "GitHub", "tennis_atp")

In [3]:
csv_list = []
for file in os.listdir(tennis_atp):
    if re.search("^atp_matches_[0-9]", file):
        path = os.path.join(tennis_atp, file)
        temp = pd.read_csv(path)
        csv_list.append(temp)
        
df = pd.concat(csv_list)
df.index = range(len(df))
df[["year", "month", "day"]] = pd.DataFrame(df["tourney_date"].apply(lambda x: [int(str(x)[:4]), int(str(x)[5:7]), int(str(x)[7:])]).tolist(), columns = ["year", "month", "day"])
df = df.dropna(subset = ["w_1stIn"])


In [4]:
winners = [n for n in df.columns if any(x in n for x in ["w_", "winner_"]) and "draw" not in n]
losers = [n for n in df.columns if any(x in n for x in ["l_", "loser_"])]

wplayer = ["_".join(["player"] + n.split("_")[1:]) for n in winners]
woppo = ["_".join(["opponent"] + n.split("_")[1:]) for n in winners]

lplayer = ["_".join(["player"] + n.split("_")[1:]) for n in losers]
loppo = ["_".join(["opponent"] + n.split("_")[1:]) for n in losers]

wdict = {}
ldict = {}
for i in range(len(winners)):
    wdict[winners[i]] = wplayer[i]
    wdict[losers[i]] = loppo[i]
    
    ldict[losers[i]] = lplayer[i]
    ldict[winners[i]] = woppo[i]

In [5]:
wdf = df.rename(columns = wdict)
ldf = df.rename(columns = ldict)

pdf = pd.concat([wdf, ldf])
pdf.sort_values("tourney_date")

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,player_id,player_seed,player_entry,...,opponent_SvGms,opponent_bpSaved,opponent_bpFaced,player_rank,player_rank_points,opponent_rank,opponent_rank_points,year,month,day
81064,1991-339,Adelaide,Hard,32,A,19901231,1,101723,,,...,16.0,6.0,8.0,56.0,,2.0,,1990,23,1
81091,1991-339,Adelaide,Hard,32,A,19901231,28,101441,,Q,...,11.0,0.0,1.0,111.0,,25.0,,1990,23,1
81090,1991-339,Adelaide,Hard,32,A,19901231,27,101061,,,...,14.0,2.0,4.0,60.0,,42.0,,1990,23,1
81089,1991-339,Adelaide,Hard,32,A,19901231,26,102148,,,...,7.0,1.0,3.0,62.0,,51.0,,1990,23,1
81088,1991-339,Adelaide,Hard,32,A,19901231,25,101234,,,...,15.0,1.0,2.0,82.0,,56.0,,1990,23,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176888,2020-M-DC-2020-WG1-PO-MEX-FIN-01,Davis Cup WG1 PO: MEX vs FIN,Clay,4,D,20200306,1,126609,,,...,10.0,4.0,6.0,418.0,81.0,821.0,17.0,2020,30,6
176889,2020-M-DC-2020-WG1-PO-MEX-FIN-01,Davis Cup WG1 PO: MEX vs FIN,Clay,4,D,20200306,2,126204,,,...,16.0,15.0,20.0,522.0,54.0,719.0,24.0,2020,30,6
176890,2020-M-DC-2020-WG1-PO-MEX-FIN-01,Davis Cup WG1 PO: MEX vs FIN,Clay,4,D,20200306,4,126204,,,...,9.0,5.0,10.0,522.0,54.0,605.0,37.0,2020,30,6
176884,2020-M-DC-2020-WG1-PO-LBN-THA-01,Davis Cup WG1 PO: LBN vs THA,Clay,4,D,20200306,4,133975,,,...,7.0,4.0,9.0,348.0,109.0,623.0,35.0,2020,30,6


In [12]:
gdf = pdf.groupby(["player_name", "surface", "year"]).sum()

In [45]:
gdf["player_1stIn%"] = gdf["player_1stIn"]/gdf["player_svpt"]
gdf["player_1stWon%"] = gdf["player_1stWon"]/gdf["player_1stIn"]
gdf["player_2ndIn"] = gdf["player_svpt"] - gdf["player_1stIn"] - gdf["player_df"]
gdf["player_2ndIn%"] = gdf["player_2ndIn"]/(gdf["player_svpt"] - gdf["player_1stIn"])
gdf["player_2ndWon%"] = gdf["player_2ndWon"]/gdf["player_2ndIn"]
gdf["player_1stReturn%"] = 1 - gdf["opponent_1stWon"]/gdf["opponent_1stIn"]
gdf["opponent_2ndIn"] = gdf["opponent_svpt"] - gdf["opponent_1stIn"] - gdf["opponent_df"]
gdf["player_2ndReturn%"] = 1 - (gdf["opponent_2ndWon"]/(gdf["opponent_svpt"] - gdf["opponent_1stIn"]))

In [306]:
def agg(a, b):
    return (1 + a - b)/2

def zero_one():
    return 0 if random.random() < 0.5 else 1

In [506]:
class Player:
    def __init__(self, player_name, surface, year, df):
        self.player_name = player_name
        self.surface = surface
        self.year = year
        self.df = df
        
        self.points = 0
        self.games = 0
        self.sets = 0
        
    def stats(self):
        cols = "player_1stIn% player_1stWon% player_2ndIn% player_2ndWon% player_1stReturn% player_2ndReturn%".split()
        row = self.df.loc[(self.player_name, self.surface, self.year), cols]
        return row


In [636]:
class Match:
    def __init__(self, player1, player1year, player2, player2year, surface, sets, lastset_tb, df):
        self.player1 = player1
        self.player1year = player1year
        self.player2 = player2
        self.player2year = player2year
        self.surface = surface
        self.sets = sets
        self.lastset_tb = lastset_tb
        self.df = df
        
        self.score = ""
        
        self.p1 = Player(self.player1, self.surface, self.player1year, self.df)
        self.p2 = Player(self.player2, self.surface, self.player2year, self.df)
    
        self.p1stats = self.p1.stats()
        self.p2stats = self.p2.stats()
        
        self.current_server = [self.player1, self.player2][zero_one()]
        self.current_set = 1
                     
    def point(self, server_row, return_row):
        if random.random() <= server_row["player_1stIn%"]:
            p = agg(server_row["player_1stWon%"], return_row["player_1stReturn%"])
            if random.random() <= p:
                return [1, 0]
            else:
                return [0, 1]

        else:
            if random.random() <= server_row["player_2ndIn%"]:
                p = agg(server_row["player_2ndWon%"], return_row["player_2ndReturn%"])
                if random.random() <= p:
                    return [1, 0]
                else:
                    return [0, 1]
            else:
                return [0, 1]
            
    def play_point(self, server):
        if server == self.player1:
            point = self.point(self.p1stats, self.p2stats)
        else:
            point = [1 - n for n in self.point(self.p2stats, self.p1stats)]
        
        self.p1.points += point[0]
        self.p2.points += point[1]
        
    def play_game(self, server):
        while abs(self.p1.points - self.p2.points) < 2 or all([self.p1.points < 4, self.p2.points < 4]):
            self.play_point(server)
            
        if self.p1.points > self.p2.points:
            self.p1.games += 1
        else:
            self.p2.games += 1
            
        self.p1.points = 0
        self.p2.points = 0
        
    def serve_change(self):
        players = [self.player1, self.player2]
        cs_index = players.index(self.current_server)
        self.current_server = players[1 - cs_index]
        
    def play_tb(self):
        players = [self.player1, self.player2]
        cs = self.current_server
        cs_index = players.index(cs)
        
        i = 0
        while all([self.p1.points < 7, self.p2.points < 7]) or (abs(self.p1.points - self.p2.points) < 2):
            self.play_point(self.current_server)
            i += 1
            if i%2 == 1:
                self.serve_change()
        
        if self.p1.points > self.p2.points:
            self.p1.games += 1
        else:
            self.p2.games += 1
            
        self.current_server = players[1 - cs_index]
            
    def play_set(self):
        while all([self.p1.games < 6, self.p2.games < 6]) or (abs(self.p1.games - self.p2.games) < 2):
            self.play_game(self.current_server)
            self.serve_change()
            
            if self.p1.games == self.p2.games == 6:
                self.play_tb()
                print(self.p1.points, self.p2.points)
                break   
                
        if self.p1.games > self.p2.games:
            self.p1.sets += 1
        else:
            self.p2.sets += 1
            
    def play_match(self):
        

In [643]:
match = Match("Marin Cilic", 2012, "Novak Djokovic", 2016, "Hard", 5, True, gdf)
match.play_set()
print(match.p1.games, match.p2.games)
print(match.p1.sets, match.p2.sets)

6 2
1 0
