In [21]:
library(tidyverse)
library(repr)
library(tidymodels)
library(rvest)
library(stringr)
library(DBI)
library(dbplyr)
options(repr.matrix.max.rows = 6)

In [22]:
# Download the ATP match table (CSV hosted on Google Drive) and show its
# first rows as a quick sanity check of the parsed columns.
atp_data_frame <-
    "https://drive.google.com/uc?export=download&id=1fOQ8sy_qMkQiQEAO6uFdRX4tLI8EpSTn" %>%
    read_csv()
atp_data_frame %>%
    head()

“Missing column names filled in: 'X1' [1]”
Parsed with column specification:
cols(
  .default = col_double(),
  tourney_id = col_character(),
  tourney_name = col_character(),
  surface = col_character(),
  tourney_level = col_character(),
  winner_seed = col_character(),
  winner_entry = col_character(),
  winner_name = col_character(),
  winner_hand = col_character(),
  winner_ioc = col_character(),
  loser_seed = col_character(),
  loser_entry = col_character(),
  loser_name = col_character(),
  loser_hand = col_character(),
  loser_ioc = col_character(),
  score = col_character(),
  round = col_character()
)

See spec(...) for full column specifications.



X1,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,⋯,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
<dbl>,<chr>,<chr>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0,2019-M020,Brisbane,Hard,32,A,20181231,300,105453,2.0,⋯,54,34,20,14,10,15,9,3590,16,1977
1,2019-M020,Brisbane,Hard,32,A,20181231,299,106421,4.0,⋯,52,36,7,10,10,13,16,1977,239,200
2,2019-M020,Brisbane,Hard,32,A,20181231,298,105453,2.0,⋯,27,15,6,8,1,5,9,3590,40,1050
3,2019-M020,Brisbane,Hard,32,A,20181231,297,104542,,⋯,60,38,9,11,4,6,239,200,31,1298
4,2019-M020,Brisbane,Hard,32,A,20181231,296,106421,4.0,⋯,56,46,19,15,2,4,16,1977,18,1855
5,2019-M020,Brisbane,Hard,32,A,20181231,295,104871,,⋯,54,40,18,15,6,9,40,1050,185,275


In [23]:
# List every column name of the loaded match table.
atp_data_frame %>%
    colnames()

In [56]:
# Per-player aggregates over the matches each player WON (one row per
# winner_id). Each ratio is computed per match and then averaged across
# that player's wins.
#
# All means use na.rm = TRUE so that a single missing value in one match
# does not turn the whole aggregate into NA and get the player removed by
# drop_na() below; only players whose aggregate is still missing (no usable
# value in any won match) are dropped.
player_wins <- atp_data_frame %>%
    group_by(player_id = winner_id) %>%
    summarize(
        w_height = mean(winner_ht, na.rm = TRUE),
        # 0 / 0 (no break points faced) yields NaN, which na.rm also skips.
        w_breakpoint_saved_pct = mean(w_bpSaved / w_bpFaced, na.rm = TRUE),
        # NOTE(review): the denominator is w_svpt, not a second-serve-only
        # count, so this is a share of w_svpt -- confirm intended definition.
        w_second_serve_win_pct = mean(w_2ndWon / w_svpt, na.rm = TRUE),
        w_df_pct = mean(w_df / w_svpt, na.rm = TRUE),
        # Computed as w_1stWon / w_1stIn.
        w_first_serve_pct = mean(w_1stWon / w_1stIn, na.rm = TRUE),
        n_wins = n(),
        mean_age_w = mean(winner_age, na.rm = TRUE),
        mean_rank_points_w = mean(winner_rank_points, na.rm = TRUE),
        # Return an ungrouped tibble explicitly instead of relying on the
        # default (and its informational message).
        .groups = "drop"
    ) %>%
    drop_na() %>%
    # Store the id as text so it is treated as a label, not a number.
    mutate(player_id = as.character(player_id))
player_wins


# Per-player aggregates over the matches each player LOST (one row per
# loser_id). Each ratio is computed per match and then averaged across
# that player's losses.
#
# All means use na.rm = TRUE so that a single missing value in one match
# does not turn the whole aggregate into NA and get the player removed by
# drop_na() below; only players whose aggregate is still missing (no usable
# value in any lost match) are dropped.
player_lose <- atp_data_frame %>%
    group_by(player_id = loser_id) %>%
    summarize(
        l_height = mean(loser_ht, na.rm = TRUE),
        # 0 / 0 (no break points faced) yields NaN, which na.rm also skips.
        l_breakpoint_saved_pct = mean(l_bpSaved / l_bpFaced, na.rm = TRUE),
        # NOTE(review): the denominator is l_svpt, not a second-serve-only
        # count, so this is a share of l_svpt -- confirm intended definition.
        l_second_serve_win_pct = mean(l_2ndWon / l_svpt, na.rm = TRUE),
        l_df_pct = mean(l_df / l_svpt, na.rm = TRUE),
        # Computed as l_1stWon / l_1stIn.
        l_first_serve_pct = mean(l_1stWon / l_1stIn, na.rm = TRUE),
        n_lose = n(),
        mean_age_l = mean(loser_age, na.rm = TRUE),
        mean_rank_points_l = mean(loser_rank_points, na.rm = TRUE),
        # Return an ungrouped tibble explicitly instead of relying on the
        # default (and its informational message).
        .groups = "drop"
    ) %>%
    drop_na() %>%
    # Store the id as text so it is treated as a label, not a number.
    mutate(player_id = as.character(player_id))
player_lose

# Attach each player's loss-side aggregates to their win-side aggregates.
# The join key is named explicitly so it cannot silently change if the two
# tables ever gain another column with a shared name. Being a left join,
# every row of player_wins is kept; a player with no row in player_lose gets
# NA in all loss-side columns, and players present only in player_lose are
# not included.
player_join <- left_join(player_wins, player_lose, by = "player_id")
player_join

# Collapse the win-side and loss-side aggregates into one row per player.
# Every combined statistic is the plain midpoint of the two sides' averages
# (it is not weighted by n_wins / n_lose); win_rate is wins over all matches.
# NOTE(review): midpoint vs. match-weighted mean -- confirm which is intended.
player_carrer <- player_join %>%
    transmute(
        player_id,
        height = (w_height + l_height) / 2,
        breakpoint_saved_pct =
            (w_breakpoint_saved_pct + l_breakpoint_saved_pct) / 2,
        second_serve_win_pct =
            (w_second_serve_win_pct + l_second_serve_win_pct) / 2,
        df_pct = (w_df_pct + l_df_pct) / 2,
        first_serve_pct = (w_first_serve_pct + l_first_serve_pct) / 2,
        win_rate = n_wins / (n_wins + n_lose),
        age = (mean_age_w + mean_age_l) / 2,
        mean_rank_points = (mean_rank_points_w + mean_rank_points_l) / 2
    )
player_carrer

`summarise()` ungrouping output (override with `.groups` argument)



player_id,w_height,w_breakpoint_saved_pct,w_second_serve_win_pct,w_df_pct,w_first_serve_pct,n_wins,mean_age_w,mean_rank_points_w
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>
100644,198,0.6429829,0.1935299,0.04309366,0.7869635,138,21.07961,4251.8116
103333,208,0.7473333,0.2031084,0.04794784,0.8490398,32,39.09523,979.5625
103819,185,0.7275885,0.2311068,0.02211705,0.8143292,133,36.95220,6602.7820
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
106296,183,0.6897436,0.1900810,0.04093659,0.7681642,6,25.38809,494.500
106298,185,0.7089537,0.2340468,0.03746156,0.7807671,79,24.09458,1974.139
106401,193,0.7573701,0.1997221,0.04685663,0.8356655,74,22.99025,1732.230


`summarise()` ungrouping output (override with `.groups` argument)



player_id,l_height,l_breakpoint_saved_pct,l_second_serve_win_pct,l_df_pct,l_first_serve_pct,n_lose,mean_age_l,mean_rank_points_l
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>
100644,198,0.4754650,0.1466021,0.05984107,0.6833523,61,21.23500,4311.4754
103333,208,0.6574816,0.1837921,0.06141021,0.8183123,46,39.14645,946.1739
103819,185,0.4911165,0.1917861,0.02347908,0.7397832,25,37.22919,6837.8000
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
106296,183,0.4641359,0.1636402,0.04307512,0.5804844,12,24.85946,422.250
106298,185,0.5751084,0.2109760,0.04987012,0.6959960,57,24.31015,1854.807
106401,193,0.5388043,0.1726877,0.04724779,0.7282479,44,23.05158,1712.841


Joining, by = "player_id"



player_id,w_height,w_breakpoint_saved_pct,w_second_serve_win_pct,w_df_pct,w_first_serve_pct,n_wins,mean_age_w,mean_rank_points_w,l_height,l_breakpoint_saved_pct,l_second_serve_win_pct,l_df_pct,l_first_serve_pct,n_lose,mean_age_l,mean_rank_points_l
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>
100644,198,0.6429829,0.1935299,0.04309366,0.7869635,138,21.07961,4251.8116,198,0.4754650,0.1466021,0.05984107,0.6833523,61,21.23500,4311.4754
103333,208,0.7473333,0.2031084,0.04794784,0.8490398,32,39.09523,979.5625,208,0.6574816,0.1837921,0.06141021,0.8183123,46,39.14645,946.1739
103819,185,0.7275885,0.2311068,0.02211705,0.8143292,133,36.95220,6602.7820,185,0.4911165,0.1917861,0.02347908,0.7397832,25,37.22919,6837.8000
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
106296,183,0.6897436,0.1900810,0.04093659,0.7681642,6,25.38809,494.500,183,0.4641359,0.1636402,0.04307512,0.5804844,12,24.85946,422.250
106298,185,0.7089537,0.2340468,0.03746156,0.7807671,79,24.09458,1974.139,185,0.5751084,0.2109760,0.04987012,0.6959960,57,24.31015,1854.807
106401,193,0.7573701,0.1997221,0.04685663,0.8356655,74,22.99025,1732.230,193,0.5388043,0.1726877,0.04724779,0.7282479,44,23.05158,1712.841


player_id,height,breakpoint_saved_pct,second_serve_win_pct,df_pct,first_serve_pct,win_rate,age,mean_rank_points
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
100644,198,0.5592240,0.1700660,0.05146736,0.7351579,0.6934673,21.15730,4281.6435
103333,208,0.7024075,0.1934502,0.05467902,0.8336761,0.4102564,39.12084,962.8682
103819,185,0.6093525,0.2114465,0.02279807,0.7770562,0.8417722,37.09069,6720.2910
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
106296,183,0.5769397,0.1768606,0.04200586,0.6743243,0.3333333,25.12377,458.375
106298,185,0.6420310,0.2225114,0.04366584,0.7383816,0.5808824,24.20236,1914.473
106401,193,0.6480872,0.1862049,0.04705221,0.7819567,0.6271186,23.02092,1722.535
