# Title: Determining the Current Rank of Tennis Players
----------------------------------------------------------------
## Introduction:
Global player rankings provide interesting and valuable insights into professional tennis performances. Players earn points based on their top 18 performances of the past 52 weeks at events hosted by one of the major governing bodies of the sport. Top performers earn significant prize money at major tournaments.

In our project, we investigated the following question: Can the current rank of professional tennis players be predicted through factors such as best rank achieved throughout their career, total prize money earned, and seasons played at the professional level? This provided an interesting sports analytics question, and the results can potentially help in predicting future player rankings, which has economic and entertainment value.

The dataset we used is “Player Stats for Top 500 Players” from ultimatetennisstatistics.com. This dataset compiles individual player data for the 500 highest ranked professional tennis players, including a variety of different categories.


## Methods and Results:



In [1]:
library(tidyverse)
library(dbplyr)
library(DBI)
library(repr)
library(tidymodels)
library(GGally)
library(ISLR)
# Cap the number of rows the notebook prints when a table is displayed.
options(repr.matrix.max.rows = 6)
# Fix the random seed so that steps relying on random numbers are reproducible.
set.seed(2000)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.6     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.8     [32m✔[39m [34mdplyr  [39m 1.0.9
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1

“package ‘ggplot2’ was built under R version 4.1.3”
“package ‘tidyr’ was built under R version 4.1.2”
“package ‘readr’ was built under R version 4.1.2”
“package ‘dplyr’ was built under R version 4.1.3”
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

“package ‘dbplyr’ was built under R version 4.1.3”

Attaching package: ‘dbplyr’


The following objects are masked from ‘package:dplyr’:

    ident, sql



ERROR: Error in library(GGally): there is no package called ‘GGally’


In [None]:
## load data
# Read the raw player statistics CSV straight from its download URL into a
# tibble. Column names are kept as-is here and cleaned in a later step.
player_stat <- read_csv("https://drive.google.com/uc?export=download&id=1_MECmUXZuuILYeEOfonSGqodW6qVdhsS")

In [None]:
# clean data
# Turn the raw table into numeric, analysis-ready columns.

# Replace spaces in column names with underscores so the columns can be
# referenced without backticks.
colnames(player_stat) <- gsub(" ", "_", colnames(player_stat))

# Split three combined columns on spaces into two pieces each:
#   "Age"          -> Age and Date_Of_Birth
#   "Current_Rank" -> Current_Rank and Points
#   "Best_Rank"    -> Best_Rank and Date_Of_Best_Rank
# convert = TRUE lets separate() re-type the new columns where possible.
player_clean <- player_stat |>
  separate(
    col = Age, into = c("Age", "Date_Of_Birth"),
    sep = " ", convert = TRUE
  ) |>
  separate(
    col = Current_Rank, into = c("Current_Rank", "Points"),
    sep = " ", convert = TRUE
  ) |>
  separate(
    col = Best_Rank, into = c("Best_Rank", "Date_Of_Best_Rank"),
    sep = " ", convert = TRUE
  )

# Convert Seasons and Prize_Money to numeric.
# For Prize_Money, keep only the FIRST run of digits and thousands separators
# before stripping the commas. Removing every non-digit character would glue
# digits from any trailing text onto the amount (e.g. an ordinal such as
# "24th" in an annotation would be appended to the prize money).
# NOTE(review): assumes amounts are written with "," as the thousands
# separator and no decimal part -- confirm against the raw column.
player_clean <- player_clean |>
  mutate(
    Seasons = as.numeric(Seasons),
    Prize_Money = Prize_Money |>
      as.character() |>
      str_extract("[0-9][0-9,]*") |>
      str_remove_all(",") |>
      as.numeric()
  )

# player_data: keep only the response (Current_Rank) and candidate predictors.
player_data <- player_clean |>
  select(Age, Seasons, Current_Rank, Best_Rank, Prize_Money)

head(player_data)

In [None]:
# Numeric summary of the predictor variables.
# For each predictor: its mean (ignoring missing values) and how many values
# are missing. These numbers feed the predictor summary table.

# Age
age_mean <- player_data |> pull(Age) |> mean(na.rm = TRUE)
age_missing <- player_data |> pull(Age) |> is.na() |> sum()

# Seasons
seasons_mean <- player_data |> pull(Seasons) |> mean(na.rm = TRUE)
seasons_missing <- player_data |> pull(Seasons) |> is.na() |> sum()

# Best rank
best_rank_mean <- player_data |> pull(Best_Rank) |> mean(na.rm = TRUE)
best_rank_missing <- player_data |> pull(Best_Rank) |> is.na() |> sum()

# Prize money
prize_money_mean <- player_data |> pull(Prize_Money) |> mean(na.rm = TRUE)
prize_money_missing <- player_data |> pull(Prize_Money) |> is.na() |> sum()

# Total number of rows (players) in the data.
total_row <- player_data |> nrow()

*Table 1: Predictor variable characteristics determined using the full dataset (before the train/test split)*
<div style="float: left">
    
| PREDICTOR VARIABLE | AGE   | SEASONS | BEST RANK | PRIZE MONEY |
|--------------------|-------|---------|-----------|-------------|
| MEAN               | 25.97 | 6.49    | 178.2     | 210         |
| NUMBER OF ROWS     | 500   | 500     | 500       | 500         |
| MISSING ROWS       | 1     | 126     | 1         | 81          |

</div>


In [None]:
# Visualization: pairwise plots of every variable against every other.

# Drop every row that has a missing value in any column, so the plots and
# all later modelling steps work on complete cases only.
player_data <- na.omit(player_data)

# Build the pairs plot for all remaining columns.
player_data_plot <- ggpairs(player_data)

player_data_plot

In [None]:
# Split the data into a training set (75%) and a test set (25%) so that the
# model's accuracy is not overestimated by evaluating on data it has seen.
# The split is stratified by Current_Rank, the variable we are trying to
# predict.

player_split <- player_data |>
  initial_split(prop = 0.75, strata = Current_Rank)

player_train <- player_split |> training()
player_test <- player_split |> testing()

In [None]:
# Recipe and model.

# Preprocessing recipe: predict Current_Rank from the four predictors and
# standardize every predictor (scale, then center), so that no predictor
# dominates the distance calculation because of its units.
# The recipe is defined on the training data only.
player_recipe <- recipe(
  Current_Rank ~ Age + Seasons + Best_Rank + Prize_Money,
  data = player_train
) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

# K-nearest-neighbours regression; the number of neighbours is left as a
# tuning parameter. "rectangular" gives every neighbour equal weight.
player_spec <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = tune()
) |>
  set_engine("kknn") |>
  set_mode("regression")

# 5-fold cross-validation object on the training data, stratified by the
# response.
player_vfold <- vfold_cv(player_train, v = 5, strata = Current_Rank)

# Bundle recipe and model specification into a single workflow.
player_wkflw <- workflow() |>
  add_recipe(player_recipe) |>
  add_model(player_spec)

In [None]:
# 5-fold cross-validation to choose the number of neighbours with the
# smallest estimated RMSPE, plus a plot of RMSPE against K.

# Candidate values of K.
gridvals <- tibble(neighbors = seq(1, 200))

# Tune once and keep the cross-validated RMSE for every candidate K, so the
# same results can be used both for the plot and for picking the best K.
player_results <- player_wkflw |>
  tune_grid(resamples = player_vfold, grid = gridvals) |>
  collect_metrics() |>
  filter(.metric == "rmse")

# Row for the K with the smallest mean RMSE. with_ties = FALSE guarantees
# exactly one row, so player_k is always a single number even when several
# K values share the minimum.
player_multi <- player_results |>
  slice_min(mean, n = 1, with_ties = FALSE)

player_k <- player_multi |>
  pull(neighbors)

player_multi

# Cross-validated RMSPE as a function of K. This is a regression problem, so
# the quantity plotted is prediction error rather than accuracy.
rmspe_k_plot <- player_results |>
  ggplot(aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  labs(x = "Value of K", y = "Estimated RMSPE (5-fold cross-validation)") +
  ggtitle("Plot of Estimated RMSPE versus K")
rmspe_k_plot

In [None]:
# The previous step stored the K with the smallest RMSPE in player_k
# (k = 14 in the original run). Re-train the model with that K on the full
# training set and use it to predict the test data.

# Model specification with the number of neighbours fixed at player_k.
player_spec <- nearest_neighbor(
  neighbors = player_k,
  weight_func = "rectangular"
) |>
  set_engine("kknn") |>
  set_mode("regression")

# Fit recipe + model on the training data.
knn_fit <- workflow() |>
  add_model(player_spec) |>
  add_recipe(player_recipe) |>
  fit(data = player_train)

# Predictions for the test set, placed next to the observed test rows.
knn_preds <- bind_cols(
  predict(knn_fit, player_test),
  player_test
)

# Test-set error: keep only the RMSE row of the regression metrics.
knn_mets <- knn_preds |>
  metrics(truth = Current_Rank, estimate = .pred) |>
  filter(.metric == "rmse")

knn_mets

In [None]:
# Predicted vs. actual rank on the test set.
# A confusion matrix only applies to classification models; this analysis is
# a regression, so the test-set predictions in knn_preds are compared with
# the observed Current_Rank instead. Points on the dashed line are perfect
# predictions.

pred_vs_actual_plot <- knn_preds |>
  ggplot(aes(x = Current_Rank, y = .pred)) +
  geom_point(alpha = 0.5) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(x = "Actual current rank", y = "Predicted current rank") +
  ggtitle("Predicted versus Actual Current Rank (Test Set)")
pred_vs_actual_plot