In [1]:
# loading libraries

library(tidyverse)
library(testthat)
library(digest)
library(repr)
library(tidymodels)
library(GGally)
library(ISLR)
options(repr.matrix.max.rows = 6)

set.seed(20)

options(repr.matrix.max.rows = 7)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.6     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.7     [32m✔[39m [34mdplyr  [39m 1.0.9
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: ‘testthat’


The following object is masked from ‘package:dplyr’:

    matches


The following object is masked from ‘package:purrr’:

    is_null


The following objects are masked from ‘package:readr’:

    edition_get, local_edition


The following object is masked from ‘package:tidyr’:

    matches


── [1mAttaching packages[22m

In [2]:
## reading data (from https://www.ultimatetennisstatistics.com/)

tennis_data <- read_csv("https://drive.google.com/uc?export=download&id=1_MECmUXZuuILYeEOfonSGqodW6qVdhsS")
## organizing/tidying data

colnames(tennis_data) = make.names(colnames(tennis_data))
tennis <- tennis_data |>
            select(Current.Rank, Age, Height) |> # select relevant variables
            mutate(across(everything(), function(col) {gsub(" .*", "", col)})) |> # format cells
            mutate(across(Current.Rank:Height, as.numeric)) %>% filter(Current.Rank != "NA", Age != "NA", Height != "NA")

[1m[22mNew names:
[36m•[39m `` -> `...1`
[1mRows: [22m[34m500[39m [1mColumns: [22m[34m38[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (25): Age, Country, Plays, Wikipedia, Current Rank, Best Rank, Name, Bac...
[32mdbl[39m (13): ...1, Turned Pro, Seasons, Titles, Best Season, Retired, Masters, ...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [3]:
tennis_split <- initial_split(tennis, prop = 0.75, strata = Current.Rank)

tennis_training <- training(tennis_split) %>% filter
tennis_testing <- testing(tennis_split)

tennis_training

Current.Rank,Age,Height
<dbl>,<dbl>,<dbl>
45,32,198
12,32,178
27,29,180
32,29,175
⋮,⋮,⋮
459,29,178
357,27,185
313,32,190


In [4]:
#model, recipe, cross-validation 
knn_spec <- nearest_neighbor(weight_func = "rectangular",
            neighbors = tune()) |>  
            set_engine("kknn") |>  
            set_mode("regression")

tennis_age_recipe <- recipe(Current.Rank ~ Age, data = tennis_training) |>  
            step_scale(all_predictors()) |>  
            step_center(all_predictors())

tennis_age_vfold <- vfold_cv(tennis_training, v = 5, strata = Current.Rank)

tennis_age_wkflw <- workflow() |>  add_recipe(tennis_age_recipe) |>  add_model(knn_spec)

#gridvals <- tibble(neighbors = seq(1, 66))

tennis_age_results <- tennis_age_wkflw |>
             tune_grid(resamples = tennis_age_vfold, grid = 65) |>
             collect_metrics() |>
             filter(.metric == "rmse")

tennis_age_min <- tennis_age_results |>  filter(mean == min(mean))
tennis_age_min

neighbors,.metric,.estimator,mean,n,std_err,.config
<int>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
14,rmse,standard,121.8198,5,6.941103,Preprocessor1_Model14


In [5]:
#Retrain data on training set, then predict with testing

#re-train KNN regression model on the training data set
tennis_age_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 15) |>  
            set_engine("kknn") |>  
            set_mode("regression")

tennis_age_fit <- workflow() |>  
        add_recipe(tennis_age_recipe) |>  
        add_model(tennis_age_spec) |>  
        fit(data = tennis_training)

#predict rmpse with testing
tennis_age_rmspe <- tennis_age_fit |>  
            predict(tennis_testing) |>  
            bind_cols(tennis_testing) |>  
            metrics(truth = Current.Rank, estimate = .pred) |>  
            filter(.metric == 'rmse') |>
            select(.estimate) |>
            pull()
tennis_age_rmspe

In [6]:
#REDO for Predictor: Height
#model, recipe, cross-validation 

tennis_height_recipe <- recipe(Current.Rank ~ Height, data = tennis_training) |>  
            step_scale(all_predictors()) |>  
            step_center(all_predictors())

tennis_height_vfold <- vfold_cv(tennis_training, v = 5, strata = Current.Rank)

tennis_height_wkflw <- workflow() |>  add_recipe(tennis_height_recipe) |>  add_model(knn_spec)

#gridvals <- tibble(neighbors = seq(1, 66))

tennis_height_results <- tennis_height_wkflw |>
             tune_grid(resamples = tennis_height_vfold, grid = 65) |>
             collect_metrics() |>
             filter(.metric == "rmse")

tennis_height_min <- tennis_age_results |>  filter(mean == min(mean))
tennis_height_min

neighbors,.metric,.estimator,mean,n,std_err,.config
<int>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
14,rmse,standard,121.8198,5,6.941103,Preprocessor1_Model14


In [7]:
#re-train KNN regression model on the training data set
tennis_height_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 15) |>  
            set_engine("kknn") |>  
            set_mode("regression")

tennis_height_fit <- workflow() |>  
        add_recipe(tennis_height_recipe) |>  
        add_model(tennis_height_spec) |>  
        fit(data = tennis_training)

#predict rmpse with testing
tennis_height_rmspe <- tennis_height_fit |>  
            predict(tennis_testing) |>  
            bind_cols(tennis_testing) |>  
            metrics(truth = Current.Rank, estimate = .pred) |>  
            filter(.metric == 'rmse') |>
            select(.estimate) |>
            pull()
tennis_height_rmspe

In [8]:
#Now see what RMSPE is using both predictors: Age and Height

tennis_both_recipe <- recipe(Current.Rank ~ Age + Height, data = tennis_training) |>  
            step_scale(all_predictors()) |>  
            step_center(all_predictors())

tennis_both_vfold <- vfold_cv(tennis_training, v = 5, strata = Current.Rank)

tennis_both_wkflw <- workflow() |>  add_recipe(tennis_both_recipe) |>  add_model(knn_spec)

#gridvals <- tibble(neighbors = seq(1, 66))

tennis_both_results <- tennis_both_wkflw |>
             tune_grid(resamples = tennis_both_vfold, grid = 65) |>
             collect_metrics() |>
             filter(.metric == "rmse")

tennis_both_min <- tennis_both_results |>  filter(mean == min(mean))
tennis_both_min

neighbors,.metric,.estimator,mean,n,std_err,.config
<int>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
15,rmse,standard,124.4717,5,3.385288,Preprocessor1_Model15


In [9]:
#re-train KNN regression model on the training data set
tennis_both_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 15) |>  
            set_engine("kknn") |>  
            set_mode("regression")

tennis_both_fit <- workflow() |>  
        add_recipe(tennis_both_recipe) |>  
        add_model(tennis_both_spec) |>  
        fit(data = tennis_training)

#predict rmpse with testing
tennis_both_rmspe <- tennis_both_fit |>  
            predict(tennis_testing) |>  
            bind_cols(tennis_testing) |>  
            metrics(truth = Current.Rank, estimate = .pred) |>  
            filter(.metric == 'rmse') |>
            select(.estimate) |>
            pull()
tennis_both_rmspe

In [10]:
#We see that using both the age and height predictors together gives us the smallest root mean square prediction error (RMSPE). 
#Therefore, we will model the predictions of current rank using both height and age predictor variables.

In [11]:
#3D Plot to visualize findings