In [None]:
# Packages attached for this notebook.
library(tidyverse)
library(repr)
library(tidymodels)
# NOTE(review): GGally and ISLR are not referenced by the cells visible here;
# confirm they are still needed further down the notebook.
library(GGally)
library(ISLR)
# Limit how many rows of a data frame are shown when a cell prints one.
options(repr.matrix.max.rows = 6)
# Run the external script cleanup.R (its contents are not part of this file).
source("cleanup.R")

In [None]:
# Read players.csv from the project repository and preview the first rows.
players <- "https://raw.githubusercontent.com/Snowy129/DSCI100-Project/refs/heads/main/players.csv" |>
    read_csv()
players |> head()

# Read sessions.csv from the same repository and preview the first rows.
sessions <- "https://raw.githubusercontent.com/Snowy129/DSCI100-Project/refs/heads/main/sessions.csv" |>
    read_csv()
sessions |> head()

In [None]:
# Keep only the columns used for modelling, then fix their types:
# gender and experience become factors, age becomes an integer.
clean_players <- players |>
    select(played_hours, gender, age, experience, subscribe) |>
    mutate(
        across(c(gender, experience), as_factor),
        age = as.integer(age)
    )
clean_players |> head()

In [None]:
# Fix the random seed so the split below is reproducible.
set.seed(1234)

# Split the cleaned data 80% training / 20% testing, stratified on the
# response (played_hours).
players_split <- clean_players |>
    initial_split(prop = 0.80, strata = played_hours)
players_training <- players_split |> training()
players_testing <- players_split |> testing()



# Plan

- Fit a first KNN regression using all predictors
    - tune the number of neighbours with v-fold cross-validation
    - find the weight of each predictor and compute the RMSPE

- Fit a second KNN regression using only the 3 predictors with the highest weights
    - use v-fold cross-validation again
    - find the 2 predictors with the highest weights and compute the RMSPE

- Fit a final KNN regression using only the 2 predictors with the highest weights
    - use v-fold cross-validation again
    - compute the RMSPE

- Compare the three RMSPE values and decide which model is best (the one with the lowest RMSPE)


In [None]:
# KNN regression of played_hours on all predictors: tune the number of
# neighbours with v-fold cross-validation and keep the RMSE of every
# candidate value of k.

# Unweighted ("rectangular") KNN; `neighbors` is left open for tuning.
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
    set_engine("kknn") |>
    set_mode("regression")

# step_scale()/step_center() only accept numeric columns, but gender and
# experience are factors, and subscribe is never converted to a number
# upstream (presumably logical -- TODO confirm). So:
#   1. turn subscribe into a factor so it is encoded like the other
#      categorical predictors,
#   2. expand every categorical predictor into 0/1 indicator columns,
#   3. drop indicators that are constant in the data being prepped (a rare
#      level can be absent from a resample), since a zero-variance column
#      cannot be scaled,
#   4. standardise what remains.
knn_recipe <- recipe(played_hours ~ ., data = players_training) |>
    step_mutate(subscribe = factor(subscribe)) |>
    step_dummy(all_nominal_predictors()) |>
    step_zv(all_predictors()) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())

n_folds <- 5
player_vfold <- vfold_cv(players_training, v = n_folds, strata = played_hours)

# A model cannot use more neighbours than it has rows to fit on. Each resample
# fits on roughly (v - 1) / v of the training rows, so cap the largest
# candidate k just below that size instead of always going up to 200. The
# margin of 5 allows for folds of slightly unequal size.
fold_fit_rows <- floor(nrow(players_training) * (n_folds - 1) / n_folds)
max_neighbors <- max(1, min(200, fold_fit_rows - 5))
grid_vals <- tibble(neighbors = seq(from = 1, to = max_neighbors, by = 3))

# NOTE: despite its name, this object holds the cross-validated RMSE table
# (one row per candidate k), not a workflow.
all_pred_workflow <- workflow() |>
    add_recipe(knn_recipe) |>
    add_model(knn_spec) |>
    tune_grid(resamples = player_vfold, grid = grid_vals) |>
    collect_metrics() |>
    filter(.metric == "rmse")

all_pred_workflow