In [None]:
library(patchwork)
library(tidymodels)

library(tidyverse)
library(repr)
library(infer)
library(lubridate)

options(repr.matrix.max.rows = 6)
library(dplyr)


## Merge the data frames

In [None]:
# Read the two raw CSV files and attach the player columns to the session
# rows, matching on hashedEmail.
set.seed(1)

# NOTE(review): setwd() pins the notebook to one machine's directory layout.
# It is kept here because other cells may rely on the working directory;
# consider starting the kernel from the project folder instead.
setwd("/home/jovyan/work/Minecraft-DSCI100-Project/R code")

players <- read_csv("data/players.csv")
unclean_sessions <- read_csv("data/sessions.csv")

# left_join() keeps every row of unclean_sessions, so the result has at least
# one row per sessions.csv row, not one row per player.
merged_df <- unclean_sessions |>
  left_join(players, by = "hashedEmail")
merged_df |> glimpse()

## What does the data look like?

In [None]:
set.seed(1)

# Candidate predictors: play experience, hours logged, gender, and age.
# Response: whether the player subscribed to the newsletter.
# NOTE(review): merged_df comes from joining players onto sessions, so its
# rows are not guaranteed to be one per player -- confirm the intended unit
# of observation before modelling.

# Per-gender summary: mean played_hours plus the number of rows in each
# group, so the relative size of every gender category can be checked.
gender_play_time <- merged_df |>
  group_by(gender) |>
  summarize(avg_hours = mean(played_hours), n_rows = n())

# The analysis keeps only the Female, Male, and Non-binary categories, on the
# grounds that they make up most of the player base (see n_rows above).
# subscribe is converted to a factor so it can serve as a classification
# response.
final_df <- merged_df |>
  filter(gender %in% c("Female", "Male", "Non-binary")) |>
  mutate(subscribe = as_factor(subscribe))
glimpse(final_df)

In [None]:
# Player type: split the rows by self-reported experience level.
# Rows whose experience is not one of the five values listed below end up in
# neither subset.
expert_player <- final_df |>
  filter(experience %in% c("Veteran", "Pro"))
noob_player <- final_df |>
  filter(experience %in% c("Beginner", "Amateur", "Regular"))


# Prep for K-NN Classification

In [None]:
set.seed(1)

# Scatter plot of played_hours against Age, coloured by subscription status.
#   data        - data frame with Age, played_hours, and subscribe columns
#   group_label - text shown as the subtitle; names the experience group so
#                 the two stacked panels can be told apart
subscription_scatter <- function(data, group_label) {
  data |>
    ggplot(aes(x = Age, y = played_hours, color = subscribe)) +
    geom_point(alpha = 0.5) +
    labs(
      title = "Subscription Status by Age and Play Hours",
      subtitle = group_label,
      color = "Subscribed"
    ) +
    theme_minimal()
}

pro_plot <- subscription_scatter(
  expert_player,
  "Experience: Veteran or Pro"
)
noob_plot <- subscription_scatter(
  noob_player,
  "Experience: Beginner, Amateur, or Regular"
)

# Class balance of the response over all retained rows (count and percent).
# The object is stored but not printed in this cell.
num_obs <- nrow(final_df)
summary <- final_df |>
  group_by(subscribe) |>
  summarize(count = n(), percentage = n() / num_obs * 100)

# Stack the two panels (patchwork `/`), share one legend, and move it below.
(pro_plot / noob_plot) +
  plot_layout(guides = "collect") & theme(legend.position = "bottom")

In [None]:
set.seed(1)

# Train/test split. Despite the `pro_` prefix, the split is taken from all of
# final_df (not only the Veteran/Pro subset): 75% training, 25% testing,
# stratified on the response.
# NOTE(review): if final_df holds several rows per player, the same player
# can appear in both the training and the testing set -- confirm.
pro_split <- final_df |>
  select(subscribe, Age, played_hours) |>
  initial_split(prop = 0.75, strata = subscribe)
pro_train <- training(pro_split)
pro_test <- testing(pro_split)

# Class proportions inside the training set.
train_prop <- pro_train |>
  count(subscribe) |>
  mutate(percent = 100 * n / nrow(pro_train))
train_prop

# Standardize both predictors; the recipe is defined on the training data.
sub_recipe <- recipe(subscribe ~ ., data = pro_train) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

# First-pass K-NN classifier with a fixed k = 6 and unweighted votes.
knn_spec <- nearest_neighbor(neighbors = 6, weight_func = "rectangular") |>
  set_engine("kknn") |>
  set_mode("classification")

knn_fit <- workflow() |>
  add_recipe(sub_recipe) |>
  add_model(knn_spec) |>
  fit(data = pro_train)

# Predict on the held-out rows and keep the predictions next to the truth.
sub_test_predict <- bind_cols(predict(knn_fit, pro_test), pro_test)

# Test-set accuracy.
metrics <- sub_test_predict |>
  metrics(truth = subscribe, estimate = .pred_class) |>
  filter(.metric == "accuracy")
metrics

confusion <- conf_mat(
  sub_test_predict,
  truth = subscribe,
  estimate = .pred_class
)
confusion

# NOTE(review): event_level = "first" scores the first factor level of
# subscribe as the event of interest. Which level comes first depends on how
# as_factor() ordered the raw values -- confirm it is the "subscribed" class,
# otherwise use event_level = "second".
recall <- sub_test_predict |>
  recall(truth = subscribe, estimate = .pred_class, event_level = "first") |>
  pull(.estimate)
precision <- sub_test_predict |>
  precision(truth = subscribe, estimate = .pred_class, event_level = "first") |>
  pull(.estimate)

recall
precision
# These values are for the fixed k = 6 used above.
# The next step is to tune k with cross-validation.

## Tune our model

In [None]:
set.seed(1)

# 5-fold cross-validation on the training data, stratified on the response.
sub_fold <- vfold_cv(pro_train, v = 5, strata = subscribe)

# Replace the fixed-k specification with one whose neighbors value is tuned.
knn_spec <- nearest_neighbor(
  neighbors = tune(),
  weight_func = "rectangular"
) |>
  set_engine("kknn") |>
  set_mode("classification")

# Candidate k values: 1, 6, 11, ..., 96.
k_vals <- tibble(neighbors = seq(1, 100, by = 5))

# Evaluate every candidate k on every fold. The standardization recipe
# (sub_recipe) defined with the training data is reused as-is.
knn_results <- workflow() |>
  add_recipe(sub_recipe) |>
  add_model(knn_spec) |>
  tune_grid(resamples = sub_fold, grid = k_vals) |>
  collect_metrics()

# Keep only the accuracy rows of the collected metrics.
accuracies <- filter(knn_results, .metric == "accuracy")

# Mean cross-validated accuracy as a function of k.
k_plot <- ggplot(accuracies, aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  ggtitle("accuracy vs k")

In [None]:
# Show the accuracy curve, then list every candidate k ordered from highest
# to lowest mean cross-validated accuracy (the best k is the first row).
k_plot
best_k <- arrange(accuracies, desc(mean))
print(best_k)

## How does our model perform?
- best k = 6
- recall = 0.773
- precision = 0.773




In [None]:
# Final look at the column types and first values of the modelling data.
final_df |> glimpse()