In [None]:
library(patchwork)
library(tidymodels)

library(tidyverse)
library(repr)
library(infer)
library(lubridate)

options(repr.matrix.max.rows = 6)
library(dplyr)


## Merge data frames

In [None]:
# Fix the RNG seed for this cell.
set.seed(1)

# The relative "data/..." paths below resolve against this directory.
# NOTE(review): hard-coded absolute path -- only valid on a machine with this
# exact directory layout.
setwd("/home/jovyan/work/Minecraft-DSCI100-Project/R code")

# Load the two raw tables.
players <- read_csv("data/players.csv")
unclean_sessions <- read_csv("data/sessions.csv")

# Attach the player columns to every session row, matching on hashedEmail.
# Every session row is kept; a session with no matching player gets NA in the
# player columns.
# NOTE(review): presumably one player has many sessions, in which case the
# player-level columns repeat once per session in merged_df -- confirm this
# is the intended unit of analysis.
merged_df <- unclean_sessions |>
  left_join(players, by = "hashedEmail")


## What does the data look like?

In [None]:
set.seed(1)

# Mean played_hours for each gender category in the merged data.
gender_play_time <- merged_df |>
  group_by(gender) |>
  summarize(avg_hours = mean(played_hours))
print(gender_play_time)

# Only rows whose gender is Female, Male or Non-binary are kept for the rest
# of the analysis (the original analysis note says these categories make up
# the large majority of the player base; the summary above shows mean hours,
# not counts).
kept_genders <- c("Female", "Male", "Non-binary")

# subscribe is converted to a factor so it can be used as a class label.
final_df <- merged_df |>
  filter(gender %in% kept_genders) |>
  mutate(subscribe = as_factor(subscribe))


In [None]:
# PLAYER TYPE
# Split final_df into two experience tiers. Rows whose experience value is in
# neither list end up in neither tier.
expert_levels <- c("Veteran", "Pro")
novice_levels <- c("Beginner", "Amateur", "Regular")

expert_player <- final_df |> filter(experience %in% expert_levels)
noob_player <- final_df |> filter(experience %in% novice_levels)

In [None]:
set.seed(1)

# Scatter plots of played_hours against Age, coloured by subscription status,
# one per experience tier. Both panels share the same title, so each gets a
# subtitle naming its tier; without it the stacked panels are
# indistinguishable.
pro_plot <- expert_player |>
  ggplot(aes(x = Age, y = played_hours, color = subscribe)) +
  geom_point(alpha = 0.5) +
  labs(
    title = "Subscription Status by Age and Play Hours",
    subtitle = "Expert players (Veteran, Pro)",
    color = "Subscribed"
  ) +
  theme_minimal()

noob_plot <- noob_player |>
  ggplot(aes(x = Age, y = played_hours, color = subscribe)) +
  geom_point(alpha = 0.5) +
  labs(
    title = "Subscription Status by Age and Play Hours",
    subtitle = "Newer players (Beginner, Amateur, Regular)",
    color = "Subscribed"
  ) +
  theme_minimal()

# Row count and percentage of final_df in each subscribe class.
# This table is assigned but not displayed in this cell.
num_obs <- nrow(final_df)
summary <- final_df |>
  group_by(subscribe) |>
  summarize(count = n(), percentage = n() / num_obs * 100)

# Stack the expert panel above the novice panel with one shared legend
# placed at the bottom.
(pro_plot / noob_plot) +
  plot_layout(guides = "collect") & theme(legend.position = "bottom")

# Prep for K-NN Classification

In [None]:
set.seed(1)

# Keep only the class label and the two predictors for each tier.
pro_train <- select(expert_player, subscribe, Age, played_hours)
noob_train <- select(noob_player, subscribe, Age, played_hours)

# 75/25 train/test split, stratified on subscribe.
# The *_train names are reused: from here on they hold the training portion
# only.
# PRO
pro_split <- initial_split(pro_train, prop = 0.75, strata = subscribe)
pro_train <- training(pro_split)
pro_test <- testing(pro_split)
# NOOB
noob_split <- initial_split(noob_train, prop = 0.75, strata = subscribe)
noob_train <- training(noob_split)
noob_test <- testing(noob_split)

# Class balance (row count and percentage) within each training set.
pro_prop <- pro_train |>
  count(subscribe) |>
  mutate(percent = 100 * n / nrow(pro_train))

noob_prop <- noob_train |>
  count(subscribe) |>
  mutate(percent = 100 * n / nrow(noob_train))

cat("Pro Subscription Rate")
pro_prop
cat("New Player Subscription Rate")
noob_prop

# Standardise every predictor (scale, then centre).
pro_recipe <- recipe(subscribe ~ ., data = pro_train) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())
noob_recipe <- recipe(subscribe ~ ., data = noob_train) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

# K-NN classifier with unweighted votes and a fixed neighbour count of 6.
knn_spec <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = 6
) |>
  set_engine("kknn") |>
  set_mode("classification")

# One workflow per tier, each fitted on its own training set.
pro_wf <- workflow() |>
  add_model(knn_spec) |>
  add_recipe(pro_recipe)
noob_wf <- workflow() |>
  add_model(knn_spec) |>
  add_recipe(noob_recipe)

pro_fit <- fit(pro_wf, data = pro_train)
noob_fit <- fit(noob_wf, data = noob_train)

# Predicted class for each held-out row, placed alongside the test data.
pro_test_predict <- bind_cols(predict(pro_fit, pro_test), pro_test)
noob_test_predict <- bind_cols(predict(noob_fit, noob_test), noob_test)

# Test-set accuracy: take the "accuracy" row of the metric table and extract
# its estimate.
pro_accuracy <- pro_test_predict |>
  metrics(truth = subscribe, estimate = .pred_class) |>
  filter(.metric == "accuracy") |>
  pull(.estimate)
noob_accuracy <- noob_test_predict |>
  metrics(truth = subscribe, estimate = .pred_class) |>
  filter(.metric == "accuracy") |>
  pull(.estimate)

# Test-set recall and precision.
# NOTE(review): event_level = "first" scores whichever level of subscribe
# comes first in the factor -- confirm that this is the class meant to be
# treated as the positive event.
pro_recall <- pro_test_predict |>
  recall(truth = subscribe, estimate = .pred_class, event_level = "first") |>
  pull(.estimate)
pro_precision <- pro_test_predict |>
  precision(truth = subscribe, estimate = .pred_class, event_level = "first") |>
  pull(.estimate)
noob_recall <- noob_test_predict |>
  recall(truth = subscribe, estimate = .pred_class, event_level = "first") |>
  pull(.estimate)
noob_precision <- noob_test_predict |>
  precision(truth = subscribe, estimate = .pred_class, event_level = "first") |>
  pull(.estimate)

cat("PRO ACCURACY: ", pro_accuracy, "\n")
cat("PRO RECALL: ", pro_recall, "\n")
cat("PRO PRECISION: ", pro_precision, "\n")
cat("NEW PLAYER ACCURACY: ", noob_accuracy, "\n")
cat("NEW PLAYER RECALL: ", noob_recall, "\n")
cat("NEW PLAYER PRECISION: ", noob_precision)


In [None]:
set.seed(1)

## PRO CHOOSE K
# 5-fold cross-validation on the expert training set, stratified on
# subscribe.
sub_fold <- vfold_cv(pro_train, v = 5, strata = subscribe)
# Candidate neighbour counts: the odd values 1, 3, ..., 19.
k_vals <- tibble(neighbors = seq(from = 1, to = 20, by = 2))

# Same K-NN specification as before, but with the neighbour count left open
# for tuning. This replaces the earlier fixed-k knn_spec.
knn_spec <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = tune()
) |>
  set_engine("kknn") |>
  set_mode("classification")

# pro_recipe (defined earlier on pro_train) is reused here, not recreated.
knn_results <- workflow() |>
  add_recipe(pro_recipe) |>
  add_model(knn_spec) |>
  tune_grid(resamples = sub_fold, grid = k_vals) |>
  collect_metrics()
accuracies <- knn_results |>
  filter(.metric == "accuracy")

k_pro_plot <- accuracies |>
  ggplot(aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  ggtitle("accuracy vs k", subtitle = "Expert players")

# Show the expert-tier results in this cell: knn_results and accuracies are
# reassigned by the novice tuning cell, so they would otherwise never be
# inspected.
k_pro_plot
pro_best_k <- accuracies |> arrange(desc(mean))
print(pro_best_k)

## Tune our model

In [None]:
set.seed(1)

## NOOB CHOOSE K
# Standardisation recipe for the novice training set (scale, then centre).
noob_recipe <- recipe(subscribe ~ ., data = noob_train) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

# Cross-validation folds must come from the novice training set. Resampling
# sub_fold (built from pro_train) would tune this model on expert data.
noob_fold <- vfold_cv(noob_train, v = 5, strata = subscribe)

# knn_spec (with neighbors = tune()) and k_vals come from the expert tuning
# cell.
knn_results <- workflow() |>
  add_recipe(noob_recipe) |>
  add_model(knn_spec) |>
  tune_grid(resamples = noob_fold, grid = k_vals) |>
  collect_metrics()
accuracies <- knn_results |>
  filter(.metric == "accuracy")

k_plot <- accuracies |>
  ggplot(aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  ggtitle("accuracy vs k")

In [None]:
# Render the accuracy-vs-k plot from the most recent tuning run.
k_plot
# Candidate k values ranked from highest to lowest mean accuracy; the whole
# ranked table is kept, not just the top row.
best_k <- arrange(accuracies, desc(mean))
print(best_k)

## How does our model perform?
- best k = 6
- recall = 0.773
- precision = 0.773




In [None]:
# Compact column-by-column overview of the filtered modelling data.
final_df |> glimpse()