In [None]:
library(tidymodels)
library(tidyverse)
library(repr)
library(readxl)
# Cap the number of rows shown when a data frame is displayed in the notebook.
options(repr.matrix.max.rows = 6)
# Fix the random seed so the random steps further down are reproducible.
set.seed(40000)

In [None]:
# Read the two raw CSV tables from the working directory.
player_data <- read_csv(file = "players.csv")
session_data <- read_csv(file = "sessions.csv")

# Preview the player table.
player_data

In [None]:
# Tidy the raw player data so it is ready for visualization and modelling.

# Drop the hashedEmail and name columns, then convert the categorical
# variables to factors.
player_data_clean_0 <- player_data |>
  select(-c(hashedEmail, name)) |>
  mutate(across(c(subscribe, gender, experience), as.factor))

# Keep only the three columns used by the model and remove every row that
# has a missing value in any of them.
player_data_clean <- drop_na(
  select(player_data_clean_0, subscribe, played_hours, Age)
)

player_data_clean

In [None]:
# Build the main visualizations showing how the player variables relate to
# subscription status.


# player_barplot <- ggplot(player_data, aes(x = gender, fill = subscribe)) +
#   geom_bar(position = "fill") +
#   labs(
#     title = "Proportion Subscribed by Gender",
#     y = "Proportion of subscription", x = "Gender")

# player_barplot_experience <- ggplot(player_data, aes(x = experience, fill = subscribe)) +
#   geom_bar(position = "fill") +
#   labs(
#     title = "Proportion Subscribed by experience",
#     y = "Proportion of subscription", x = "Experience level")

# Histogram of age, with side-by-side bars per subscription status.
player_histogram <- player_data |>
  ggplot(aes(x = Age, fill = subscribe)) +
  geom_histogram(bins = 30, alpha = 0.5, position = "dodge") +
  labs(
    title = "Distribution of Age by Subscription Status",
    x = "Age",
    y = "Count of subscriptions"
  )

# Histogram of played hours, with side-by-side bars per subscription status.
player_histogram_hours <- player_data |>
  ggplot(aes(x = played_hours, fill = subscribe)) +
  geom_histogram(bins = 30, alpha = 0.5, position = "dodge") +
  labs(
    title = "Distribution of played hours by Subscription Status",
    x = "played hours",
    y = "Count of subscriptions"
  )

# Scatter plot of played hours against age, coloured by subscription status.
player_scatter <- player_data |>
  ggplot(aes(x = Age, y = played_hours, color = subscribe)) +
  geom_point(alpha = 0.5) +
  labs(
    title = "Subscription Status based on played hours and player age",
    x = "Age",
    y = "Played hours"
  )

# Display the plots.
# player_barplot
# player_barplot_experience
player_histogram
player_histogram_hours
player_scatter

In [None]:
# KNN classification model for predicting subscription: split the data,
# tune the number of neighbours with cross-validation, and pick the best k.

# NOTE: the predictors are not standardized by hand before the split.
# The recipe below already centers and scales them, so an extra scale() call
# is redundant and would also store the values as one-column matrices.

# Split the data into training (75%) and testing sets, stratified by the
# response variable.
data_split <- initial_split(player_data_clean, prop = 0.75, strata = subscribe)
train_data <- training(data_split)
test_data <- testing(data_split)

# Create the folds for 5-fold cross-validation on the training set.
folds <- vfold_cv(train_data, v = 5, strata = subscribe)

# Candidate values of k (1 to 15) to try during tuning.
k_vals <- tibble(neighbors = seq(from = 1, to = 15, by = 1))

# Recipe: predict subscribe from Age and played_hours, centering and
# scaling both predictors.
knn_recipe <- recipe(subscribe ~ Age + played_hours, data = train_data) |>
  step_center(all_predictors()) |>
  step_scale(all_predictors())

# Model specification with the number of neighbours left to be tuned.
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
  set_engine("kknn") |>
  set_mode("classification")

# Tune the workflow over the grid of k values. Despite its name,
# knn_workflow holds the collected cross-validation metrics, not a workflow.
knn_workflow <- workflow() |>
  add_recipe(knn_recipe) |>
  add_model(knn_spec) |>
  tune_grid(resamples = folds, grid = k_vals) |>
  collect_metrics()

# Keep only the accuracy rows. The pipeline must end here: a trailing pipe
# would feed into the best_k assignment below and stop the cell from parsing.
accuracies <- knn_workflow |>
  filter(.metric == "accuracy")

# Pick the k with the highest mean accuracy. with_ties = FALSE guarantees a
# single value even when several k share the top accuracy.
best_k <- accuracies |>
  slice_max(mean, n = 1, with_ties = FALSE) |>
  pull(neighbors)
best_k

In [None]:
# Plot the estimated accuracy against the number of neighbours to help
# judge which value of k is best.

accuracy_vs_k <- ggplot(accuracies, aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  labs(x = "Neighbors", y = "Accuracy Estimate") +
  theme(text = element_text(size = 12))

# Display the plot; assigning it alone does not render it in the notebook.
accuracy_vs_k

In [None]:
# Refit the model with the chosen k and evaluate it on the test set.

# Model specification with the number of neighbours fixed at best_k.
knn_spec_best <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = best_k
) |>
  set_engine("kknn") |>
  set_mode("classification")

# Fit the recipe + model workflow on the training data.
knn_fit <- fit(
  add_model(add_recipe(workflow(), knn_recipe), knn_spec_best),
  data = train_data
)

# Predict on the test set and attach the predictions to the test rows.
player_test_predictions <- bind_cols(predict(knn_fit, test_data), test_data)

# Test-set accuracy.
metrics(player_test_predictions, truth = subscribe, estimate = .pred_class) |>
  filter(.metric == "accuracy")

# Show the factor level order; precision and recall below use the second
# level as the event of interest.
levels(player_test_predictions$subscribe)

# Test-set precision.
precision(
  player_test_predictions,
  truth = subscribe,
  estimate = .pred_class,
  event_level = "second"
)

# Test-set recall.
recall(
  player_test_predictions,
  truth = subscribe,
  estimate = .pred_class,
  event_level = "second"
)

# Confusion matrix of true versus predicted classes.
confusion <- conf_mat(
  player_test_predictions,
  truth = subscribe,
  estimate = .pred_class
)
confusion