In [None]:
library(tidymodels)
library(tidyverse)
library(repr)
library(readxl)
# Run the project-local script cleanup.R in this session. Its contents are not
# visible from this notebook, so whatever it defines or changes is unknown here.
source("cleanup.R")
# Set the repr option repr.matrix.max.rows to 6 (presumably this caps how many
# table rows the notebook prints -- confirm against the repr documentation).
options(repr.matrix.max.rows = 6)

In [None]:
# Read the two CSV files from the working directory into tibbles.
player_data <- "players.csv" |> read_csv()
session_data <- "sessions.csv" |> read_csv()

# Display the player table exactly as it was loaded.
player_data

In [None]:
# This cell tidies the data and readies it to be analyzed, visualized, and
# modelled: it drops the hashedEmail and name columns and converts subscribe,
# gender, and experience to factors.

player_data_clean <- player_data |>
  select(-c(hashedEmail, name)) |>
  mutate(across(c(subscribe, gender, experience), as.factor))

# Display the tidied table.
player_data_clean

In [None]:
# This cell makes the visualizations showing the relationships between
# subscription and gender, experience level, and age. The plots read from
# player_data_clean (the tidied table) so they use the same factor-typed
# columns as the rest of the analysis.

# Share of each subscription status within every gender. position = "fill"
# stretches each bar to height 1, so the y-axis is a proportion of players.
player_barplot <- player_data_clean |>
  ggplot(aes(x = gender, fill = subscribe)) +
  geom_bar(position = "fill") +
  labs(
    title = "Proportion Subscribed by Gender",
    x = "Gender",
    y = "Proportion of players",
    fill = "Subscribed"
  )

# Same proportion view, broken down by experience level.
player_barplot_experience <- player_data_clean |>
  ggplot(aes(x = experience, fill = subscribe)) +
  geom_bar(position = "fill") +
  labs(
    title = "Proportion Subscribed by Experience",
    x = "Experience level",
    y = "Proportion of players",
    fill = "Subscribed"
  )

# Overlaid age histograms, one per subscription status. position = "identity"
# draws the two groups on top of each other (alpha keeps both visible), so each
# bar counts players of that status, not subscriptions.
player_histogram <- player_data_clean |>
  ggplot(aes(x = Age, fill = subscribe)) +
  geom_histogram(position = "identity", alpha = 0.5, bins = 30) +
  labs(
    title = "Distribution of Age by Subscription Status",
    x = "Age",
    y = "Number of players",
    fill = "Subscribed"
  )

player_barplot
player_barplot_experience
player_histogram

In [None]:
# This cell sets up the KNN classification model for predicting subscriptions.

# NOTE(review): `age` and `hours` are standardized on the full data set before
# the split, and the recipe below does not use them (it works from `Age` and
# `played_hours`). They are kept in case later cells read them; if nothing
# does, drop this step so no test-set information can reach training.
player_data_clean <- player_data_clean |>
  mutate(
    age = scale(Age),
    hours = scale(played_hours)
  )

# Fix the RNG seed so the random train/test split below is reproducible.
set.seed(2024)

# 1. Split the data into training (75%) and testing (25%) sets, stratified on
# the outcome variable subscribe.
data_split <- initial_split(player_data_clean, prop = 0.75, strata = subscribe)
train_data <- training(data_split)
test_data <- testing(data_split)

# 2. Create a recipe. Centering and scaling are restricted to the numeric
# predictors: gender and experience are factors, and step_center() /
# step_scale() raise an error when a non-numeric column is selected.
knn_recipe <- recipe(
  subscribe ~ Age + gender + played_hours + experience,
  data = train_data
) |>
  step_center(all_numeric_predictors()) |>
  step_scale(all_numeric_predictors())

# 3. Specify the model: K-nearest neighbours with the "rectangular" weight
# function; the number of neighbours is left as tune() to be chosen by tuning.
knn_spec <- nearest_neighbor(
  neighbors = tune(),
  weight_func = "rectangular"
) |>
  set_engine("kknn") |>
  set_mode("classification")