In [None]:
library(tidyverse)
library(repr)
library(tidymodels)
options(repr.matrix.max.rows = 6)


# Read the players table and the sessions table directly from the project
# repository, displaying each one right after it is loaded.
players <- read_csv(
  file = "https://raw.githubusercontent.com/Parsa-Sha/DSCI_100_Project/refs/heads/main/data/players.csv"
)
players
sessions <- read_csv(
  file = "https://raw.githubusercontent.com/Parsa-Sha/DSCI_100_Project/refs/heads/main/data/sessions.csv"
)
sessions

In [None]:
library(lubridate)

# sessions_tidy <- left_join(sessions, players) |>
#   mutate(
#     start_time = as.POSIXct(start_time, format = "%d/%m/%Y %H:%M"),
#     end_time   = as.POSIXct(end_time,   format = "%d/%m/%Y %H:%M"),

    
#     start_year  = year(start_time),
#     start_month = month(start_time),
#     start_day   = day(start_time),
#     start_hour  = hour(start_time),
#     start_min   = minute(start_time),

    
#     end_year  = year(end_time),
#     end_month = month(end_time),
#     end_day   = day(end_time),
#     end_hour  = hour(end_time),
#     end_min   = minute(end_time)
#   ) |>
#   select(
#     hashedEmail,
#     original_start_time,
#     original_end_time,
#     start_day, start_month, start_year, start_hour, start_min,
#     end_day, end_month, end_year, end_hour, end_min, played_hours, experience, Age, gender
#   ) |>
#     group_by(hashedEmail) |>
#     mutate(number_of_sessions = n())

# head(sessions_tidy)
# Calculate duration per session (in minutes).
# Timestamps are text in day/month/year hour:minute form. They are parsed in a
# fixed zone (UTC) so the result does not depend on the machine's local time
# zone: with the default (local) zone, a wall-clock time that falls into a
# daylight-saving gap may fail to parse, and a session that crosses a DST
# change would be off by an hour.
# NOTE(review): the true zone of the recorded times is not visible here; UTC is
# used only to make wall-clock differences reproducible -- confirm.
sessions_tidy <- sessions %>%
  mutate(
    start_time = as.POSIXct(start_time, format = "%d/%m/%Y %H:%M", tz = "UTC"),
    end_time   = as.POSIXct(end_time, format = "%d/%m/%Y %H:%M", tz = "UTC"),
    session_duration = as.numeric(difftime(end_time, start_time, units = "mins"))
  )

# Summarize per user.
# `total_sessions` counts every recorded session, while `timed_sessions` counts
# only the sessions whose duration could be computed. The total duration skips
# missing durations, so the average must be divided by `timed_sessions`;
# dividing by `total_sessions` would bias the average downwards.
sessions_summary <- sessions_tidy %>%
  group_by(hashedEmail) %>%
  summarise(
    total_session_duration_minutes = sum(session_duration, na.rm = TRUE),
    total_sessions = n(),
    timed_sessions = sum(!is.na(session_duration)),
    .groups = "drop"
  )

# Join with player data.
# Players without any session rows keep NA in the session columns (left join),
# and players with no timed session get NaN for `avg_session_duration`; both
# count as missing for a later `drop_na()`.
player_data <- players %>%
  left_join(sessions_summary, by = "hashedEmail") %>%
  mutate(
    avg_session_duration = total_session_duration_minutes / timed_sessions,
    avg_played_per_session = played_hours / total_sessions
  )
player_data

In [None]:

# Build the modelling table for players whose experience is "Pro":
#   1. encode gender as a number (Male = 0, Female = 1, anything else missing),
#   2. keep only the response and the three predictors,
#   3. discard rows with any missing value,
#   4. convert the response to a factor for classification.
pro_data <- player_data %>%
  filter(experience == "Pro") %>%
  mutate(
    gender_num = if_else(
      gender == "Male",
      0,
      if_else(gender == "Female", 1, NA_real_)
    )
  ) %>%
  select(subscribe, Age, avg_session_duration, gender_num) %>%
  drop_na() %>%
  mutate(subscribe = as.factor(subscribe))
pro_data

In [None]:
# Fix the random seed so the train/test partition is reproducible.
set.seed(123)

# 80/20 split of the Pro players, stratified on the response variable.
pro_split <- pro_data %>%
  initial_split(prop = 0.8, strata = subscribe)
pro_test  <- testing(pro_split)
pro_train <- training(pro_split)


In [None]:
# Preprocessing built from the training rows: remove zero-variance predictors,
# then normalize the predictors that remain.
pro_recipe <- recipe(
  subscribe ~ Age + avg_session_duration + gender_num,
  data = pro_train
) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_predictors())


In [None]:
# K-nearest-neighbours classification model (k = 5) using the kknn engine.
knn_spec <- nearest_neighbor(mode = "classification", neighbors = 5) %>%
  set_engine("kknn")


In [None]:
# Combine the preprocessing recipe and the model specification in one workflow.
knn_workflow <- workflow() %>%
  add_recipe(pro_recipe) %>%
  add_model(knn_spec)

# Train the workflow on the training partition.
knn_fit <- knn_workflow %>%
  fit(data = pro_train)


In [None]:
# Predict the held-out rows and place the predicted class next to the
# original test columns.
pro_preds <- bind_cols(
  predict(knn_fit, new_data = pro_test),
  pro_test
)

# Score the predictions against the true labels with yardstick's default
# metric set.
library(yardstick)
pro_preds %>%
  metrics(truth = subscribe, estimate = .pred_class)


In [None]:
library(ggplot2)

# Scatter plot of the test-set predictions: colour encodes the predicted class,
# shape encodes the actual `subscribe` value.
# `drop = FALSE` keeps every factor level in both scales. By default unused
# levels are dropped, so if only one class appears in these rows the first
# colour ("red") or first shape would be assigned to whichever level remains,
# and the encoding would no longer match other plots of the same classes.
ggplot(pro_preds, aes(x = avg_session_duration, y = Age)) +
  geom_point(aes(color = .pred_class, shape = subscribe), size = 3, alpha = 0.8) +
  scale_color_manual(values = c("red", "blue"), drop = FALSE) +
  scale_shape_discrete(drop = FALSE) +
  labs(
    title = "KNN Prediction vs. Actual for Pro Experience Group",
    subtitle = "Color = Predicted, Shape = Actual (Subscribe)",
    x = "Average Session Duration (mins)",
    y = "Age",
    # Readable legend titles instead of the raw column names.
    color = "Predicted",
    shape = "Actual"
  ) +
  theme_minimal()


In [None]:
# Reuse the fitted workflow to predict every row of `pro_data`.
# Note: `pro_data` contains the rows the model was trained on, so this plot is
# a description of the fit, not an out-of-sample evaluation.
all_preds <- predict(knn_fit, new_data = pro_data) %>%
  bind_cols(pro_data)

# Colour encodes the predicted class, shape encodes the actual `subscribe`
# value. `drop = FALSE` keeps every factor level in both scales, so the
# colour/shape assigned to each class does not change when a level happens to
# be absent from the plotted rows (by default unused levels are dropped and
# the first colour/shape shifts to whichever level remains).
ggplot(all_preds, aes(x = avg_session_duration, y = Age)) +
  geom_point(aes(color = .pred_class, shape = subscribe), size = 3, alpha = 0.8) +
  scale_color_manual(values = c("red", "blue"), drop = FALSE) +
  scale_shape_discrete(drop = FALSE) +
  labs(
    title = "KNN Predictions on Full Pro Data",
    subtitle = "Color = Predicted, Shape = Actual (Subscribe)",
    x = "Average Session Duration (mins)",
    y = "Age",
    # Readable legend titles instead of the raw column names.
    color = "Predicted",
    shape = "Actual"
  ) +
  theme_minimal()
