# PREDICTING MINECRAFT SERVER NEWSLETTER SUBSCRIPTION USING PLAYER DEMOGRAPHICS AND BEHAVIOR

**Name:** Zhaoxuan Wu  
**GitHub:** [https://github.com/Shad2zz/Zhaoxuanwu-dsci-100](https://github.com/Shad2zz/Zhaoxuanwu-dsci-100)

## Background
- Video‐game research platforms (e.g., Minecraft servers) enable computer science researchers to collect real‐world player behavior data.  
- The UBC research group led by Frank Wood aims to leverage these data to optimize player recruitment and allocate server resources effectively.  
- Subscribing to the game newsletter serves as an indicator of player engagement and future interaction intent.

## Question
> “Can player demographics (age, gender, experience) and behavioral features (total play time, number of sessions, average session duration, night/weekend play proportion) predict whether a player will subscribe to the game newsletter?”

## Data Description
- **players.csv**  
  - **Observations:** 196  
  - **Variables (6):**  
    - `hashedEmail` (string): unique player identifier  
    - `experience` (numeric): cumulative experience points  
    - `played_hours` (numeric): total play time (hours)  
    - `subscribe` (factor): subscription status (“Yes”/“No”)  
    - `gender` (factor): gender (“Male”/“Female”/“Other”)  
    - `Age` (numeric): age in years  
  - **Data Quality:** some missing age values; subscription rate approx. 60% Yes, 40% No

- **sessions.csv**  
  - **Observations:** 1,535  
  - **Variables (3):**  
    - `hashedEmail` (string): unique player identifier  
    - `start_time` (string datetime): session start time (UTC)  
    - `end_time` (string datetime): session end time (UTC)  
  - **Data Quality:** some sessions span midnight, requiring careful handling in feature engineering

> **Potential Issues:**  
> - Time zone alignment and timestamp consistency  
> - Players with no sessions or extremely long/short sessions  
> - Unobserved external factors (e.g., network outages, server maintenance) may influence behavior  










In [None]:
library(tidyverse)   
library(lubridate)   
library(tidymodels)   
library(cowplot)      



# Read both raw tables directly from the project's GitHub repository.
players <- "https://raw.githubusercontent.com/Shad2zz/Zhaoxuanwu-dsci-100/refs/heads/main/players.csv" %>%
  read_csv()
sessions <- "https://raw.githubusercontent.com/Shad2zz/Zhaoxuanwu-dsci-100/refs/heads/main/sessions.csv" %>%
  read_csv()

# Preview the first and last rows of each table as a quick sanity check.
players %>% head()
players %>% tail()
sessions %>% head()
sessions %>% tail()


Parse start_time/end_time as POSIX datetimes.

Compute duration_mins, extract hour and weekday (wday).

Flag sessions in night hours (20:00–06:00) and on weekends (Sat/Sun).

Aggregate per player:

n_sessions, avg_duration, prop_night, prop_weekend.

In [None]:
# Build one row of behavioural features per player from the raw session log:
#   n_sessions   - number of recorded sessions
#   avg_duration - mean session length in minutes
#   prop_night   - share of sessions that start between 20:00 and 05:59
#   prop_weekend - share of sessions that start on a Saturday or Sunday
sessions_features <- sessions %>%
  mutate(
    # NOTE(review): ymd_hms() expects "year-month-day hour:minute:second"
    # strings and yields NA for anything else - confirm against the raw CSV.
    start         = ymd_hms(start_time),
    end           = ymd_hms(end_time),
    duration_mins = as.numeric(difftime(end, start, units = "mins")),
    hour          = hour(start),
    # Numeric weekday with Monday = 1, so Saturday = 6 and Sunday = 7.
    # Comparing the text labels from wday(label = TRUE) against "Sat"/"Sun"
    # only works under an English locale; the numeric form works everywhere.
    day_of_week   = wday(start, week_start = 1),
    # Both flags are based on the session START time only; a session that
    # spans midnight is classified by when it began.
    night         = hour >= 20 | hour < 6,
    weekend       = day_of_week %in% c(6, 7)
  ) %>%
  group_by(hashedEmail) %>%
  summarise(
    n_sessions   = n(),
    avg_duration = mean(duration_mins, na.rm = TRUE),
    prop_night   = mean(night, na.rm = TRUE),
    prop_weekend = mean(weekend, na.rm = TRUE)
  )
sessions_features

Left‐join sessions_features to players on hashedEmail.

Replace NA in new features with 0.

Convert subscribe and gender to factors.

In [None]:
# Attach the per-player session features to the player table. Every player is
# kept, including those who never appear in the session log.
data_all <- left_join(players, sessions_features, by = "hashedEmail") %>%
  # Players without any session have no match in the join; give them 0 for
  # every session-derived feature.
  mutate(
    across(
      c(n_sessions, avg_duration, prop_night, prop_weekend),
      ~ replace_na(.x, 0)
    )
  ) %>%
  # Store the outcome and the categorical predictor as factors.
  mutate(across(c(subscribe, gender), factor))
data_all

Compute the mean and SD of each numeric predictor by subscribe status.

Visualize:

Figure 1: Boxplot of played_hours by subscribe.

Figure 2: Stacked bar chart of gender vs. subscribe.

In [None]:
# Mean and standard deviation of every numeric predictor, split by
# subscription status. Missing values (Age has some) are dropped per column;
# without na.rm = TRUE a single NA would turn the whole summary into NA.
data_all %>%
  group_by(subscribe) %>%
  summarise(
    across(
      c(Age, experience, played_hours, n_sessions, avg_duration, prop_night, prop_weekend),
      list(
        mean = function(x) mean(x, na.rm = TRUE),
        sd   = function(x) sd(x, na.rm = TRUE)
      ),
      .names = "{.col}_{.fn}"
    )
  )

# Figure 1: distribution of total play time for each subscription status.
ggplot(data_all, aes(x = subscribe, y = played_hours)) +
  geom_boxplot() +
  labs(
    title = "Figure 1: Total Play Time by Subscription Status",
    x = "Subscribe",
    y = "Total Play Time (hours)"
  )

# Figure 2: within each gender, the share of players in each subscription
# status (position = "fill" rescales every bar to 100%).
ggplot(data_all, aes(x = gender, fill = subscribe)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(
    title = "Figure 2: Subscription Rate by Gender",
    x = "Gender",
    y = "Proportion Subscribed",
    fill = "Subscribe"
  )

Split data into train (70%) and test (30%) stratified by subscribe.

Define a K-nearest-neighbours (KNN) classification workflow with dummy encoding and normalization, tuning the number of neighbours k.

Perform 5-fold CV on training set to evaluate AUC and accuracy.

Fit final model and assess on test set.



In [None]:
# Fix the RNG so the split and the CV folds are reproducible.
set.seed(123)

# 70/30 train/test split and 5-fold CV on the training part, both stratified
# on the outcome so each piece keeps a similar subscription rate.
data_split <- initial_split(data_all, prop = 0.7, strata = subscribe)
train_data <- training(data_split)
test_data  <- testing(data_split)
cv_folds   <- vfold_cv(train_data, v = 5, strata = subscribe)

# Preprocessing, estimated on the training data only:
#   1. fill missing Age with the training median (kknn has no native handling
#      for missing predictor values),
#   2. turn factor predictors into dummy columns,
#   3. centre and scale every numeric predictor so that no single feature
#      dominates the distance computation.
recipe_knn <- recipe(subscribe ~ Age + gender + experience + played_hours +
                      n_sessions + avg_duration + prop_night + prop_weekend,
                    data = train_data) %>%
  step_impute_median(Age) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors())

# Unweighted KNN classifier; the number of neighbours is left to be tuned.
knn_spec <- nearest_neighbor(neighbors = tune(), weight_func = "rectangular") %>%
  set_engine("kknn") %>%
  set_mode("classification")

wf_knn <- workflow() %>%
  add_model(knn_spec) %>%
  add_recipe(recipe_knn)

# Candidate k values: odd numbers from 3 to 15.
grid <- tibble(neighbors = seq(3, 15, by = 2))

knn_results <- tune_grid(
  wf_knn,
  resamples = cv_folds,
  grid      = grid,
  metrics   = metric_set(accuracy, roc_auc)
)

# Pick k by cross-validated accuracy. `metric` is passed by name because
# recent tune releases no longer accept it positionally.
best_k   <- select_best(knn_results, metric = "accuracy")

# Refit on the full training set with the chosen k and evaluate once on the
# held-out test set.
final_knn <- finalize_workflow(wf_knn, best_k) %>%
  last_fit(data_split)

# Test-set performance of the final model.
metrics_test <- collect_metrics(final_knn)
metrics_test

# Cross-validated performance for every candidate k.
metrics_cv <- knn_results %>% collect_metrics()
metrics_cv %>%
  filter(.metric == "accuracy") %>%
  ggplot(aes(x = neighbors, y = mean)) +
  geom_line() +
  geom_point() +
  labs(
    title = "CV Accuracy vs Number of Neighbors",
    x = "k (neighbors)",
    y = "Accuracy"
  )
metrics_cv %>%
  filter(.metric == "roc_auc") %>%
  ggplot(aes(x = neighbors, y = mean)) +
  geom_line() +
  geom_point() +
  labs(
    title = "CV ROC AUC vs Number of Neighbors",
    x = "k (neighbors)",
    y = "ROC AUC"
  )

In [None]:
# Test-set predictions (class and class probabilities) from the final fit.
final_preds <- final_knn %>% collect_predictions()

# ROC curve for the "Yes" class. yardstick treats the FIRST factor level as
# the event by default; factor() orders levels alphabetically ("No", "Yes"),
# so the event must be set to the second level to match .pred_Yes. Without
# this the curve is mirrored below the diagonal.
# NOTE(review): assumes the outcome levels are "No" and "Yes" - confirm with
# levels(data_all$subscribe).
roc_data    <- final_preds %>%
  roc_curve(truth = subscribe, .pred_Yes, event_level = "second")

# The dashed diagonal is the reference line for a classifier with no skill.
ggplot(roc_data, aes(x = 1 - specificity, y = sensitivity)) +
  geom_line() +
  geom_abline(lty = 2) +
  labs(
    title = paste0("ROC Curve for Final KNN Model (k=", best_k$neighbors, ")"),
    x = "False Positive Rate",
    y = "True Positive Rate"
  )