### DSCI100 Group 006-35 W2 Final Project

Younghoon Kim 11371317 <br>
Kentaro Barnes 49524861 <br>
Sean Liou 86967916 <br>
Matthew Pabustan 48718266 <br>

## How do player experience level and age influence the total number of hours played?

## Datasets
### players.csv overview
A list of 196 unique participants who played on the Minecraft server hosted for a scientific study, with the following information for each player:
##### experience
- Originally a character `<chr>` variable, converted to factor `<fct>` containing one of the following:
    - Beginner (35)
    - Regular (36)
    - Amateur (63)  
    - Veteran (48)
    - Pro (14)
- Represents a player's self-reported experience in the game.
##### subscribe
- a logical `<lgl>` variable where TRUE indicates that the player is subscribed to the game-related newsletter, and FALSE indicates that the player is not subscribed
    - 144 players reported TRUE, 52 reported FALSE.
##### hashedEmail
- a string of characters `<chr>` that acts as a unique id to identify players in sessions.csv
##### played_hours
- A double `<dbl>` variable, indicating the number of hours a participant has spent playing on the server.
    - On average, each player spends a total of 5.85 hours.
##### name
- A character `<chr>` variable containing the player's real (first) name.
##### gender
- Originally a character `<chr>` variable, converted to factor `<fct>` containing one of the following:
    - Male (124)
    - Female (37)
    - Non-binary (15)
    - Prefer not to say (11)
    - Agender (2)
    - Two-Spirited (6)
    - Other (1)
- Represents a player's gender
##### Age
- A double `<dbl>` variable, indicating the age of the player in years.
    - On average, the players are 20.52 years old.
    - Two rows/players contain missing Age data; these rows are kept, and the missing ages are filled in with the mean age (`step_impute_mean`) during modelling.

### sessions.csv overview
A catalogue of all 1535 instances where a player logs into the server with the following information for each instance.
##### hashedEmail
- a string of characters `<chr>` indicating which unique player logged on, allowing us to link session information with the personal information in players.csv.
##### start_time
- Originally a string of characters `<chr>` indicating the time (Day/Month/Year Hour:Minute) when the player logs ON
    - Converted to a date-time `<dttm>` variable for ease of use.
##### end_time
- Originally a string of characters `<chr>` indicating the time (Day/Month/Year Hour:Minute) when the player logs OFF
    - Converted to a date-time `<dttm>` variable for ease of use.
##### original_start_time
- A double `<dbl>` variable that represents the time a player logs ON, in number of milliseconds since 1970.
    - Not reported to high enough precision for comparison, and thus is removed.
##### original_end_time
- A double `<dbl>` variable that represents the time a player logs OFF, in number of milliseconds since 1970.
    - Not reported to high enough precision for comparison, and thus is removed.



In [None]:
# Load necessary libraries
# tidyverse: read_csv(), the dplyr verbs and ggplot2 used throughout the notebook
library(tidyverse)
# repr: notebook display settings (see the options() call below)
library(repr)
# tidymodels: initial_split(), recipe(), workflow(), tune_grid(), metrics()
library(tidymodels)
# NOTE(review): cowplot is not used in the cells visible here - confirm it is needed later
library(cowplot)
# Limit how many rows are shown when a data frame is displayed in the notebook
options(repr.matrix.max.rows = 6)

In [None]:
# Import the raw study data ----

# players.csv: one row per participant, read straight from the GitHub repo
players_url <- "https://raw.githubusercontent.com/KentoBaguetti/DSCI100-GroupProjcect/refs/heads/main/players.csv"
players <- read_csv(players_url)

# sessions.csv: one row per login session, read from the same repo
sessions_url <- "https://raw.githubusercontent.com/KentoBaguetti/DSCI100-GroupProjcect/refs/heads/main/sessions.csv"
sessions <- read_csv(sessions_url)

# Display both raw tables
players
sessions

# Number of players in each self-reported experience level
nnn <- count(players, experience)
nnn

In [None]:
# Build the modelling table ----
# Experience is encoded as its position (1-5) in the ordered scale
# Beginner < Amateur < Regular < Veteran < Pro; only the response
# (played_hours) and the two predictors are kept.
tidy_players <- players |>
  mutate(
    experience_numeric = as.numeric(
      factor(
        experience,
        levels = c("Beginner", "Amateur", "Regular", "Veteran", "Pro"),
        ordered = TRUE
      )
    )
  ) |>
  select(played_hours, experience_numeric, Age)

tidy_players

In [None]:
# Age vs play time, coloured by experience level.
# The numeric experience codes are mapped back to their level names (same
# order as the factor levels used to create the codes) so the legend shows
# "Beginner" ... "Pro" instead of the opaque values 1-5.
prepredict_plot <- tidy_players |>
  ggplot(aes(
    x = Age,
    y = played_hours,
    color = factor(
      experience_numeric,
      levels = 1:5,
      labels = c("Beginner", "Amateur", "Regular", "Veteran", "Pro")
    )
  )) +
  geom_point() +
  labs(
    title = "Distribution of Playtime Across Experience Level and Age",
    x = "Age (Years)",
    y = "Total Hours Played (Hours)",
    color = "Experience Level"
  )

prepredict_plot

In [None]:
# K-nearest-neighbours regression setup.
# KNN regression is used because a linear relationship is not present
# within the data.

# Fix the random seed so the train/test split and the cross-validation folds
# (both drawn at random) are identical on every run; without it the tuning
# results and the chosen k change each time the notebook is executed.
set.seed(2025)

# 75% training / 25% testing, stratified on the response
players_split <- initial_split(tidy_players, prop = 0.75, strata = played_hours)
players_train <- training(players_split)
players_test <- testing(players_split)

# Preprocessing: fill missing numeric predictors with their mean, then scale
# and centre every predictor. The recipe is declared on the training set so
# that model building only ever refers to training data.
knn_recipe <- recipe(played_hours ~ Age + experience_numeric, data = players_train) |>
  step_impute_mean(all_numeric_predictors()) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

# KNN regression model; the number of neighbours is left to be tuned
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
  set_engine("kknn") |>
  set_mode("regression")

# 5-fold cross-validation on the training set, stratified on the response
players_vfold <- vfold_cv(players_train, v = 5, strata = played_hours)

# Bundle preprocessing and model so they are tuned together
players_wf <- workflow() |>
  add_recipe(knn_recipe) |>
  add_model(knn_spec)

players_wf


In [None]:
# Candidate numbers of neighbours: every value from 1 to 30
gridvals <- tibble(neighbors = seq(from = 1, to = 30, by = 1))

# Evaluate each candidate k on the cross-validation folds and keep only the
# RMSE rows of the collected metrics
players_results <- players_wf |>
  tune_grid(resamples = players_vfold, grid = gridvals) |>
  collect_metrics() |>
  filter(.metric == "rmse")

players_results

# RMSE estimate for each k; labelled so the figure can be read on its own
rmse_plot <- players_results |>
  ggplot(aes(x = neighbors, y = mean)) +
  geom_line() +
  geom_point() +
  labs(
    title = "Cross-Validation RMSE by Number of Neighbours",
    x = "Number of Neighbours (k)",
    y = "Mean RMSE (Hours)"
  )

rmse_plot

In [None]:
# Select the number of neighbours with the lowest mean RMSE.
# slice_min(..., with_ties = FALSE) returns exactly one row, so k_min is a
# single value even when several k share the minimum RMSE; filtering with
# `mean == min(mean)` would return every tied row and pass a vector of k
# values to the final model specification.
min_neighbor <- players_results |>
  slice_min(mean, n = 1, with_ties = FALSE)

min_neighbor

# The best k depends on the random train/test split and folds (the authors'
# original run reported k = 27), so it is read from the results rather than
# hard-coded.
k_min <- min_neighbor |>
  pull(neighbors)

k_min

In [None]:
# Final model ----
# Same KNN regression specification as during tuning, now with the selected
# number of neighbours plugged in.
players_final_spec <- nearest_neighbor(
  neighbors = k_min,
  weight_func = "rectangular"
) |>
  set_engine("kknn") |>
  set_mode("regression")

# Refit the preprocessing recipe and the model on the full training set
players_final_fit <- fit(
  workflow() |>
    add_recipe(knn_recipe) |>
    add_model(players_final_spec),
  data = players_train
)

# Predict the held-out test set, attach the predictions to the test rows and
# report only the RMSE
players_summary <- metrics(
  bind_cols(predict(players_final_fit, players_test), players_test),
  truth = played_hours,
  estimate = .pred
) |>
  filter(.metric == "rmse")

players_summary

In [None]:
# Prediction grid ----
# Every age from the youngest to the oldest observed player (in steps of 1,
# ignoring missing ages) combined with each of the five experience codes.
players_prediction_grid <- expand_grid(
  Age = seq(
    min(tidy_players$Age, na.rm = TRUE),
    max(tidy_players$Age, na.rm = TRUE),
    by = 1
  ),
  experience_numeric = seq_len(5)
)

# Predicted hours for each grid point, with the grid columns attached
players_pred <- bind_cols(
  predict(players_final_fit, players_prediction_grid),
  players_prediction_grid
)

players_plot_final <- players_p