In [1]:
import altair as alt
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.model_selection import GridSearchCV, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import mean_squared_error
from sklearn.compose import make_column_transformer

In [2]:
url='https://drive.google.com/uc?export=download&id=1Mw9vW0hjTJwRWx0bDXiSpYsO3gKogaPz'
player_data=pd.read_csv(url)
player_data

Unnamed: 0,experience,subscribe,hashedEmail,played_hours,name,gender,age,individualId,organizationName
0,Pro,True,f6daba428a5e19a3d47574858c13550499be23603422e6...,30.3,Morgan,Male,9,,
1,Veteran,True,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa9397...,3.8,Christian,Male,17,,
2,Veteran,False,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3...,0.0,Blake,Male,17,,
3,Amateur,True,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4f...,0.7,Flora,Female,21,,
4,Regular,True,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb...,0.1,Kylie,Male,21,,
...,...,...,...,...,...,...,...,...,...
191,Amateur,True,b6e9e593b9ec51c5e335457341c324c34a2239531e1890...,0.0,Bailey,Female,17,,
192,Veteran,False,71453e425f07d10da4fa2b349c83e73ccdf0fb3312f778...,0.3,Pascal,Male,22,,
193,Amateur,False,d572f391d452b76ea2d7e5e53a3d38bfd7499c7399db29...,0.0,Dylan,Prefer not to say,17,,
194,Amateur,False,f19e136ddde68f365afc860c725ccff54307dedd13968e...,2.3,Harlow,Male,17,,


In [3]:
player_relevant_data=player_data.drop(columns=['hashedEmail', 'individualId', 'organizationName', 'name'])
player_relevant_data

Unnamed: 0,experience,subscribe,played_hours,gender,age
0,Pro,True,30.3,Male,9
1,Veteran,True,3.8,Male,17
2,Veteran,False,0.0,Male,17
3,Amateur,True,0.7,Female,21
4,Regular,True,0.1,Male,21
...,...,...,...,...,...
191,Amateur,True,0.0,Female,17
192,Veteran,False,0.3,Male,22
193,Amateur,False,0.0,Prefer not to say,17
194,Amateur,False,2.3,Male,17


In [4]:
player_relevant_data['subscribe']=player_relevant_data['subscribe'].map({True: 'Subscribed', False: 'Not Subscribed', 'Subscribed':'Subscribed',
                                                 'Not Subscribed': 'Not Subscribed'})
#                                                     ^allows for the cell to be run multiple times without NaNs being created
player_relevant_data['gender']=player_relevant_data['gender'].map({'Male': 'Male','Prefer not to say': 'Other', 'Non-binary': 'Other', 'Agender': 'Other',
                                                                    'Two-Spirited': 'Other', 'Other': 'Other', 'Female':'Female'})

player_training, player_testing = train_test_split(player_relevant_data,test_size=0.25,random_state=2024)
X_train = player_training[['experience','subscribe','gender','age']]
y_train = player_training['played_hours']

X_test = player_testing[['experience','subscribe','gender','age']]
y_test = player_testing['played_hours']

In [5]:
experience_levels = ['Beginner','Amateur','Regular', 'Veteran', 'Pro']
exp_chart=alt.Chart(player_training, title='Playing Time vs Experience').mark_point(opacity=0.4).encode(
    x=alt.X('experience', title='Level of Game Experience', sort=experience_levels),
    y=alt.Y('played_hours', title='Playing Time (hrs)')).properties(
    width=200,
    height=300)
exp_chart

In [6]:

sub_chart=alt.Chart(player_training, title='Playing Time vs Subscription Status').mark_point(opacity=0.4).encode(
    x=alt.X('subscribe', title='Subscription Status'),
    y=alt.Y('played_hours', title='Playing Time (hrs)')).properties(
    width=200,
    height=300)
sub_chart

In [7]:
# played hours vs gender 

gender_sort=['Male', 'Other', 'Female']
gender_chart=alt.Chart(player_training, title='Playing Time vs Gender').mark_point(opacity=0.3).encode(
    x=alt.X('gender', title='Gender', sort=gender_sort),
    y=alt.Y('played_hours', title='Playing Time (hrs)')).properties(
    width=200,
    height=300)
gender_chart

In [8]:
age_chart=alt.Chart(player_training).mark_point(opacity=0.3).encode(
    x=alt.X('age', title='Age (yrs)'),
    y=alt.Y('played_hours', title='Playing Time (hrs)'))
age_chart

In [9]:
player_preprocessor= make_column_transformer((StandardScaler(), ["age"]),
                                             (OneHotEncoder(sparse_output=False),["gender", "subscribe", "experience"]),
                                             (OrdinalEncoder(categories=[["Beginner", "Amateur", "Regular", "Veteran", "Pro"]]), ["experience"]),
                                             verbose_feature_names_out=False, remainder="passthrough")
player_pipe=make_pipeline(player_preprocessor, KNeighborsRegressor()) 

param_grid={"kneighborsregressor__n_neighbors": range(1, 10)}

player_gridsearch = GridSearchCV( estimator=player_pipe, param_grid=param_grid, cv=3, scoring="neg_root_mean_squared_error", n_jobs=-1)

player_results=pd.DataFrame(player_gridsearch.fit(X_train, y_train).cv_results_) 

player_best_K = player_gridsearch.best_params_
player_best_RMSPE = -player_gridsearch.best_score_ 
player_best_K

{'kneighborsregressor__n_neighbors': 9}

In [10]:
player_best_RMSPE

np.float64(28.892714051729353)

In [11]:
player_results=player_results.assign(RMSPE= -player_results["mean_test_score"])

In [12]:
#Best K graph
Optimal_K_Chart=alt.Chart(player_results).mark_line().encode(
    x=alt.X('param_kneighborsregressor__n_neighbors', title='K Value'),
    y=alt.Y('RMSPE', title='Root Mean Squared Prediction Error'))
Optimal_K_Chart

In [13]:
#I would argue that either K=2 or K=4 should be used as after K=4 is completely flat so there would be no benefit using a higher one,
#but because this dataset has a heavy risk of underfitting since there is only a small amount of large values, I would probably say K=2 would be better
#Maybe see which is better if you have time

In [14]:
# FINAL MODEL TESTING USING K = 4

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

X = player_relevant_data[['experience', 'subscribe', 'gender', 'age']]
y = player_relevant_data['played_hours']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

try:
    player_preprocessor
except NameError:
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from sklearn.compose import make_column_transformer
    player_preprocessor = make_column_transformer(
        (OneHotEncoder(handle_unknown='ignore'),
         ['experience', 'subscribe', 'gender']),
        remainder='passthrough'
    )

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

final_knn_4 = make_pipeline(
    player_preprocessor,
    StandardScaler(with_mean=False),
    KNeighborsRegressor(n_neighbors=4)
)

final_knn_4.fit(X_train, y_train)

y_pred_4 = final_knn_4.predict(X_test)
rmse_4 = np.sqrt(mean_squared_error(y_test, y_pred_4))

rmse_4

np.float64(40.80678616204662)

# Methods


1) Loading the data

We started by importing the necessary Python libraries such as pandas, altair, sklearn, and numpy for data manipulation, modeling, and visualization. The dataset was loaded into a DataFrame by assigning the URL of our data to "url" and using pd.read_csv(). 

2) Cleaning and preprocessing data

We removed unnecessary or irrelevant columns such as hashedEmail, individualID, organizationName, and name, as none of them were going to be used in our modelling. The subscribe column was recoded into 2 categories: those who were "Subscribed" and "Not Subscribed". This step ensured our models and visualizations were using the categorical variable in a meaningful way. The gender column was also grouped into 3 categories for simplicity: Male, Female, and Other (consisting of non-binary, Agender, Two-spirited, etc.) to simplify analysis.

3) EDA

We examined the distribution of total playtime hours using a histogram, which revealed a highly skewed distribution with most players logging very few hours and a small group accumulating very high hours. We explored the relationships between played hours and potential predictors using scatterplots as visualizations. Here ,we investigated "Experience Level" with playtimes across Beginner, Amateur, Regular, Veteran, and Pro levels. "Subscription status" was plotted to see if being subscribed or unsubscribed correlated with total hours played. We also looked at Gender and age to see if either correlated with hours played. These visualizations helped our select a model that had the strongest relationships with playtime. 

4) Preparing data for our modeling

We split the dataset into a training set (80%) and testing set (20%) using train_test_split(), ensuring our reproducibility with present by using a fixed random seed (123). The features for our model included: experience, subscription status, gender, and age, with played hours set as our target variable. Categorical variables such as experience, subscribe, and gender were "one-hot" coded to convert them into a format suitable for KNN regression. All features were standardized using StandardScaler to make distances in KNN meaningful and fair amongst all variables.

5) Modelling with KNN Regression

A KNN regressor was used with K = 4. The model was trained our our training set and used to predict played hours on our test set. Model performance was evaluated with root mean squared error (RMSE).

6) Visualizing analysis results

We created a scatterplot of predicted vs actual playtime to assess model performance visually. This plot highlighted where predictions match actual values and where our model underestimates/overestimates playtime, especially for our high-hour individuals. 

