In [30]:
import pandas as pd
import altair as alt
import numpy as np
from sklearn import set_config
from sklearn.compose import make_column_transformer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
alt.data_transformers.enable('vegafusion')
set_config(transform_output="pandas")

In [25]:
url = 'https://drive.google.com/uc?export=download&id=1Mw9vW0hjTJwRWx0bDXiSpYsO3gKogaPz'

players = pd.read_csv(url)
players = players.drop(columns = ['individualId', 'organizationName', 'hashedEmail', 'subscribe', 'name']) #drop unnecessary columns
players

Unnamed: 0,experience,played_hours,gender,age
0,Pro,30.3,Male,9
1,Veteran,3.8,Male,17
2,Veteran,0.0,Male,17
3,Amateur,0.7,Female,21
4,Regular,0.1,Male,21
...,...,...,...,...
191,Amateur,0.0,Female,17
192,Veteran,0.3,Male,22
193,Amateur,0.0,Prefer not to say,17
194,Amateur,2.3,Male,17


In [26]:
mean_hours_played = players['played_hours'].mean()
mean_hours_played

np.float64(5.845918367346939)

In [27]:
mean_hours_hist = alt.Chart(players).mark_bar().encode(
    x=alt.X('played_hours').title('Hours Played').bin(maxbins=30),
    y=alt.Y('count()')
)

mean_hours_hist

Mean hours played is 5.84 but you can see most players play less than 10 hours
Indicates that data is very right tailed: large data contributors likely are below the mean anyway due to outliers significantly dragging up the mean.
Therefore I will choose significant data collection to be >3 hours played.  (can change if we wanna make it lower but I don't think we should make it any higher.)

In [145]:
np.random.seed(424242)
players = players.assign(high_contributor = players['played_hours'] >= 3)
players

Unnamed: 0,experience,played_hours,gender,age,high_contributor
0,Pro,30.3,Male,9,True
1,Veteran,3.8,Male,17,True
2,Veteran,0.0,Male,17,False
3,Amateur,0.7,Female,21,False
4,Regular,0.1,Male,21,False
...,...,...,...,...,...
191,Amateur,0.0,Female,17,False
192,Veteran,0.3,Male,22,False
193,Amateur,0.0,Prefer not to say,17,False
194,Amateur,2.3,Male,17,False


In [146]:


train, test = train_test_split(players, test_size = 0.3)

X = train[['age']]
y = train['high_contributor']

X_test = test[['age']]
y_test = test['high_contributor']





In [148]:

#cross validation

param_grid = {
    'n_neighbors': range(2, 15, 1)
}

pipeline = KNeighborsClassifier()

knn_tune_grid = GridSearchCV(
    estimator=pipeline, param_grid=param_grid, cv=5, return_train_score=True, n_jobs=-1)

model_grid = knn_tune_grid.fit(X, y)
accuracies_grid = pd.DataFrame(model_grid.cv_results_)

cross_val_plot = alt.Chart(accuracies_grid).mark_line(point=True).encode(
    x=alt.X('param_n_neighbors').title('KNN Number of Neighbours').scale(zero=False),
    y=alt.Y('mean_test_score').title('Accuracy Estimate').scale(zero=False)
)

cross_val_plot




In [163]:
# make classifier
spec = KNeighborsClassifier(n_neighbors=4)

preprocessor = make_column_transformer((StandardScaler(), ['age']))

pipe = make_pipeline(preprocessor, spec)
                    
fit = pipe.fit(X, y)



In [164]:
# predict high contributor 

contribution_df = test.assign(
    predicted = pipe.predict(X_test)
)

contribution_df

Unnamed: 0,experience,played_hours,gender,age,high_contributor,predicted
177,Veteran,2.7,Non-binary,21,False,False
9,Veteran,0.0,Female,22,False,False
150,Amateur,0.0,Male,17,False,False
35,Veteran,0.4,Male,20,False,False
119,Beginner,0.0,Male,23,False,False
117,Amateur,0.0,Prefer not to say,17,False,False
173,Regular,0.0,Male,50,False,False
126,Beginner,0.7,Female,24,False,False
115,Regular,0.1,Male,24,False,False
2,Veteran,0.0,Male,17,False,False


In [165]:
# crosstab

pd.crosstab(
    contribution_df['high_contributor'],
    contribution_df['predicted']
)


predicted,False
high_contributor,Unnamed: 1_level_1
False,54
True,5
