In [1]:
import pandas as pd
import altair as alt
import numpy as np
from sklearn import set_config
from sklearn.compose import make_column_transformer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_selector
alt.data_transformers.enable('vegafusion')
set_config(transform_output="pandas")
pd.set_option('display.max_colwidth', None)

np.random.seed(424242)

In [2]:
url = 'https://drive.google.com/uc?export=download&id=1Mw9vW0hjTJwRWx0bDXiSpYsO3gKogaPz'

players = pd.read_csv(url)
players = players.drop(columns = ['individualId', 'organizationName', 'hashedEmail', 'subscribe', 'name']) #drop unnecessary columns


In [3]:
mean_hours_played = players['played_hours'].mean()
mean_hours_played



np.float64(5.845918367346939)

In [4]:
mean_hours_hist = alt.Chart(players).mark_bar().encode(
    x=alt.X('played_hours').title('Hours Played').bin(maxbins=30),
    y=alt.Y('count()')
)

mean_hours_hist

Mean hours played is 5.84 but you can see most players play less than 10 hours
Indicates that data is very right tailed: large data contributors likely are below the mean anyway due to outliers significantly dragging up the mean.
Data is right tailed, therefore we will use median instead of mean

In [5]:
median_hours_played = players['played_hours'].median()
median_hours_played

0.1

In [6]:
players_quantitative = pd.get_dummies(players, columns=['gender', 'experience'])
players_quantitative

players_quantitative = players_quantitative.assign(high_contributor = players_quantitative['played_hours'] >= 0.1)
players_quantitative


Unnamed: 0,played_hours,age,gender_Agender,gender_Female,gender_Male,gender_Non-binary,gender_Other,gender_Prefer not to say,gender_Two-Spirited,experience_Amateur,experience_Beginner,experience_Pro,experience_Regular,experience_Veteran,high_contributor
0,30.3,9,False,False,True,False,False,False,False,False,False,True,False,False,True
1,3.8,17,False,False,True,False,False,False,False,False,False,False,False,True,True
2,0.0,17,False,False,True,False,False,False,False,False,False,False,False,True,False
3,0.7,21,False,True,False,False,False,False,False,True,False,False,False,False,True
4,0.1,21,False,False,True,False,False,False,False,False,False,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191,0.0,17,False,True,False,False,False,False,False,True,False,False,False,False,False
192,0.3,22,False,False,True,False,False,False,False,False,False,False,False,True,True
193,0.0,17,False,False,False,False,False,True,False,True,False,False,False,False,False
194,2.3,17,False,False,True,False,False,False,False,True,False,False,False,False,True


In [39]:

# create training set
train, test = train_test_split(players_quantitative, test_size = 0.25)










In [23]:

# setting up forward iteration to choose predictors (classification 2 textbook bit)

contr_subset = players_quantitative
names = list(contr_subset.drop(columns=['high_contributor', 'played_hours']).columns.values)
accuracy_dict = {"size": [], "selected_predictors": [], "accuracy": []}

# store the total number of predictors
n_total = len(names)

# start with an empty list of selected predictors
selected = []

# create the pipeline and CV grid search objects
param_grid = {
    "kneighborsclassifier__n_neighbors": range(1,61, 5),
}
contr_preprocessor = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include=[np.number, 'bool']))
)
contr_tune_pipe = make_pipeline(contr_preprocessor, KNeighborsClassifier())
contr_tune_grid = GridSearchCV(
    estimator=contr_tune_pipe,
    param_grid=param_grid,
    cv=10,
    n_jobs=-1
)




In [24]:

#forward iteration to determine how many predictors is best to use

# for every possible number of predictors
for i in range(1, n_total + 1):

    accs = np.zeros(len(names))
    # for every possible predictor to add
    for j in range(len(names)):
        # Add remaining predictor j to the model
            X = contr_subset[selected + [names[j]]]
            y = contr_subset["high_contributor"]

        # Find the best K for this set of predictors
            contr_tune_grid.fit(X, y)
            accuracies_grid = pd.DataFrame(contr_tune_grid.cv_results_)

        # Store the tuned accuracy for this set of predictors
            accs[j] = accuracies_grid["mean_test_score"].max()

    # get the best new set of predictors that maximize cv accuracy
    best_set = selected + [names[accs.argmax()]]

    # store the results for this round of forward selection
    accuracy_dict["size"].append(i)
    accuracy_dict["selected_predictors"].append(", ".join(best_set))
    accuracy_dict["accuracy"].append(accs.max())

    # update the selected & available sets of predictors
    selected = best_set
    del names[accs.argmax()]

accuracies = pd.DataFrame(accuracy_dict)
accuracies



Unnamed: 0,size,selected_predictors,accuracy
0,1,gender_Non-binary,0.581579
1,2,"gender_Non-binary, gender_Two-Spirited",0.591579
2,3,"gender_Non-binary, gender_Two-Spirited, gender_Agender",0.596842
3,4,"gender_Non-binary, gender_Two-Spirited, gender_Agender, gender_Other",0.596842
4,5,"gender_Non-binary, gender_Two-Spirited, gender_Agender, gender_Other, gender_Male",0.591579
5,6,"gender_Non-binary, gender_Two-Spirited, gender_Agender, gender_Other, gender_Male, age",0.611842
6,7,"gender_Non-binary, gender_Two-Spirited, gender_Agender, gender_Other, gender_Male, age, gender_Prefer not to say",0.611842
7,8,"gender_Non-binary, gender_Two-Spirited, gender_Agender, gender_Other, gender_Male, age, gender_Prefer not to say, experience_Pro",0.601842
8,9,"gender_Non-binary, gender_Two-Spirited, gender_Agender, gender_Other, gender_Male, age, gender_Prefer not to say, experience_Pro, experience_Amateur",0.595263
9,10,"gender_Non-binary, gender_Two-Spirited, gender_Agender, gender_Other, gender_Male, age, gender_Prefer not to say, experience_Pro, experience_Amateur, gender_Female",0.591053


In [28]:
#find top accuracy and select predictors

top_accuracy = accuracies.loc[[accuracies['accuracy'].idxmax()]]
top_accuracy

Unnamed: 0,size,selected_predictors,accuracy
5,6,"gender_Non-binary, gender_Two-Spirited, gender_Agender, gender_Other, gender_Male, age",0.611842


The highest accuracy is 0.611842 and the classifier is best at predicting high contribution from the gender information as well as age information. However, this does NOT tell us about whether they are likely high contributors: only that the classifier is more accurate at predicting whether or not one is a high contributor based on data. This means that these are the most effective PREDICTORS of high contribution.

In [29]:
accuracies_chart = alt.Chart(accuracies).mark_line(point=True).encode(
    x=alt.X('size').title('Number of Predictors'),
    y=alt.Y('accuracy').title('Estimated Accuracy')
)
accuracies_chart

In [87]:
# use selected predictors to use for training model
X_train = train[['gender_Non-binary' , 'gender_Two-Spirited', 'gender_Agender', 'gender_Other', 'gender_Male', 'age']]
y_train = train['high_contributor']

X_test = test[['gender_Non-binary' , 'gender_Two-Spirited', 'gender_Agender', 'gender_Other', 'gender_Male', 'age']]
y_test = test['high_contributor']



In [81]:

#cross validation to select k

param_grid_fr= {
    'n_neighbors': range(2, 15, 1)
}

pipeline = KNeighborsClassifier()

knn_tune_grid = GridSearchCV(
    estimator=pipeline, param_grid=param_grid_fr, cv=10, return_train_score=True, n_jobs=-1)

model_grid = knn_tune_grid.fit(X_train, y_train)
accuracies_grid_fr = pd.DataFrame(model_grid.cv_results_)

cross_val_plot = alt.Chart(accuracies_grid_fr).mark_line(point=True).encode(
    x=alt.X('param_n_neighbors').title('KNN Number of Neighbours').scale(zero=False),
    y=alt.Y('mean_test_score').title('Accuracy Estimate').scale(zero=False)
)

cross_val_plot




In [82]:
# make classifier
spec = KNeighborsClassifier(n_neighbors=3)

preprocessor = make_column_transformer((StandardScaler(), ['gender_Non-binary' , 'gender_Two-Spirited', 'gender_Agender', 'gender_Other', 'gender_Male', 'age']))

pipe = make_pipeline(preprocessor, spec)
pipe.fit(X_train, y_train)



In [90]:
# predict high contributor 

contribution_df = test.assign(
    predicted = pipe.predict(X_test)
)

contribution_df

Unnamed: 0,played_hours,age,gender_Agender,gender_Female,gender_Male,gender_Non-binary,gender_Other,gender_Prefer not to say,gender_Two-Spirited,experience_Amateur,experience_Beginner,experience_Pro,experience_Regular,experience_Veteran,high_contributor,predicted
126,0.7,24,False,True,False,False,False,False,False,False,True,False,False,False,True,False
27,0.0,23,False,False,True,False,False,False,False,False,False,False,False,True,False,True
123,7.1,17,False,False,True,False,False,False,False,False,True,False,False,False,True,True
64,0.1,23,False,False,True,False,False,False,False,True,False,False,False,False,True,True
172,0.0,20,True,False,False,False,False,False,False,False,False,False,False,True,False,True
54,0.0,42,False,True,False,False,False,False,False,False,True,False,False,False,False,True
1,3.8,17,False,False,True,False,False,False,False,False,False,False,False,True,True,True
121,0.1,24,False,False,True,False,False,False,False,False,True,False,False,False,True,True
68,0.1,15,False,False,True,False,False,False,False,False,False,False,True,False,True,True
86,0.0,23,False,False,False,False,False,False,True,True,False,False,False,False,False,False


In [92]:
# crosstab

pd.crosstab(
    contribution_df['high_contributor'],
    contribution_df['predicted']
)


predicted,False,True
high_contributor,Unnamed: 1_level_1,Unnamed: 2_level_1
False,4,16
True,6,23
