In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%load_ext autoreload
%autoreload 2

from sklearn import metrics

In [2]:
df = pd.read_csv("./CrowdstormingDataJuly1st.csv")

In [3]:
from helpers import clean_data, group_data, prep_ML, normalize
dfc = clean_data(df)
dfg = group_data(dfc)
X_p,y_possible = prep_ML(dfg)
X = normalize(X_p, None)
# We use our best version of the previous part.
label_true = ((y_possible['rater1'] + y_possible['rater2']) / 2 <= 0.5).values



First let's try the k-means algorithm with 2 clusters and print out the silhouette score:

In [4]:
from sklearn.cluster import KMeans

FYI: n_jobs = 1 because the parallel version of k-means doesn't work on OSX.

In [5]:
km = KMeans(n_clusters=2, max_iter=600, init="k-means++", n_jobs=1 )

In [6]:
km.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=600,
    n_clusters=2, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [7]:
labels = km.labels_
s1 = metrics.silhouette_score(X, labels, metric='euclidean')
print("s1 =", s1)

s1 = 0.253540740782


In [8]:
def test_km(X):
    km = KMeans(n_clusters=2, max_iter=600, init="k-means++", n_jobs=1 )
    km.fit(X)
    labels = km.labels_
    return metrics.silhouette_score(X, labels, metric='euclidean')

We know that the most valuable feature is the seIAT therefore we will remove it (just for fun)

In [9]:
from helpers import compute_feature_importance_rfc
from sklearn.ensemble import RandomForestClassifier as RFC

# From previous exercice

prop1 = np.sum(label_true) / len(label_true)
prop0 = 1 - prop1
class_weights = {
    0 : prop0,
    1 : prop1
}

best_results = {'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 25}

rfc = RFC(max_depth=best_results["max_depth"], max_features=best_results["max_features"], n_estimators=best_results["n_estimators"], n_jobs=-1, class_weight=class_weights)
compute_feature_importance_rfc(rfc, X, label_true)

[('seIAT', 0.1192076761662911),
 ('meanExp', 0.11535965082301609),
 ('seExp', 0.11150842886559237),
 ('meanIAT', 0.089507460738880468),
 ('games', 0.060907841239402016),
 ('yellowCards', 0.060738748097005794),
 ('birthday', 0.053452685400786543),
 ('victories', 0.052951434426624096),
 ('ties', 0.051097570820641744),
 ('goals', 0.043814371663238164),
 ('defeats', 0.037644758289615075),
 ('position', 0.035497059910240741),
 ('club', 0.032691467561180215),
 ('weight', 0.031075560051553593),
 ('redCards', 0.030116507195549827),
 ('height', 0.029678838337128463),
 ('leagueCountry', 0.029279799091059617),
 ('yellowReds', 0.015470141322194092)]

In [10]:
X2 = X.drop("seIAT", axis=1)
s2 = test_km(X2)
print("s2 =", s2)
print("s2 - s1 =", s2 - s1)

s2 = 0.264064546624
s2 - s1 = 0.0105238058419


We have a better classification... that's fun !

Ok now let's remove a second feature, we will follow the same intuition as before and remove the second best feature

In [11]:
# we know from before that the mean_Exp and goal information is at position 8, and 7 respectively in the X array
X3 = X.drop(["meanExp", "seIAT"], axis=1)
s3 = test_km(X3)
print("s3 =", s3)
print("s3 - s1 =", s3 - s1)
print("s3 - s2 =", s3 - s2)

s3 = 0.266082409217
s3 - s1 = 0.0125416684348
s3 - s2 = 0.00201786259294


There is an even better improvement

In [12]:
# we now remove seExp in addition to the other 2 (3rd best feature)
X4 = X.drop(["meanExp", "seExp", "seIAT", "meanIAT"], axis=1)
s4 = test_km(X4)
print("s4 =", s4)
print("s4 - s1 =", s4 - s1)
print("s4 - s2 =", s4 - s2)
print("s4 - s3 =", s4 - s3)

s4 = 0.290107346202
s4 - s1 = 0.0365666054193
s4 - s2 = 0.0260427995774
s4 - s3 = 0.0240249369845


To see whether the clustering is close to a dark/light separation we will compute in addition to the silhouette the adjusted mutual info score 

In [13]:
km = KMeans(n_clusters=2, max_iter=300, init="k-means++", n_jobs=1 )

def scoring_complete(X):
    km.fit(X)
    labels = km.labels_
    print("silhouette score :", metrics.silhouette_score(X, labels, metric='euclidean'))
    print("closeness to true label score :", metrics.adjusted_mutual_info_score(label_true, labels))

scoring_complete(X)

silhouette score : 0.253540740782
closeness to true label score : -0.000485621163783


In [14]:
scoring_complete(X2)

silhouette score : 0.264064546624
closeness to true label score : -0.000484796987391


Here we see that eventhough the silhouette score is better our label accuracy is the same as before.

Other examples :

In [15]:
scoring_complete(X3)

silhouette score : 0.266082409217
closeness to true label score : -0.000482821657274


In [16]:
scoring_complete(X4)

silhouette score : 0.289771659077
closeness to true label score : -0.000186755446396


Now let's remove the worst features for example the red / yellow / redYellow / cards (not all are the absolute worsts but they all were in the < 0.05 importance in the previous exercice.)

In [17]:
X5 =  X.drop(["yellowReds", "redCards", "yellowCards"], axis=1)
scoring_complete(X5)

silhouette score : 0.283747370182
closeness to true label score : 0.00499147512141


We have good results ! The silhouette is better, and closeness is much better

Let's now remove all the features that have the worst feature importance until our closeness score drops significantly

In [18]:
compute_feature_importance_rfc(rfc, X5, label_true)

[('seIAT', 0.13849534362108062),
 ('meanExp', 0.10835464871535683),
 ('seExp', 0.10555781825842103),
 ('meanIAT', 0.094494815415764627),
 ('victories', 0.077655344122097014),
 ('birthday', 0.06951269778038037),
 ('games', 0.062332242521145602),
 ('ties', 0.056592918598599544),
 ('goals', 0.051770081133035405),
 ('defeats', 0.045579878128750029),
 ('height', 0.042976406135540407),
 ('club', 0.041034433406836525),
 ('weight', 0.040390915704725652),
 ('leagueCountry', 0.035672655901709094),
 ('position', 0.029579800556557245)]

In [19]:
X6 =  X5.drop(["position", "leagueCountry", "height"], axis=1)
scoring_complete(X6)

silhouette score : 0.347447009627
closeness to true label score : 0.00424424095901


In [20]:
compute_feature_importance_rfc(rfc,X6, label_true)

[('seIAT', 0.16139115280717553),
 ('meanExp', 0.1294132177483733),
 ('meanIAT', 0.11383730697347309),
 ('seExp', 0.11223431480452559),
 ('victories', 0.089413755620049495),
 ('club', 0.070465718296803648),
 ('games', 0.066548709948347495),
 ('birthday', 0.064472033427266207),
 ('ties', 0.05917840847567693),
 ('goals', 0.052610381942177653),
 ('defeats', 0.043455220491093786),
 ('weight', 0.03697977946503727)]

In [21]:
X7 =  X6.drop(["defeats"], axis=1)
scoring_complete(X7)

silhouette score : 0.565611585424
closeness to true label score : 0.0817211263061


In [22]:
compute_feature_importance_rfc(rfc, X7, label_true)

[('seIAT', 0.13976230977649784),
 ('seExp', 0.13517318650270735),
 ('meanIAT', 0.13327805837383366),
 ('meanExp', 0.12941877459761422),
 ('games', 0.07913355014180215),
 ('victories', 0.074340973526954859),
 ('birthday', 0.068952605186483884),
 ('club', 0.066543058871073882),
 ('goals', 0.064623112132487795),
 ('ties', 0.05763911475542579),
 ('weight', 0.051135256135118511)]

In [23]:
X8 =  X7.drop(["weight"], axis=1)
scoring_complete(X8)

silhouette score : 0.553357594614
closeness to true label score : 0.090802477


In [24]:
compute_feature_importance_rfc(rfc, X8, label_true)

[('seIAT', 0.15653694319775782),
 ('meanExp', 0.15291797030087717),
 ('seExp', 0.13646924598897103),
 ('meanIAT', 0.12573211742991902),
 ('victories', 0.098342254953741812),
 ('birthday', 0.078376670022084524),
 ('games', 0.078069292179089769),
 ('club', 0.061049873548434608),
 ('ties', 0.056684304889594778),
 ('goals', 0.055821327489529518)]

In [25]:
X9 =  X8.drop(["ties"], axis=1)
scoring_complete(X9)

silhouette score : 0.576040825545
closeness to true label score : 0.0930262765814


In [26]:
compute_feature_importance_rfc(rfc, X9, label_true)

[('seIAT', 0.17772056636542932),
 ('meanIAT', 0.14308682374825898),
 ('seExp', 0.13391029921145123),
 ('meanExp', 0.12824459866513904),
 ('games', 0.094228699692052068),
 ('goals', 0.086349794830141741),
 ('birthday', 0.085176128243323421),
 ('club', 0.081872932616398622),
 ('victories', 0.06941015662780553)]

In [27]:
X10 =  X9.drop(["goals"], axis=1)
scoring_complete(X10)

silhouette score : 0.635972691467
closeness to true label score : 0.0905806182631


In [28]:
compute_feature_importance_rfc(rfc, X10, label_true)

[('seIAT', 0.17241641142613112),
 ('meanIAT', 0.15236054729126522),
 ('meanExp', 0.14215875796079641),
 ('seExp', 0.13891188173080843),
 ('victories', 0.1283310002888173),
 ('games', 0.10729793126070607),
 ('club', 0.081398434656610105),
 ('birthday', 0.077125035384865315)]

In [29]:
X11 =  X10.drop(["club"], axis=1)
scoring_complete(X11)

silhouette score : 0.682393276867
closeness to true label score : 0.0905806182631


In [30]:
compute_feature_importance_rfc(rfc, X11, label_true)

[('seIAT', 0.18571776836661069),
 ('meanExp', 0.18107576657489094),
 ('meanIAT', 0.16215291360509357),
 ('seExp', 0.14941803548140886),
 ('victories', 0.11935634725197533),
 ('games', 0.1065985292509703),
 ('birthday', 0.095680639469050255)]

In [31]:
X12 =  X11.drop(["birthday"], axis=1)
scoring_complete(X12)

silhouette score : 0.687036611032
closeness to true label score : 0.0905806182631


In [32]:
compute_feature_importance_rfc(rfc, X12, label_true)

[('meanExp', 0.20983983089746389),
 ('seExp', 0.18324839774095397),
 ('seIAT', 0.17447977591259306),
 ('meanIAT', 0.15285153992997086),
 ('games', 0.14759487436343),
 ('victories', 0.13198558115558826)]

In [33]:
X13 =  X12.drop(["victories"], axis=1)
scoring_complete(X13)

silhouette score : 0.730033265353
closeness to true label score : 0.0966882724244


Removing more gives worse results or equal results.