In [203]:
import pandas as pd                                     
import numpy as np                                   
import os                                               
import matplotlib.pyplot as plt                         
import scipy.stats.mstats as ssm                        
from scipy.stats import gaussian_kde as kde
import random

%matplotlib inline

Our first task is to transform the dataset into something meaningfully that we can use in our classifier. To do that we are going to aggregate the data based on the player and we are going to do that in the following way:
    1. Columns: playerShort, club, leagueCountry, birthday, height, weight and, position just get copied
    2. We drop column: player since the playerShort column is unique
    3. Columns: games, victories, ties, defeats, goals, yellowCards, yellowReds and, redCards get summed up
    4. We drop the photoID column
    5. Then we average the rating of the skin colour
    6. We then drop columns: refNum, refCountry and, Alpha_3
    7. We also drop nIAT, nExp, then average meanIAT, meanExp and calculate new seIAT, seExp based on the variance of the values we used in the averaging of meanIAT and meanExp.




In [204]:
df = pd.read_csv('CrowdstormingDataJuly1st.csv')

Let's check that the data was loaded 

In [205]:
df.head(5)

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,...,rater2,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp
0,lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,31.08.1983,177.0,72.0,Attacking Midfielder,1,0,...,0.5,1,1,GRC,0.326391,712.0,0.000564,0.396,750.0,0.002696
1,john-utaka,John Utaka,Montpellier HSC,France,08.01.1982,179.0,82.0,Right Winger,1,0,...,0.75,2,2,ZMB,0.203375,40.0,0.010875,-0.204082,49.0,0.061504
2,abdon-prats,Abdón Prats,RCD Mallorca,Spain,17.12.1992,181.0,79.0,,1,0,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
3,pablo-mari,Pablo Marí,RCD Mallorca,Spain,31.08.1993,191.0,87.0,Center Back,1,1,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
4,ruben-pena,Rubén Peña,Real Valladolid,Spain,18.07.1991,172.0,70.0,Right Midfielder,1,1,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002


Ok now, let's make the playerShort the index, also let's average the skintone and drop all players that have the skintone as NaN. Plus, we average the player skintone.

In [206]:
#df.set_index(df.columns[0], inplace=True)
df.dropna(subset=['rater1', 'rater2'], inplace=True)
df['skintone']=df[['rater1', 'rater2']].mean(axis=1)
df.head(10)

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,...,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp,skintone
0,lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,31.08.1983,177.0,72.0,Attacking Midfielder,1,0,...,1,1,GRC,0.326391,712.0,0.000564,0.396,750.0,0.002696,0.375
1,john-utaka,John Utaka,Montpellier HSC,France,08.01.1982,179.0,82.0,Right Winger,1,0,...,2,2,ZMB,0.203375,40.0,0.010875,-0.204082,49.0,0.061504,0.75
5,aaron-hughes,Aaron Hughes,Fulham FC,England,08.11.1979,182.0,71.0,Center Back,1,0,...,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0.125
6,aleksandar-kolarov,Aleksandar Kolarov,Manchester City,England,10.11.1985,187.0,80.0,Left Fullback,1,1,...,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0.125
7,alexander-tettey,Alexander Tettey,Norwich City,England,04.04.1986,180.0,68.0,Defensive Midfielder,1,0,...,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,1.0
8,anders-lindegaard,Anders Lindegaard,Manchester United,England,13.04.1984,193.0,80.0,Goalkeeper,1,0,...,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0.25
9,andreas-beck,Andreas Beck,1899 Hoffenheim,Germany,13.03.1987,180.0,70.0,Right Fullback,1,1,...,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0.0
10,antonio-rukavina,Antonio Rukavina,Real Valladolid,Spain,26.01.1984,177.0,74.0,Right Fullback,2,2,...,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0.0
11,ashkan-dejagah,Ashkan Dejagah,Fulham FC,England,05.07.1986,181.0,74.0,Left Winger,1,1,...,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0.5
12,benedikt-hoewedes,Benedikt Höwedes,FC Schalke 04,Germany,29.02.1988,187.0,80.0,Center Back,1,1,...,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0.0


The next step is to change the format of the data and keep only relevant fields.

In [207]:
df.drop(['player', 'rater1', 'rater2', 'photoID', 'birthday'], inplace=True, axis=1)
df.head()

Unnamed: 0,playerShort,club,leagueCountry,height,weight,position,games,victories,ties,defeats,...,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp,skintone
0,lucas-wilchez,Real Zaragoza,Spain,177.0,72.0,Attacking Midfielder,1,0,0,1,...,1,1,GRC,0.326391,712.0,0.000564,0.396,750.0,0.002696,0.375
1,john-utaka,Montpellier HSC,France,179.0,82.0,Right Winger,1,0,0,1,...,2,2,ZMB,0.203375,40.0,0.010875,-0.204082,49.0,0.061504,0.75
5,aaron-hughes,Fulham FC,England,182.0,71.0,Center Back,1,0,0,1,...,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0.125
6,aleksandar-kolarov,Manchester City,England,187.0,80.0,Left Fullback,1,1,0,0,...,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0.125
7,alexander-tettey,Norwich City,England,180.0,68.0,Defensive Midfielder,1,0,0,1,...,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,1.0


Now we will collapse these entries so that we have one row per player.

In [243]:
players_grouped = df.groupby(by=['playerShort'])

fdict = dict.fromkeys(['club', 'leagueCountry', 'height', 'weight', 'position', 'skintone'], np.max)
fdict.update(dict.fromkeys(['meanIAT', 'meanExp', 'seIAT', 'seExp'], np.mean))
fdict.update(dict.fromkeys(['games', 'victories', 'ties', 'defeats', 'goals', 'yellowCards', 'yellowReds', 'redCards'], np.sum))

data = players_grouped.agg(fdict)
data.reset_index(inplace=True, drop=True)
labels = data[['skintone']]
data.drop('skintone', inplace=True, axis=1)
data.head()

Unnamed: 0,ties,position,leagueCountry,victories,yellowReds,redCards,goals,meanIAT,seIAT,height,yellowCards,seExp,meanExp,weight,club,games,defeats
0,179,Center Back,England,247,0,0,9,0.346459,0.000652,182.0,19,0.004065,0.494575,71.0,Fulham FC,654,228
1,73,Attacking Midfielder,Germany,141,0,1,62,0.348818,0.000219,183.0,42,0.000993,0.44922,73.0,Werder Bremen,336,122
2,97,Right Midfielder,England,200,0,0,31,0.345893,0.000367,165.0,11,0.002032,0.491482,63.0,Tottenham Hotspur,412,115
3,42,Center Midfielder,England,150,0,1,39,0.346821,0.003334,178.0,31,0.013522,0.514693,76.0,Arsenal FC,260,68
4,40,Center Back,France,41,4,2,1,0.3316,0.001488,180.0,8,0.005296,0.335587,73.0,Montpellier HSC,124,43


Now we need to deal with the text features. Our model does not like text features so we have to encode them as numbers. There are two ways of doing this.
1. We relate each text value to a number
2. For each text value we add a new column and set it to 0 where that text value does not appear and 1 where it appers

In general it it suggested to use the second aproach, but in our case I will use the first aproach for 2 reasons: 
-There are a lot of clubs and that will add a lot of columns
-It will make it very dificult to inspect the feature_importances_

In [219]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le1 = preprocessing.LabelEncoder()
le2 = preprocessing.LabelEncoder()
le.fit(data['leagueCountry'])
le1.fit(data['club'])
le2.fit(data['position'].dropna())

LabelEncoder()

In [244]:
data['leagueCountry'] = le.transform(data['leagueCountry'])
data['club'] = le1.transform(data['club'])
data['position'] = data['position'].fillna(le2.inverse_transform(np.median(le2.transform(data['position'].dropna())).astype(int)))
data['position'] = le2.transform(data['position'])
data.head()

Unnamed: 0,ties,position,leagueCountry,victories,yellowReds,redCards,goals,meanIAT,seIAT,height,yellowCards,seExp,meanExp,weight,club,games,defeats
0,179,1,0,247,0,0,9,0.346459,0.000652,182.0,19,0.004065,0.494575,71.0,34,654,228
1,73,0,2,141,0,1,62,0.348818,0.000219,183.0,42,0.000993,0.44922,73.0,91,336,122
2,97,10,0,200,0,0,31,0.345893,0.000367,165.0,11,0.002032,0.491482,63.0,83,412,115
3,42,3,0,150,0,1,39,0.346821,0.003334,178.0,31,0.013522,0.514693,76.0,6,260,68
4,40,1,1,41,4,2,1,0.3316,0.001488,180.0,8,0.005296,0.335587,73.0,51,124,43


Our model also does not like NaN values so we will fill them with the median of the respetive column

In [221]:
data['height'] = data['height'].fillna(np.median(data['height'].dropna()))
data['weight'] = data['weight'].fillna(np.median(data['weight'].dropna()))

In [222]:
#data.set_index(data.columns[0], inplace=True)

In [245]:
labels = labels.apply(lambda x: np.round(x))

In [246]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
from sklearn import metrics
def checkCrossValidationAccuracy (data, labels):
    scores = []
    kf = KFold(len(labels), n_folds=20, shuffle=True, random_state=123)
    for train_index, test_index in kf:
        clf = RandomForestClassifier(n_estimators=15)
        X_train, X_test = data.loc[train_index], data.loc[test_index]
        y_train, y_test = labels.loc[train_index], labels.loc[test_index]
        clf.fit(X_train, y_train['skintone'])
        scores.append(metrics.accuracy_score(y_test['skintone'], clf.predict(X_test)))
        #print(metrics.accuracy_score(y_test['skintone'], clf.predict(X_test)))
        #print(metrics.accuracy_score(y_train['skintone'], clf.predict(X_train)))
    return np.average(scores) 
checkCrossValidationAccuracy(data,labels)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

Okay now let's try dropping some cols. Let's drop the club and the leagueCountry as those atributs should not affect the player

In [225]:
data1 = data.drop("leagueCountry", 1)
data1 = data1.drop("club", 1)

In [226]:
checkCrossValidationAccuracy(data1,labels)

0.84231803797468374

It looks like droping thoose features did not increase the accuracy but at the same time it simplified the model

In [227]:
data2 = data1.drop("games", 1)
data2 = data2.drop("victories", 1)
data2 = data2.drop("ties", 1)
data2 = data2.drop("defeats", 1)

In [228]:
data2.head()

Unnamed: 0,seIAT,height,position,yellowCards,yellowReds,seExp,meanExp,goals,weight,meanIAT,redCards
0,0.000652,182.0,1,19,0,0.004065,0.494575,9,71.0,0.346459,0
1,0.000219,183.0,0,42,0,0.000993,0.44922,62,73.0,0.348818,1
2,0.000367,165.0,10,11,0,0.002032,0.491482,31,63.0,0.345893,0
3,0.003334,178.0,3,31,0,0.013522,0.514693,39,76.0,0.346821,1
4,0.001488,180.0,1,8,4,0.005296,0.335587,1,73.0,0.3316,2


In [229]:
checkCrossValidationAccuracy(data2,labels)

0.84348892405063314

In [230]:
clf = RandomForestClassifier(n_estimators=15)
clf.fit(data2, labels['skintone'])
clf.feature_importances_

array([ 0.16328552,  0.06962188,  0.05848035,  0.09203162,  0.02728087,
        0.118001  ,  0.15373077,  0.08047306,  0.06655263,  0.14069798,
        0.02984432])

Let's try to join the yellowReds and redCards columns

In [231]:
data3 = data2
data3['redCards'] = data3['redCards'] + data3['yellowReds']
data3 = data3.drop("yellowReds", 1)

In [232]:
checkCrossValidationAccuracy(data3,labels)

0.85182753164556979

In [233]:
clf = RandomForestClassifier(n_estimators=15)
clf.fit(data3, labels['skintone'])
clf.feature_importances_

array([ 0.15086221,  0.0625765 ,  0.06450229,  0.09375426,  0.1250114 ,
        0.16043275,  0.08638213,  0.07389196,  0.14648112,  0.03610538])

Let's also try to remove the position 

In [234]:
data4 = data3.drop("position", 1)

In [235]:
checkCrossValidationAccuracy(data4,labels)

0.84976265822784813

We can also try to join the yellow cards and red cards in one column named cards

In [236]:
data5 = data4
data5['cards'] = data5['redCards'] + data5['yellowCards']
data5 = data5.drop('redCards', 1)
data5 = data5.drop('yellowCards', 1)
data5.head()

Unnamed: 0,seIAT,height,seExp,meanExp,goals,weight,meanIAT,cards
0,0.000652,182.0,0.004065,0.494575,9,71.0,0.346459,19
1,0.000219,183.0,0.000993,0.44922,62,73.0,0.348818,43
2,0.000367,165.0,0.002032,0.491482,31,63.0,0.345893,11
3,0.003334,178.0,0.013522,0.514693,39,76.0,0.346821,32
4,0.001488,180.0,0.005296,0.335587,1,73.0,0.3316,14


In [237]:
checkCrossValidationAccuracy(data5,labels)

0.83659018987341782

In [238]:
clf = RandomForestClassifier(n_estimators=15)
clf.fit(data5, labels['skintone'])
clf.feature_importances_

array([ 0.17015794,  0.07783543,  0.15361058,  0.158316  ,  0.08825488,
        0.08166358,  0.16323924,  0.10692235])

The final simplification that we can try is to eleminate all data that is not connected to the cards

In [239]:
data6 = data5.drop('height', 1)
data6 = data6.drop("weight", 1)

In [240]:
checkCrossValidationAccuracy(data6,labels) 

0.84424050632911385

In [165]:
clf = RandomForestClassifier(n_estimators=15)
clf.fit(data6, labels['skintone'])
clf.feature_importances_

array([ 0.82086033,  0.05031839,  0.03265829,  0.03287362,  0.01138505,
        0.03599505,  0.01590928])