In [1]:
import pandas as pd                                     
import numpy as np                                   
import os                                               
import matplotlib.pyplot as plt                         
import scipy.stats.mstats as ssm                        
from scipy.stats import gaussian_kde as kde
import random

%matplotlib inline

Our first task is to transform the dataset into something meaningfully that we can use in our classifier. To do that we are going to aggregate the data based on the player and we are going to do that in the following way:
    1. Columns: playerShort, club, leagueCountry, birthday, height, weight and, position just get copied
    2. We drop column: player since the playerShort column is unique
    3. Columns: games, victories, ties, defeats, goals, yellowCards, yellowReds and, redCards get summed up
    4. We drop the photoID column
    5. Then we average the rating of the skin colour
    6. We then drop columns: refNum, refCountry and, Alpha_3
    7. We also drop nIAT, nExp, then average meanIAT, meanExp and calculate new seIAT, seExp based on the variance of the values we used in the averaging of meanIAT and meanExp.




In [2]:
df = pd.read_csv('CrowdstormingDataJuly1st.csv')

Let's check that the data was loaded 

In [3]:
df.head(5)

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,...,rater2,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp
0,lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,31.08.1983,177.0,72.0,Attacking Midfielder,1,0,...,0.5,1,1,GRC,0.326391,712.0,0.000564,0.396,750.0,0.002696
1,john-utaka,John Utaka,Montpellier HSC,France,08.01.1982,179.0,82.0,Right Winger,1,0,...,0.75,2,2,ZMB,0.203375,40.0,0.010875,-0.204082,49.0,0.061504
2,abdon-prats,Abdón Prats,RCD Mallorca,Spain,17.12.1992,181.0,79.0,,1,0,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
3,pablo-mari,Pablo Marí,RCD Mallorca,Spain,31.08.1993,191.0,87.0,Center Back,1,1,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
4,ruben-pena,Rubén Peña,Real Valladolid,Spain,18.07.1991,172.0,70.0,Right Midfielder,1,1,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002


Ok now, let's make the playerShort the index, also let's average the skintone and drop all players that have the skintone as NaN

In [4]:
df.set_index(df.columns[0], inplace=True)
df['skintone']=(df['rater1']+df['rater2'])/2
df.dropna(subset=['skintone'], inplace=True)
players=pd.unique(df.index.values.ravel())

Let's check the number of players left

In [5]:
players.size

1585

In [36]:
data = pd.DataFrame()
labels = pd.DataFrame()

In [37]:
k = 0
for player in players:
    #Copy same columns
    data.loc[k, 'playerShort'] = player
    data.loc[k, 'club'] =  np.array(df.loc[player, "club"]).flatten()[0]
    data.loc[k, 'leagueCountry'] = np.array(df.loc[player, "leagueCountry"]).flatten()[0]
    data.loc[k, 'height'] = np.array(df.loc[player, "height"]).flatten()[0]
    data.loc[k, 'weight'] = np.array(df.loc[player, "weight"]).flatten()[0]
    data.loc[k, 'position'] = np.array(df.loc[player, "position"]).flatten()[0]
    #Sum up columns 
    data.loc[k, 'games'] =  np.array(df.loc[player, "games"]).sum()
    data.loc[k, 'victories'] =  np.array(df.loc[player, "victories"]).sum()
    data.loc[k, 'ties'] =  np.array(df.loc[player, "ties"]).sum()
    data.loc[k, 'defeats'] =  np.array(df.loc[player, "defeats"]).sum()
    data.loc[k, 'goals'] =  np.array(df.loc[player, "goals"]).sum()
    data.loc[k, 'yellowCards'] =  np.array(df.loc[player, "yellowCards"]).sum()
    data.loc[k, 'yellowReds'] =  np.array(df.loc[player, "yellowReds"]).sum()
    data.loc[k, 'redCards'] =  np.array(df.loc[player, "redCards"]).sum()
    #Averge 2 colums
    data.loc[k, 'meanIAT'] =  np.ma.average(np.ma.array(df.loc[player, "meanIAT"], mask=np.isnan(np.array(df.loc[player, "meanIAT"]))))
    data.loc[k, 'meanExp'] =  np.ma.std(np.ma.array(df.loc[player, "meanIAT"], mask=np.isnan(np.array(df.loc[player, "meanIAT"]))))
    data.loc[k, 'seIAT'] =  np.ma.average(np.ma.array(df.loc[player, "seIAT"], mask=np.isnan(np.array(df.loc[player, "seIAT"]))))
    data.loc[k, 'seExp'] =  np.ma.std(np.ma.array(df.loc[player, "seIAT"], mask=np.isnan(np.array(df.loc[player, "seIAT"]))))
    # get the class
    labels.loc[k, 'skintone'] =  np.array(df.loc[player, 'skintone']).flatten()[0]
    k += 1
    #print(k)
    

In [53]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le1 = preprocessing.LabelEncoder()
le2 = preprocessing.LabelEncoder()
le.fit(data['leagueCountry'])
le1.fit(data['club'])
le2.fit(data['position'].dropna())

LabelEncoder()

In [72]:
data['leagueCountry'] = le.transform(data['leagueCountry'])
data['club'] = le1.transform(data['club'])
data['position'] = data['position'].fillna(le2.inverse_transform(np.median(le2.transform(data['position'].dropna())).astype(int)))
data['position'] = le2.transform(data['position'])

In [74]:
data['height'] = data['height'].fillna(np.median(data['height'].dropna()))
data['weight'] =data['weight'].fillna(np.median(data['weight'].dropna()))

In [79]:
data.set_index(data.columns[0], inplace=True)

In [80]:
data

Unnamed: 0_level_0,club,leagueCountry,height,weight,position,games,victories,ties,defeats,goals,yellowCards,yellowReds,redCards,meanIAT,meanExp,seIAT,seExp
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
lucas-wilchez,70,3,177.0,72.0,0,144.0,65.0,32.0,47.0,10.0,21.0,1.0,2.0,0.356667,2.144522e-02,0.000857,1.224737e-03
john-utaka,51,1,179.0,82.0,11,431.0,186.0,102.0,143.0,88.0,33.0,0.0,2.0,0.326257,5.615282e-02,0.002318,6.582443e-03
aaron-hughes,34,0,182.0,71.0,1,654.0,247.0,179.0,228.0,9.0,19.0,0.0,0.0,0.346459,3.141599e-02,0.000652,3.250574e-03
aleksandar-kolarov,48,0,187.0,80.0,6,285.0,138.0,57.0,90.0,28.0,50.0,4.0,3.0,0.360355,2.882958e-02,0.000445,8.495246e-04
alexander-tettey,54,0,180.0,68.0,4,214.0,88.0,54.0,72.0,11.0,34.0,0.0,0.0,0.345591,3.061784e-02,0.000413,7.417314e-04
anders-lindegaard,49,0,193.0,80.0,5,100.0,50.0,20.0,30.0,0.0,1.0,0.0,0.0,0.343969,2.654639e-02,0.000320,7.060504e-04
andreas-beck,2,2,180.0,70.0,9,296.0,125.0,72.0,99.0,7.0,57.0,1.0,0.0,0.344388,2.208224e-02,0.000272,5.963274e-04
antonio-rukavina,69,3,177.0,74.0,9,249.0,95.0,70.0,84.0,3.0,37.0,1.0,0.0,0.352328,3.234136e-02,0.000656,3.595608e-03
ashkan-dejagah,34,0,181.0,74.0,8,321.0,131.0,71.0,119.0,56.0,55.0,1.0,4.0,0.344038,2.336233e-02,0.000316,1.148661e-03
benedikt-hoewedes,31,2,187.0,80.0,1,295.0,159.0,54.0,82.0,21.0,27.0,2.0,3.0,0.347212,2.382636e-02,0.000317,6.118591e-04


In [81]:
from sklearn.ensemble import RandomForestClassifier

In [102]:
clf = RandomForestClassifier(n_estimators=10)

In [90]:
clf = clf.fit(data, le3.transform(labels['skintone']))

In [117]:
labels[labels['skintone'] >= 0.5] = 1
labels[labels['skintone'] < 0.5] = 0

In [118]:
labels

Unnamed: 0,skintone
0,0.0
1,1.0
2,0.0
3,0.0
4,1.0
5,0.0
6,0.0
7,0.0
8,1.0
9,0.0


In [119]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, labels, random_state=1)

In [120]:
clf.fit(X_train, y_train['skintone'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [107]:
from sklearn import metrics
print(metrics.accuracy_score(y_test['skintone']), clf.predict(X_test)))

0.309823677582


In [112]:
y_test['skintone_pred'] = le3.inverse_transform(clf.predict(X_test))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [113]:
y_test

Unnamed: 0,skintone,skintone_pred
1318,0.125,0.250
654,0.125,0.000
572,0.000,0.125
1133,0.000,0.000
628,0.000,0.000
916,0.625,0.000
409,0.000,0.000
909,0.500,0.250
808,0.000,0.250
75,0.500,0.500
