In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt
import sklearn as sk

In [3]:
data = pd.read_csv('CrowdstormingDataJuly1st.csv')
data

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,...,rater2,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp
0,lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,31.08.1983,177,72,Attacking Midfielder,1,0,...,0.50,1,1,GRC,0.326391,712,0.000564,0.396000,750,0.002696
1,john-utaka,John Utaka,Montpellier HSC,France,08.01.1982,179,82,Right Winger,1,0,...,0.75,2,2,ZMB,0.203375,40,0.010875,-0.204082,49,0.061504
2,abdon-prats,Abdón Prats,RCD Mallorca,Spain,17.12.1992,181,79,,1,0,...,,3,3,ESP,0.369894,1785,0.000229,0.588297,1897,0.001002
3,pablo-mari,Pablo Marí,RCD Mallorca,Spain,31.08.1993,191,87,Center Back,1,1,...,,3,3,ESP,0.369894,1785,0.000229,0.588297,1897,0.001002
4,ruben-pena,Rubén Peña,Real Valladolid,Spain,18.07.1991,172,70,Right Midfielder,1,1,...,,3,3,ESP,0.369894,1785,0.000229,0.588297,1897,0.001002
5,aaron-hughes,Aaron Hughes,Fulham FC,England,08.11.1979,182,71,Center Back,1,0,...,0.00,4,4,LUX,0.325185,127,0.003297,0.538462,130,0.013752
6,aleksandar-kolarov,Aleksandar Kolarov,Manchester City,England,10.11.1985,187,80,Left Fullback,1,1,...,0.25,4,4,LUX,0.325185,127,0.003297,0.538462,130,0.013752
7,alexander-tettey,Alexander Tettey,Norwich City,England,04.04.1986,180,68,Defensive Midfielder,1,0,...,1.00,4,4,LUX,0.325185,127,0.003297,0.538462,130,0.013752
8,anders-lindegaard,Anders Lindegaard,Manchester United,England,13.04.1984,193,80,Goalkeeper,1,0,...,0.25,4,4,LUX,0.325185,127,0.003297,0.538462,130,0.013752
9,andreas-beck,Andreas Beck,1899 Hoffenheim,Germany,13.03.1987,180,70,Right Fullback,1,1,...,0.00,4,4,LUX,0.325185,127,0.003297,0.538462,130,0.013752


Disaggregating the data
======================

We now proceed to disaggregate the data as the guys from OSF did. The idea is to get a row per interaction (game) between a player and a referee instead of only one with a high games count.

In [None]:
refs = pd.unique(data['refNum'].values.ravel())

data['refCount'] = 0

for r in refs:
    data['refCount'][data['refNum'] == r] = len(data[data['refNum'] == r]) # Adding a new colum couting the number of occurence of this ref to each dyad 
    
j = 0 # J will be the index in the resulting dataframe
result = [0 for _ in range(sum(data['games']))]

for _, row in data.iterrows():
    g = row['games']
    reds = row['redCards']
    yell = row['yellowReds']
    
    for _ in range(g): # Now we create one row per game
        row['reds'] = 1 if (reds - _) > 0 else 0
        row['yellow'] = 1 if (yell - _) > 0 else 0
        rowcopy = list(row)
        result[j] = rowcopy
        j += 1
        
        if j % 10 == 0:
            print("Done " + j + " rows!")
    
pd.DataFrame(result, colnames = data.colnames).to_csv('disagg.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Cleaning up
===========
Now that w'eve disaggregated the data as one row per meeting between a referee and a player, we proceed to fix the problem spotted by the guys at osf.io, namely the fact that the players' entire career are included in the dataset. We proceed to count the number of relation between each referee and any player and remove it if it is less than 22 since that means that refered a game outside the scope of our data.

In [5]:
# In retrospect I don't really know why I did the disaggregation because we don't really about referees at all ....
df = pd.read_csv('disagg.csv')
allRefs = df['refNum'].value_counts()
validRefs = allRefs[allRefs > 21] # Only keep referee which appear more than 21 times

goodRefs = df[df['refNum'].isin(validRefs)]
goodRefs.sample(5)

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,...,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp,refCount,reds,yellow
74958,cesc-fabregas,Cesc Fàbregas,FC Barcelona,Spain,04.05.1987,179,75,Center Midfielder,6,6,...,ESP,0.369894,1785,0.000229,0.588297,1897,0.001002,556,0,0
232664,stephen-warnock,Stephen Warnock,Bolton Wanderers,England,12.12.1981,183,77,Left Fullback,10,3,...,ENGL,0.32669,44791,1e-05,0.356446,46916,3.7e-05,599,0,0
300265,pepe-reina,Pepe Reina,Liverpool FC,England,31.08.1982,188,92,Goalkeeper,13,9,...,ENGL,0.32669,44791,1e-05,0.356446,46916,3.7e-05,411,0,1
91976,stilian-petrov,Stilian Petrov,Aston Villa,England,05.07.1979,180,77,Center Midfielder,1,0,...,SCOT,0.32669,44791,1e-05,0.356446,46916,3.7e-05,12,0,0
52124,geoffrey-dernis,Geoffrey Dernis,Stade Brest,France,24.12.1980,171,66,Left Midfielder,2,1,...,FRA,0.334684,2882,0.000151,0.336101,3011,0.000586,440,0,0


In [20]:
# We now have to combine both ratings of skin color into to have only one response variable and simplify the process
# Really don't know why we care about disaggregation...
df = pd.read_csv('CrowdstormingDataJuly1st.csv')

total = len(df)

print("%d players known" % total)

df['skintone'] = (df['rater1'] + df['rater2']) / 2
del df['rater1']
del df['rater2']

# We then need to remove players for which at least one the skin rating is NaN since we can't train the classifier with those
cleaned = df.dropna(axis=0, subset=['skintone'])

print("Removed %d invalid players" % (total - len(cleaned)))

146028 players known
Removed 21407 invalid players


There might be other features which are NaN left in the set let's see what they are.

In [24]:
for c in cleaned.columns:
    print(c + ": " + str(len(cleaned[cleaned[c].isnull()])))

playerShort: 0
player: 0
club: 0
leagueCountry: 0
birthday: 0
height: 46
weight: 753
position: 8461
games: 0
victories: 0
ties: 0
defeats: 0
goals: 0
yellowCards: 0
yellowReds: 0
redCards: 0
photoID: 0
rater1: 0
rater2: 0
refNum: 0
refCountry: 0
Alpha_3: 1
meanIAT: 153
nIAT: 153
seIAT: 153
meanExp: 153
nExp: 153
seExp: 153
skintone: 0


We notice there are a lot of players with no position but hopefully a smart coach will not put a player at a certain position based on his skin color but rather based on his skill ;)

We also notice that the same set of players don't have the \*IAT and \*Exp variables set! This makes senses since those are collected per country. However considering the size of our dataset there are very few entries with no IAT and Exp so we can drop those without much concern!

In [31]:
cleaned = cleaned.dropna(axis=0, subset=['meanIAT'])
assert cleaned[cleaned['nExp'].isnull()].empty # This confirms that the *IAT and *Exp are NaN for the same set of players

For the weight and height features it makes sense to assume that the players have a rather similar build because of playing footbal professionnaly (essentially the same diet and exercise schedule). So we can approximate the NaN values with the mean weight and mean height respectively.

In [28]:
nweight_idx = cleaned[cleaned['weight'].isnull()].index
nheight_idx =cleaned[cleaned['height'].isnull()].index

print(len(nweight_idx))
print(len(nheight_idx))

753
46


In [34]:
mean_weight = cleaned['weight'].mean()
mean_height = cleaned['height'].mean()

for idx in nweight_idx:
    cleaned.set_value(idx, 'weight', mean_weight)
    
for idx in nheight_idx:
    cleaned.set_value(idx, 'height', mean_height)

We are now reasonably confident that we have a cleaned dataset and we proceed to the actual classification task.

Classification
============

Dropping useless features
---------------------------------------------------------

We now delete a few features that seems completely unrelated to a player's skin color

In [44]:
# Now dropping features that are useless for our classification

dropping = ['player', 'position', 'birthday', 'photoID', 'nExp', 'seExp', 'seIAT', 'refNum', 'nIAT', 'rater1', 'rater2']

for c in dropping:
    try:
        cleaned.drop(c, axis=1, inplace=True)
    except:
        continue
# Hopefully players play a certain role based on their skill ...
# The referee certainly has nothing to do with the player's skin color :X
# More black people are born on certain date ? :D
# The name of the photo, seriously? 
# The standard error and count for IAT and Exp are based on countries not on the players themselves so we can remove them as well

cleaned.columns

Index(['playerShort', 'club', 'leagueCountry', 'height', 'weight', 'games',
       'victories', 'ties', 'defeats', 'goals', 'yellowCards', 'yellowReds',
       'redCards', 'refCountry', 'Alpha_3', 'meanIAT', 'meanExp', 'skintone'],
      dtype='object')

Label encoding
----------------------
Since scikit-learn only supports numerical data we need to create a mapping between our textual data and a numerical represtation. We do this using the LabelEncoder from scikit-learn. See [here](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html)

In [61]:
from sklearn.preprocessing import LabelEncoder

features = cleaned.drop('skintone', axis=1) # The features df is the same but without the response variable

for f in features.columns:
    encoder = LabelEncoder()
    labels = encoder.fit_transform(features[f])
    features[f] = labels
    
labels = LabelEncoder().fit_transform(cleaned['skintone'])

features.refCountry.sample(5) # Verify that we have no text left 
features.leagueCountry.sample(5)

90996     2
120764    0
97490     0
95132     2
126111    0
Name: leagueCountry, dtype: int64

Random Forest Classifier training
--------------------------------------

In [62]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.4)

clf = RandomForestClassifier()

clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [63]:
clf.score(X_test, y_test)

0.99049971880774479