In [1]:
import pandas as pd                                     
import numpy as np                                   
import os                                               
import matplotlib.pyplot as plt                         
import scipy.stats.mstats as ssm                        
from scipy.stats import gaussian_kde as kde
import random

%matplotlib inline

Our first task is to transform the dataset into something meaningfully that we can use in our classifier. To do that we are going to aggregate the data based on the player and we are going to do that in the following way:
    1. Columns: playerShort, club, leagueCountry, birthday, height, weight and, position just get copied
    2. We drop column: player since the playerShort column is unique
    3. Columns: games, victories, ties, defeats, goals, yellowCards, yellowReds and, redCards get summed up
    4. We drop the photoID column
    5. Then we average the rating of the skin colour
    6. We then drop columns: refNum, refCountry and, Alpha_3
    7. We also drop nIAT, nExp, then average meanIAT, meanExp and calculate new seIAT, seExp based on the variance of the values we used in the averaging of meanIAT and meanExp.




In [2]:
df = pd.read_csv('CrowdstormingDataJuly1st.csv')

Let's check that the data was loaded 

In [4]:
df.head(5)

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,...,rater2,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp
0,lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,31.08.1983,177.0,72.0,Attacking Midfielder,1,0,...,0.5,1,1,GRC,0.326391,712.0,0.000564,0.396,750.0,0.002696
1,john-utaka,John Utaka,Montpellier HSC,France,08.01.1982,179.0,82.0,Right Winger,1,0,...,0.75,2,2,ZMB,0.203375,40.0,0.010875,-0.204082,49.0,0.061504
2,abdon-prats,Abdón Prats,RCD Mallorca,Spain,17.12.1992,181.0,79.0,,1,0,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
3,pablo-mari,Pablo Marí,RCD Mallorca,Spain,31.08.1993,191.0,87.0,Center Back,1,1,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
4,ruben-pena,Rubén Peña,Real Valladolid,Spain,18.07.1991,172.0,70.0,Right Midfielder,1,1,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002


In [6]:
df.set_index(df.columns[0], inplace=True)

In [13]:
df['skintone']=(df['rater1']+df['rater2'])/2

In [15]:
players=pd.unique(df.index.values.ravel())

In [18]:
players.size

2053

In [22]:
df.dropna(subset=['skintone'], inplace=True)

In [23]:
players=pd.unique(df.index.values.ravel())

In [24]:
players.size

1585

In [28]:
players[0]

'lucas-wilchez'

In [191]:
data = pd.DataFrame()
labels = pd.DataFrame()

In [192]:
k = 0
for player in players:
    #Copy same columns
    data.loc[k, 'playerShort'] = player
    data.loc[k, 'club'] =  np.array(df.loc[player, "club"]).flatten()[0]
    data.loc[k, 'leagueCountry'] = np.array(df.loc[player, "leagueCountry"]).flatten()[0]
    data.loc[k, 'birthday'] = np.array(df.loc[player, "birthday"]).flatten()[0]
    data.loc[k, 'height'] = np.array(df.loc[player, "height"]).flatten()[0]
    data.loc[k, 'weight'] = np.array(df.loc[player, "weight"]).flatten()[0]
    #Sum up columns 
    data.loc[k, 'games'] =  np.array(df.loc[player, "games"]).sum()
    data.loc[k, 'victories'] =  np.array(df.loc[player, "victories"]).sum()
    data.loc[k, 'ties'] =  np.array(df.loc[player, "ties"]).sum()
    data.loc[k, 'defeats'] =  np.array(df.loc[player, "defeats"]).sum()
    data.loc[k, 'goals'] =  np.array(df.loc[player, "goals"]).sum()
    data.loc[k, 'yellowCards'] =  np.array(df.loc[player, "yellowCards"]).sum()
    data.loc[k, 'yellowReds'] =  np.array(df.loc[player, "yellowReds"]).sum()
    data.loc[k, 'redCards'] =  np.array(df.loc[player, "redCards"]).sum()
    #Averge 2 colums
    data.loc[k, 'meanIAT'] =  np.ma.average(np.ma.array(df.loc[player, "meanIAT"], mask=np.isnan(np.array(df.loc[player, "meanIAT"]))))
    data.loc[k, 'meanExp'] =  np.ma.std(np.ma.array(df.loc[player, "meanIAT"], mask=np.isnan(np.array(df.loc[player, "meanIAT"]))))
    data.loc[k, 'seIAT'] =  np.ma.average(np.ma.array(df.loc[player, "seIAT"], mask=np.isnan(np.array(df.loc[player, "seIAT"]))))
    data.loc[k, 'seExp'] =  np.ma.std(np.ma.array(df.loc[player, "seIAT"], mask=np.isnan(np.array(df.loc[player, "seIAT"]))))
    k += 1
    #print(k)
    # get the class
    labels.loc[k, 'skintone'] =  np.array(df.loc[player, 'skintone']).flatten()[0]
    
    

In [193]:
data

Unnamed: 0,playerShort,club,leagueCountry,birthday,height,weight,games,victories,ties,defeats,goals,yellowCards,yellowReds,redCards,meanIAT,meanExp,seIAT,seExp
0,lucas-wilchez,Real Zaragoza,Spain,31.08.1983,177.0,72.0,144.0,65.0,32.0,47.0,10.0,21.0,1.0,2.0,0.356667,2.144522e-02,0.000857,1.224737e-03
1,john-utaka,Montpellier HSC,France,08.01.1982,179.0,82.0,431.0,186.0,102.0,143.0,88.0,33.0,0.0,2.0,0.326257,5.615282e-02,0.002318,6.582443e-03
2,aaron-hughes,Fulham FC,England,08.11.1979,182.0,71.0,654.0,247.0,179.0,228.0,9.0,19.0,0.0,0.0,0.346459,3.141599e-02,0.000652,3.250574e-03
3,aleksandar-kolarov,Manchester City,England,10.11.1985,187.0,80.0,285.0,138.0,57.0,90.0,28.0,50.0,4.0,3.0,0.360355,2.882958e-02,0.000445,8.495246e-04
4,alexander-tettey,Norwich City,England,04.04.1986,180.0,68.0,214.0,88.0,54.0,72.0,11.0,34.0,0.0,0.0,0.345591,3.061784e-02,0.000413,7.417314e-04
5,anders-lindegaard,Manchester United,England,13.04.1984,193.0,80.0,100.0,50.0,20.0,30.0,0.0,1.0,0.0,0.0,0.343969,2.654639e-02,0.000320,7.060504e-04
6,andreas-beck,1899 Hoffenheim,Germany,13.03.1987,180.0,70.0,296.0,125.0,72.0,99.0,7.0,57.0,1.0,0.0,0.344388,2.208224e-02,0.000272,5.963274e-04
7,antonio-rukavina,Real Valladolid,Spain,26.01.1984,177.0,74.0,249.0,95.0,70.0,84.0,3.0,37.0,1.0,0.0,0.352328,3.234136e-02,0.000656,3.595608e-03
8,ashkan-dejagah,Fulham FC,England,05.07.1986,181.0,74.0,321.0,131.0,71.0,119.0,56.0,55.0,1.0,4.0,0.344038,2.336233e-02,0.000316,1.148661e-03
9,benedikt-hoewedes,FC Schalke 04,Germany,29.02.1988,187.0,80.0,295.0,159.0,54.0,82.0,21.0,27.0,2.0,3.0,0.347212,2.382636e-02,0.000317,6.118591e-04


In [194]:
labels

Unnamed: 0,skintone
1,0.375
2,0.750
3,0.125
4,0.125
5,1.000
6,0.250
7,0.000
8,0.000
9,0.500
10,0.000
