# Applied ML

In [1]:
# Normal stack of pandas, numpy, matplotlib and seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Statistical test library
import scipy.stats as stats
import random

from sklearn import preprocessing
from sklearn.model_selection import learning_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# GridSearchCV lives in sklearn.model_selection (the old sklearn.grid_search module was removed)
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

%matplotlib inline

# Seed NumPy's and Python's global random number generators
np.random.seed(10)
random.seed(10)



# Data pre-processing

In [2]:
# Load the dataset; the 'birthday' column is parsed into datetimes on read
original = pd.read_csv("CrowdstormingDataJuly1st.csv", parse_dates=['birthday'], infer_datetime_format=True)

In [3]:
# Summary statistics of the loaded frame
original.describe()



Unnamed: 0,height,weight,games,victories,ties,defeats,goals,yellowCards,yellowReds,redCards,rater1,rater2,refNum,refCountry,meanIAT,nIAT,seIAT,meanExp,nExp,seExp
count,145765.0,143785.0,146028.0,146028.0,146028.0,146028.0,146028.0,146028.0,146028.0,146028.0,124621.0,124621.0,146028.0,146028.0,145865.0,145865.0,145865.0,145865.0,145865.0,145865.0
mean,181.935938,76.075662,2.921166,1.278344,0.708241,0.934581,0.338058,0.385364,0.011381,0.012559,0.264255,0.302862,1534.827444,29.642842,0.346276,19697.41,0.0006310849,0.452026,20440.23,0.002994
std,6.738726,7.140906,3.413633,1.790725,1.116793,1.383059,0.906481,0.795333,0.107931,0.112889,0.295382,0.29302,918.736625,27.496189,0.032246,127126.2,0.004735857,0.217469,130615.7,0.019723
min,161.0,54.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,-0.047254,2.0,2.235373e-07,-1.375,2.0,1e-06
25%,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,641.0,7.0,,,,,,
50%,,,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,,,1604.0,21.0,,,,,,
75%,,,3.0,2.0,1.0,1.0,0.0,1.0,0.0,0.0,,,2345.0,44.0,,,,,,
max,203.0,100.0,47.0,29.0,14.0,18.0,23.0,14.0,3.0,2.0,1.0,1.0,3147.0,161.0,0.573793,1975803.0,0.2862871,1.8,2029548.0,1.06066


In [4]:
# First glimpse at data content: first six rows, first 13 columns.
# (.iloc replaces the removed .ix indexer; .ix[:5] included row label 5,
# hence the positional stop of 6.)
original.iloc[:6, :13]

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,ties,defeats,goals
0,lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,1983-08-31,177.0,72.0,Attacking Midfielder,1,0,0,1,0
1,john-utaka,John Utaka,Montpellier HSC,France,1982-01-08,179.0,82.0,Right Winger,1,0,0,1,0
2,abdon-prats,Abdón Prats,RCD Mallorca,Spain,1992-12-17,181.0,79.0,,1,0,1,0,0
3,pablo-mari,Pablo Marí,RCD Mallorca,Spain,1993-08-31,191.0,87.0,Center Back,1,1,0,0,0
4,ruben-pena,Rubén Peña,Real Valladolid,Spain,1991-07-18,172.0,70.0,Right Midfielder,1,1,0,0,0
5,aaron-hughes,Aaron Hughes,Fulham FC,England,1979-11-08,182.0,71.0,Center Back,1,0,0,1,0


In [5]:
# Same six rows, remaining columns (.iloc replaces the removed .ix indexer)
original.iloc[:6, 13:]

Unnamed: 0,yellowCards,yellowReds,redCards,photoID,rater1,rater2,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp
0,0,0,0,95212.jpg,0.25,0.5,1,1,GRC,0.326391,712.0,0.000564,0.396,750.0,0.002696
1,1,0,0,1663.jpg,0.75,0.75,2,2,ZMB,0.203375,40.0,0.010875,-0.204082,49.0,0.061504
2,1,0,0,,,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
3,0,0,0,,,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
4,0,0,0,,,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
5,0,0,0,3868.jpg,0.25,0.0,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752


Challenges in dataset:

- Skin ratings don't match -> take average
- Picture was missing -> exclude from training
- No cards drawn in some dyads


Prune dataset where there is no rater information

# Initial cleaning


In [6]:
# Helper function to see the effect when we drop rows
def dropping_stats(df, reference=None):
    """Print how many rows of the reference data are missing from ``df``.

    Parameters
    ----------
    df : object with a ``shape`` attribute (DataFrame or Series)
        The reduced data set.
    reference : object with a ``shape`` attribute, optional
        The data set to compare against. Defaults to the notebook-level
        ``original`` frame, which keeps existing one-argument calls working.

    Prints the percentage of dropped rows and the remaining row count.
    """
    if reference is None:
        reference = original

    total_rows = reference.shape[0]
    kept_rows = df.shape[0]
    # An empty reference has nothing to drop; avoid dividing by zero
    drop_perc = 100 * (total_rows - kept_rows) / total_rows if total_rows else 0.0
    print("%.2f%% of original data droped." % (drop_perc) )

    print("Now: %d rows" % kept_rows )

In [7]:
# Baseline: comparing the original frame with itself, so nothing is dropped yet
dropping_stats(original)

0.00% of original data droped.
Now: 146028 rows


In [235]:
# Keep only the birth year of each player, then drop the columns that will
# not be relevant for our model
original['year'] = original['birthday'].map(lambda birthday: birthday.year)
irrelevant_columns = ['photoID', 'refCountry', 'Alpha_3', 'player', 'birthday']
cleaned = original.drop(irrelevant_columns, axis=1, errors='raise')

In [236]:
# Total number of games per referee, summed over all of that referee's rows.
# A referee whose total is below 22 is treated as not having refereed a
# match in these leagues.
ref_grouped = cleaned.groupby('refNum')[['games']].sum()
ref_filtered = ref_grouped.loc[ref_grouped['games'].ge(22)].reset_index()

# Keep only the rows that belong to one of the remaining referees
has_referee = cleaned.loc[cleaned['refNum'].isin(ref_filtered['refNum'].values)]

dropping_stats(has_referee)


5.75% of original data droped.
Now: 137635 rows


### Dealing with nan-values


In [237]:
# Count rows with (True) and without (False) a missing weight
cleaned['weight'].isnull().value_counts()

False    143785
True       2243
Name: weight, dtype: int64

In [238]:
# Count rows with (True) and without (False) a missing height
cleaned['height'].isnull().value_counts()

False    145765
True        263
Name: height, dtype: int64

We will fill these with the mean value

In [239]:
# Replace missing weights/heights by the column mean. Passing the literal
# string 'mean' to fillna would insert that text instead of a number.
mean_fill_values = {column: cleaned[column].mean() for column in ('weight', 'height')}
cleaned = cleaned.fillna(mean_fill_values)

# has_referee was filtered out of `cleaned` before this cell, so it needs the
# same fill for the imputation to reach the features that are built from it.
has_referee = has_referee.fillna(mean_fill_values)

In [240]:
# Verify that neither column contains missing values any more
for measure in ('height', 'weight'):
    print(cleaned[measure].hasnans)

False
False


## Making the class feature

To make the class of the skin color of each player, we take the mean of the value from the two raters. 
Players that do not have a rating are dropped.

In [241]:
def get_binary_class(x):
    """Map a skin rating to a binary class.

    Returns 0 ('light-skinned') when ``x`` is strictly below 0.5 and
    1 ('dark-skinned') otherwise, i.e. a rating of exactly 0.5 maps to 1.
    """
    return 0 if x < 0.5 else 1

In [242]:
# Average the two raters' values per row and discard the rows for which
# no rating is available
mean_rating = has_referee[['rater1', 'rater2']].mean(axis=1).dropna()
dropping_stats(mean_rating)

19.48% of original data droped.
Now: 117579 rows


In [243]:
# Each remaining row now has a real-valued rating (the mean of the two raters)
mean_rating.head()

5    0.125
6    0.125
7    1.000
8    0.250
9    0.000
dtype: float64

In [244]:
# Binary classification needs 0/1 labels, so threshold the real-valued
# ratings with get_binary_class and name the resulting series 'class'
binary_class = mean_rating.apply(get_binary_class).rename('class')
binary_class.head()

5    0
6    0
7    1
8    0
9    0
Name: class, dtype: int64

In [245]:
# Keep only the data rows where we have the class.
# binary_class.index holds row labels, so label-based .loc replaces the
# removed .ix indexer.
has_class = has_referee.loc[binary_class.index]
dropping_stats(has_class)

19.48% of original data droped.
Now: 117579 rows


In [246]:
# Number of rows per class label
counts = binary_class.value_counts()
counts

0    90492
1    27087
Name: class, dtype: int64

In [247]:
# Share of rows labelled 0, as a percentage of all labelled rows
light_skinned_share = counts[0] / counts.sum() * 100
print("%.2f%% of the examples are light skinned" % light_skinned_share)

76.96% of the examples are light skinned


Convert string values to floats by LabelEncoder to make them readable by the classifier

## Feature transformation

When we have categorical data, we need to transform them so that they can be taken into account in our model. One way of doing this is to use LabelEncoder and OneHotEncoder from SciKitLearn. 

LabelEncoder converts each category into an integer, so that we don't have to deal with strings. After doing this transformation, we use OneHotEncoder to make a binary feature for each category. This way, we can capture for instance whether a person has played for both Fulham FC and Manchester City. In the cells below, `pd.get_dummies` is used to create these binary features directly from the string columns.

In [248]:
# One indicator column per league country
countries_encoded = pd.get_dummies(has_class['leagueCountry'])
countries_encoded.head()

Unnamed: 0,England,France,Germany,Spain
5,1.0,0.0,0.0,0.0
6,1.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0
8,1.0,0.0,0.0,0.0
9,0.0,0.0,1.0,0.0


In [249]:
# One indicator column per playing position
position_encoded = pd.get_dummies(has_class['position'])
position_encoded.head()

Unnamed: 0,Attacking Midfielder,Center Back,Center Forward,Center Midfielder,Defensive Midfielder,Goalkeeper,Left Fullback,Left Midfielder,Left Winger,Right Fullback,Right Midfielder,Right Winger
5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


We remove the original categorical features and attach the new one-hot encoded ones:

### Feature combinations

In [250]:
# Hypothesis: weighting the cards a player received by the referee's
# 'discrimination score' (meanIAT) might help us classify the players.
# E.g., if a player got many cards from racist referees, he is more likely
# dark-skinned.
referee_iat = has_class['meanIAT']

red_comb = has_class['redCards'].mul(referee_iat)
yellow_comb = has_class['yellowCards'].mul(referee_iat)
yellow_red_comb = has_class['yellowReds'].mul(referee_iat)

# Put the three products side by side, one named column each
cards_iat = pd.concat(
    [red_comb, yellow_comb, yellow_red_comb],
    axis=1,
    keys=['red_iat', 'yellow_iat', 'red_yellow_iat'],
)
cards_iat.head(6)

Unnamed: 0,red_iat,yellow_iat,red_yellow_iat
5,0.0,0.0,0.0
6,0.0,0.0,0.0
7,0.0,0.0,0.0
8,0.0,0.0,0.0
9,0.0,0.0,0.0
10,0.0,0.325185,0.0


### Combining the features

In [261]:
# Columns taken over unchanged from the cleaned data
columns_from_orig = ['playerShort', 'year', 'height', 'weight', 'games', 'victories','ties', 'defeats','goals','yellowCards',
                     'yellowReds','redCards','meanIAT', 'nIAT', 'seIAT', 'meanExp', 'nExp', 'seExp']

# Labels of all one-hot encoded columns. Index.union is the explicit set
# union; the `|` operator on Index objects no longer has that meaning in
# current pandas.
colomns_one_hot_enoded = countries_encoded.columns.union(position_encoded.columns)

# Assemble the feature frame: original columns, one-hot columns, the
# card/IAT combinations and finally the class label
features = (has_class[columns_from_orig]
            .join(countries_encoded)
            .join(position_encoded)
            .join(cards_iat)
            .join(binary_class))
features.head()

Unnamed: 0,playerShort,year,height,weight,games,victories,ties,defeats,goals,yellowCards,...,Left Fullback,Left Midfielder,Left Winger,Right Fullback,Right Midfielder,Right Winger,red_iat,yellow_iat,red_yellow_iat,class
5,aaron-hughes,1979,182.0,71.0,1,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
6,aleksandar-kolarov,1985,187.0,80.0,1,1,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
7,alexander-tettey,1986,180.0,68.0,1,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
8,anders-lindegaard,1984,193.0,80.0,1,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
9,andreas-beck,1987,180.0,70.0,1,1,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0


### Aggregating over the players:

We now group and aggregate our dataframe. The aggregation functions used are defined in two dictionaries.
Each element of the dictionaries consists of a column name and an aggregation function, which is applied to our grouped features.

In [267]:
# One group per (playerShort, year, height, weight) combination
players = features.groupby(['playerShort','year','height', 'weight'])

# We sum over the one hot encoded features
one_hot_aggregation = {column: 'sum' for column in colomns_one_hot_enoded}

# And then sum over games, victories, ties, defeats, goals and cards, take the
# maximum of the class label and the mean of the IAT statistics.
# Aggregations are given by name so that pandas' own groupby reductions are used.
column_aggfunc_mapping = {'class': 'max', 'games': 'sum', 'victories': 'sum', 'ties': 'sum', 'defeats': 'sum', 'goals': 'sum',
                          'yellowCards': 'sum', 'yellowReds': 'sum', 'redCards': 'sum', 'red_iat': 'sum',
                          'yellow_iat': 'sum', 'red_yellow_iat': 'sum', 'meanIAT': 'mean', 'nIAT': 'mean'}

# Union the aggregation function dicts
agg_funcs = {**one_hot_aggregation, **column_aggfunc_mapping}

# The result keeps the four grouping keys as its (multi-level) index
agg_features = players.agg(agg_funcs)
agg_features.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,victories,yellow_iat,meanIAT,games,redCards,Center Midfielder,Right Fullback,yellowReds,red_yellow_iat,Spain,...,Left Winger,Germany,France,Left Midfielder,England,goals,Defensive Midfielder,class,Right Winger,nIAT
playerShort,year,height,weight,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
aaron-hughes,1979,182.0,71.0,246,6.420221,0.344413,650,0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,162.0,9,0.0,0,0.0,20193.709877
aaron-hunt,1986,183.0,73.0,141,14.374395,0.348943,335,1,0.0,0.0,0,0.0,0.0,...,0.0,98.0,0.0,0.0,0.0,62,0.0,0,0.0,26291.591837
aaron-lennon,1987,165.0,63.0,200,3.651979,0.345893,412,0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,101.0,31,0.0,0,0.0,21234.861386
aaron-ramsey,1990,178.0,76.0,145,10.462342,0.34679,254,1,98.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,98.0,39,0.0,0,0.0,40568.571429
abdelhamid-el-kaoutari,1990,180.0,73.0,41,2.677471,0.3316,124,2,0.0,0.0,4,1.356972,0.0,...,0.0,0.0,37.0,0.0,0.0,1,0.0,0,0.0,2832.351351


In [None]:
# Re-index the aggregated frame by player. `players` is the GroupBy object
# and has no reset_index; the aggregation result is `agg_features`.
aggregated_features = agg_features.reset_index().set_index('playerShort')
aggregated_features['meanIAT'].head()

In [None]:
# Number of games per player, used as the divisor below
games = aggregated_features['games']
# NOTE(review): 'nExp', 'meanExp', 'combIAT' and 'Missing' are not among the
# columns produced by the aggregation cell visible in this notebook —
# presumably left over from an earlier version; verify before running.
game_dep = aggregated_features[['Center Forward', 'defeats', 'yellowCards', 'meanIAT', 'Spain',
       'Germany', 'Left Winger', 'England', 'France', 'nExp',
       'Attacking Midfielder', 'victories', 'Defensive Midfielder', 'nIAT',
       'yellowReds', 'Left Fullback', 'Goalkeeper', 'Center Back',
       'combIAT', 'meanExp', 'Missing', 'Left Midfielder', 'goals', 'redCards',
       'Right Fullback', 'Center Midfielder', 'Right Winger',
       'Right Midfielder', 'ties']]

# Divide every selected column by the player's number of games (row-wise)
game_normalized = game_dep.div(games, axis='rows')

In [None]:
# NOTE(review): no visible cell creates a 'rater_mean' column (the visible
# pipeline names its target 'class') — presumably stale; verify.
rating = aggregated_features['rater_mean']
# First three columns of the aggregated frame, the per-game features, the target and the game count
prepped_features = aggregated_features[aggregated_features.columns[:3]].join(game_normalized).join(rating).join(games)

In [None]:
# Histogram of the players' birth years
prepped_features['year'].hist()

In [None]:
# NOTE(review): the neighbouring cells call the target column 'rater_mean', not 'rater' — verify the label.
prepped_features['rater'].isnull().value_counts()

Prepared dataset for further processing with ML methods

In [None]:



# Xy refers to the same frame as prepped_features (features and target together)
Xy = prepped_features #.apply(LabelEncoder().fit_transform)

In [None]:
# Separate the target vector from the feature matrix
y = Xy['rater_mean']
Xt = Xy.drop('rater_mean', axis=1)

print('Samples:', Xt.shape)
print('Lables:', y.shape)

In [None]:
# (rows, columns) of the feature matrix
Xt.shape

## Normalizing

In [None]:

# NOTE(review): preprocessing.normalize, as called here, rescales each sample
# (row) to unit L2 norm; it does not scale the features column-wise —
# confirm that row-wise normalisation is what is intended.
X = preprocessing.normalize(Xt, norm='l2')
X

## Machine Learning by RandomForestClassifier

### Base model

In [None]:
# Horizontal bar chart of how often each target value occurs
y.value_counts().plot(kind='barh', stacked=True)

In [None]:
# Share of 0-labels in the target; multiplied by 100 because the message
# reports it with a percent sign
print('There are about %.2f%% 0s in the class vector.' % (100 * y.value_counts()[0] / y.shape[0]))

The result above shows that by always predicting 0, we could achieve an accuracy of approximately 60%. 
We should therefore expect our classifier to perform at least as well as this, and hopefully significantly better. 

### Tuning the model

In [None]:
# Base estimator for the grid search: 200 trees, fixed seed, all cores
rfc = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1,
)

# Candidate values tried for each hyperparameter
param_grid = dict(
    max_features=['log2', 'sqrt', None],
    min_samples_leaf=[2, 20, 200],
    max_depth=[4, 8, 16, None],
)

#(8)    'max_depth': [4,8,16,None],
#(gini) 'criterion': ['gini', 'entropy'],
#(log2) 'max_features': ['log2','sqrt', None],
#(20) 'min_samples_split': [2,20,200],
#(2) 'min_samples_leaf': [2,20,200],
#(20) 'min_samples_split': [2,20,200],
#(1e-7) 'min_impurity_split': [1e-07, 1e-06, 1e-05],
#(True) 'bootstrap': [True, False],

#### Optimizing for F1 score

In [None]:
# 10-fold grid search over param_grid, scored by F1
CV_rfc = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    scoring='f1',
    cv=10,
    n_jobs=-1,
    verbose=True,
)
CV_rfc.fit(X, y)
print(CV_rfc.best_params_)

#### Optimizing for accuracy

In [None]:
# 10-fold grid search over param_grid, scored by accuracy
CV_rfc = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
    verbose=True,
)
CV_rfc.fit(X, y)
print(CV_rfc.best_params_)

### Training the model

We use the results found in the grid search to tune our random forest classifier. 

Description of the hyperparameters:

- n_estimators: The number of trees used in random forest
- min_samples_leaf: TODO
- max_features: TODO
- max_depth: The maximal depth of the tree

- bootstrap: Todo
- oob_score: Todo
- n_jobs: Number of processes used in the calculation. -1 uses all available.
- random_state: Seed for the random generator, to give reproducible results.

In [None]:
# Random forest configured with the hyperparameters chosen for the final model
rfc = RandomForestClassifier(
    n_estimators=1000,
    max_depth=8,
    max_features='log2',
    min_samples_leaf=2,
    bootstrap=True,
    oob_score=True,
    random_state=4,
    n_jobs=-1,
)
print(rfc)

We can then fit our model to the data:

In [None]:
# Fit on the complete data set (all samples, no hold-out)
rfc.fit(X, y)

In [None]:
# NOTE(review): `X_rfc` is not defined in any cell visible in this notebook — presumably a stale cell; verify or remove.
X_rfc.head()

## Inspect most relevant features of RandomForest

In [None]:
# Importance of each feature in the fitted forest, and its spread across
# the individual trees
importances = rfc.feature_importances_
std = np.std([tree.feature_importances_ for tree in rfc.estimators_], axis=0)
# Feature positions ordered from most to least important
indices = np.argsort(importances)[::-1]
n_features = X.shape[1]

# Print the feature ranking
print("Feature ranking:")

for rank, feature_idx in enumerate(indices[:n_features], start=1):
    print("%d. feature %d : %s  (%f)" % (rank, feature_idx, Xt.columns[feature_idx], importances[feature_idx]))

# Plot the feature importances of the forest, most important first,
# with the per-tree standard deviation as error bars
bar_positions = range(n_features)
plt.figure()
plt.title("Feature importances")
plt.bar(bar_positions, importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(bar_positions, indices)
plt.xlim([-1, n_features])
plt.xlabel("Feature")
plt.ylabel("Feature importance")
plt.show()

## Performance assessment 

### Cross-validation

In [None]:
# 10-fold cross-validation of the random forest, scored by F1
scores = cross_val_score(rfc, X, y, scoring='f1', cv=10, n_jobs=-1)
print(scores)
print('Achieved model score: ', scores.mean())

Visualize score results as boxplots

In [None]:
# Distribution of the per-fold scores
plt.boxplot(scores)

### Confusion matrix

In [None]:
# Hold out 20% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Refit on the training part only and predict the held-out samples
rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_test)

# Confusion matrix: rows are the true classes, columns the predicted classes
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Bonus: Learning curve


In [None]:
# Fractions of the training set used for the learning curve:
# evenly spaced steps starting at 10%
fraction_step = 0.8 / 3
train_sizes = np.arange(0.1, 1.0, fraction_step)
train_sizes.shape

In [None]:


# Fresh forest with the same hyperparameters as the final model
rfc = RandomForestClassifier(
    n_estimators=1000,
    max_depth=8,
    max_features='log2',
    min_samples_leaf=2,
    bootstrap=True,
    oob_score=True,
    random_state=4,
    n_jobs=-1,
)

# 10-fold cross-validated training and test scores for each training-set size
train_sizes, train_scores, test_scores = learning_curve(
    rfc, X, y,
    train_sizes=train_sizes,
    cv=10,
    n_jobs=-1,
)

In [None]:
# Mean and standard deviation over the cross-validation folds
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
test_scores_mean = test_scores.mean(axis=1)
test_scores_std = test_scores.std(axis=1)

# (mean, std, colour, legend label) for each curve
curves = [
    (train_scores_mean, train_scores_std, "r", "Training score"),
    (test_scores_mean, test_scores_std, "g", "Cross-validation score"),
]

plt.figure()
plt.title("Learning curve")

plt.xlabel("Training examples")
plt.ylabel("Score")

plt.grid()
# Shaded band of one standard deviation around each mean curve
for mean, spread, colour, _ in curves:
    plt.fill_between(train_sizes, mean - spread, mean + spread,
                     alpha=0.1, color=colour)
# The mean curves themselves
for mean, _, colour, label in curves:
    plt.plot(train_sizes, mean, '-', color=colour, label=label)

plt.legend(loc="best")

### Club skin-color rate

In [219]:
# TODO: is this cheating?
# NOTE(review): the per-club mean below is computed from the class label
# itself, over all labelled rows — using it as a feature would presumably
# leak the target unless it is computed on training rows only.
club_class = has_class[['club']].join(binary_class)
club_class_mean = (club_class
                   .groupby('club')
                   .mean()
                   .rename(columns={'class': 'club_class_mean'}))
club_class_mean.head()

Unnamed: 0_level_0,club_class_mean
club,Unnamed: 1_level_1
1. FC Nürnberg,0.128803
1. FSV Mainz 05,0.165644
1899 Hoffenheim,0.16595
AC Ajaccio,0.402757
AS Nancy,0.396


In [220]:
# Attach each row's club mean by looking the club name up in club_class_mean's index
club_stats = has_class[['club']].join(club_class_mean, on='club', how='left')
club_stats.head()

Unnamed: 0,club,club_class_mean
5,Fulham FC,0.157295
6,Manchester City,0.454163
7,Norwich City,0.414286
8,Manchester United,0.288742
9,1899 Hoffenheim,0.16595


In [None]:
# NOTE(review): this runs in a different cell than the learning-curve plot.
# With the inline backend enabled at the top of the notebook, the figure is
# presumably already closed by then, so this may write an empty image —
# consider saving from the plotting cell. TODO confirm.
plt.savefig("Learning_curve")

## TODO

- Verify if classifier should be categorical or binary
- Take into account the referee statistics in a smart way (see Slack discussion)
- Try different parameters for the random forest to compare results (bias, variance)
- Bonus
- Exercise 2
