# 04 - Applied ML

In [1]:
# import all the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

## Cleaning the data

In [2]:
# import the data
df = pd.read_csv('CrowdstormingDataJuly1st.csv')

# save initial number of records
initial_records = df.shape[0]

# initial shape of the data
print('Initial shape of the DataFrame:', df.shape)

Initial shape of the DataFrame: (146028, 28)


In [3]:
# check which columns have NaNs
df.isnull().any()

playerShort      False
player           False
club             False
leagueCountry    False
birthday         False
height            True
weight            True
position          True
games            False
victories        False
ties             False
defeats          False
goals            False
yellowCards      False
yellowReds       False
redCards         False
photoID           True
rater1            True
rater2            True
refNum           False
refCountry       False
Alpha_3           True
meanIAT           True
nIAT              True
seIAT             True
meanExp           True
nExp              True
seExp             True
dtype: bool

We can drop the column photoID since we won't use the photos of the players.

In [4]:
df = df.drop(['photoID'], axis=1)

In [5]:
# remove records that have rater1 and/or rater2 as NaNs
df = df.dropna(axis=0, subset=['rater1', 'rater2'])

# create a new column with the mean of the two raters
df['raterMean'] = (df['rater1'] + df['rater2']) / 2

# remove the columns rater1 and rater2
df = df.drop(['rater1', 'rater2'], axis=1)

In [6]:
df.shape

(124621, 26)

Let's now focus on the referees. As stated in [this notebook](http://nbviewer.jupyter.org/github/mathewzilla/redcard/blob/master/Crowdstorming_visualisation.ipynb), we should only take into account referees that took part in games of one of the considered leagues. This means that referees with less than 22 dyads (corresponding to the, at least, 22 players of a game) shouldn't be relevant.

In [7]:
#refs = df['refNum'].value_counts()

#df = df[df['refNum'].isin(refs[refs > 21].index)]

In [8]:
df.shape

(124621, 26)

Since we will group the records by player, the columns concerning the referees information can be removed, namely: refNum, refCountry and Alpha_3.

In [9]:
df = df.drop(['refNum', 'refCountry', 'Alpha_3'], axis=1)

Let's check again the features that have NaNs associated.

In [10]:
# check which columns have NaNs
df.isnull().any()

playerShort      False
player           False
club             False
leagueCountry    False
birthday         False
height            True
weight            True
position          True
games            False
victories        False
ties             False
defeats          False
goals            False
yellowCards      False
yellowReds       False
redCards         False
meanIAT           True
nIAT              True
seIAT             True
meanExp           True
nExp              True
seExp             True
raterMean        False
dtype: bool

In [11]:
df = df.dropna(axis=0, subset=['height', 'weight'])

records_after = df.shape[0]

By dropping the records that have NaNs in the height and weight columns we loose the following percentage of data:

In [12]:
(initial_records - records_after) / initial_records * 100

15.175171884844001

The columns nIAT, seIAT, nExp and seExp are associated with a particular referee and this information will become irrelevant as soon as the data is aggregated by players. So we'll also remove these features.

In [13]:
df.drop(['nIAT', 'seIAT', 'nExp', 'seExp'], axis=1, inplace=True)

The same happens with the columns meanIAT and meanExp. So, for now we'll also drop them.

In [14]:
#TODO: these features seem important, we should keep them somehow
df.drop(['meanIAT', 'meanExp'], axis=1, inplace=True)

We can remove the columns that have non-numeric data associated:

In [15]:
#TODO: should we keep the position of the player?
non_numeric_features = ['player', 'club', 'leagueCountry', 'birthday', 'position']

df.drop(non_numeric_features, axis=1, inplace=True)

In [16]:
df.isnull().any()

playerShort    False
height         False
weight         False
games          False
victories      False
ties           False
defeats        False
goals          False
yellowCards    False
yellowReds     False
redCards       False
raterMean      False
dtype: bool

In [17]:
print('Shape of the DataFrame before aggregating by player:', df.shape)

Shape of the DataFrame before aggregating by player: (123868, 12)


## Aggregating the data (by player)

In [18]:
grouped_by_player = df.groupby(['playerShort', 'height', 'weight', 'raterMean'], as_index=False)

clean_data = grouped_by_player.agg({'games': np.sum, 'victories': np.sum, 'ties': np.sum, 'defeats': np.sum, 'goals': np.sum,
                       'yellowCards': np.sum, 'yellowReds': np.sum, 'redCards': np.sum})

# drop the column playerShort (non-numeric data not relevant for the classifier)
clean_data.drop('playerShort', axis=1, inplace=True)

## Training the Classifier

In [19]:
features_to_consider = ['height', 'weight',
                        'games', 'victories', 'ties', 'defeats', 'goals',
                        'yellowCards', 'yellowReds', 'redCards']

X_train = df[features_to_consider]
Y_train = np.asarray(df['raterMean'], dtype="|S6")

In [20]:
clf = RandomForestClassifier(n_estimators=50)

In [21]:
clf = clf.fit(X_train, Y_train)

In [22]:
# error on the training set
train_error = 1.0 - clf.score(X_train, Y_train)
print('Error on the training set:', train_error)

Error on the training set: 0.266614460555


In [None]:
# cross validation :
from sklearn.model_selection import train_test_split
import random

# split the data :
randomState=random.randint(0,100)
TrainingError =[]
TestError = []

for i in range(10) : 
    clf = RandomForestClassifier(n_estimators=10*i)
    X_train, X_test, y_train, y_test = train_test_split(Xtrain, Ytrain, test_size=0.4, random_state=randomState)
    clf = clf.fit(X_train, Y_train)
    train_error = 1.0 - clf.score(X_train, y_train)
    test_error = clf.score(X_test,y_test)
    TrainingError.append(train_error)
    TestError.append(test_error)