In [89]:
# Import skope-rules
from skrules import SkopeRules

# Import libraries
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
import warnings

# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')
# Load the raw dataset from a CSV file located one level above the working directory.
data = pd.read_csv(filepath_or_buffer='../CompleteDataset.csv')

In [90]:
# NOTE(review): in the visible notebook, X_train is only assigned inside the
# rule-extraction loop further down, so this cell raises NameError on a fresh
# kernel unless that loop has already been run — confirm and move or remove.
X_train.columns

Index(['Acceleration', 'Aggression', 'Agility', 'Balance', 'Ball_control',
       'Composure', 'Crossing', 'Curve', 'Dribbling', 'Finishing',
       'Free_kick_accuracy', 'GK_diving', 'GK_handling', 'GK_kicking',
       'GK_positioning', 'GK_reflexes', 'Heading_accuracy', 'CAM CM ',
       'CAM LW ', 'CB ', 'CDM ', 'CDM CAM CM ', 'CDM CB ', 'CDM CM ', 'CM ',
       'CM CB LB ', 'CM CDM ', 'GK ', 'LB ', 'LB RB CB ', 'LM CM ',
       'LM CM CAM ', 'LM LB ', 'LM RM CAM ', 'LM RM CAM CM ', 'LM RW RM ',
       'LW ', 'LW LM ', 'LW RW ', 'LW RW ST ', 'LW ST LM ', 'RM CM CAM ',
       'RM LM ', 'RM LW ST LM ', 'RW ', 'RW CAM ', 'RW RM ', 'ST ', 'ST CAM ',
       'ST CAM RW CF ', 'ST LW ', 'ST LW CF '],
      dtype='object')

In [91]:
# Columns retained from the raw dataset: the player name, the performance
# attributes, and the preferred positions.
column_to_keep = [
    'Name', 'Acceleration', 'Aggression', 'Agility', 'Balance', 'Ball control',
    'Composure', 'Crossing', 'Curve', 'Dribbling', 'Finishing',
    'Free kick accuracy', 'GK diving', 'GK handling', 'GK kicking',
    'GK positioning', 'GK reflexes', 'Heading accuracy', 'Preferred Positions',
]

# Restrict to players with an overall rating of at least 85/100, keep only the
# columns listed above, append one-hot indicators for the preferred positions
# (first category dropped), and replace white spaces in every column name with
# underscores.
data = (
    data
    .query("Overall>=85")
    .loc[:, column_to_keep]
    .pipe(lambda players: pd.concat(
        [players, pd.get_dummies(players['Preferred Positions'], drop_first=True)],
        axis=1,
    ))
    .rename(columns=lambda label: label.replace(' ', '_'))
)

# Model inputs: every column except the player name and the raw position string.
feature_names = data.drop(columns=['Name', 'Preferred_Positions']).columns.tolist()

In [74]:
# One-hot encode the preferred positions (first category dropped) and append
# only the indicator columns that the frame does not already contain.
# Indicator names use the same underscore convention as the other columns, so
# when the preparation cell has already added them this cell is a no-op instead
# of appending a second, differently named copy of every position column.
position_dummies = pd.get_dummies(data['Preferred_Positions'], drop_first=True)
position_dummies.columns = [
    col.replace(' ', '_') if isinstance(col, str) else col
    for col in position_dummies.columns
]
new_position_columns = [col for col in position_dummies.columns if col not in data.columns]
if new_position_columns:
    data = pd.concat([data, position_dummies[new_position_columns]], axis=1)

In [40]:
from sklearn.preprocessing import LabelEncoder

In [41]:
# Learn an integer code for each distinct position string, then overwrite the
# 'Preferred_Positions' column in place with those codes.
number = LabelEncoder()
number.fit(data['Preferred_Positions'].astype('str'))
data['Preferred_Positions'] = number.transform(data['Preferred_Positions'].astype('str'))

In [93]:
# Agglomerative clustering into 4 groups (Euclidean distance and Ward linkage).
clust = AgglomerativeClustering(n_clusters=4)

# Cluster on every column except the name and the position column. A 'cluster'
# column left over from a previous run of this cell is removed as well
# (errors='ignore' makes that a no-op on the first run), so re-running the cell
# does not feed the previous labels back in as a feature.
cluster_inputs = (
    data
    .drop(['Name', 'Preferred_Positions'], axis=1)
    .drop(columns='cluster', errors='ignore')
)
data['cluster'] = clust.fit_predict(cluster_inputs)

In [94]:
# Display the frame's column labels (the clustering cell adds the 'cluster' column).
data.columns

Index(['Name', 'Acceleration', 'Aggression', 'Agility', 'Balance',
       'Ball_control', 'Composure', 'Crossing', 'Curve', 'Dribbling',
       'Finishing', 'Free_kick_accuracy', 'GK_diving', 'GK_handling',
       'GK_kicking', 'GK_positioning', 'GK_reflexes', 'Heading_accuracy',
       'Preferred_Positions', 'CAM_CM_', 'CAM_LW_', 'CB_', 'CDM_',
       'CDM_CAM_CM_', 'CDM_CB_', 'CDM_CM_', 'CM_', 'CM_CB_LB_', 'CM_CDM_',
       'GK_', 'LB_', 'LB_RB_CB_', 'LM_CM_', 'LM_CM_CAM_', 'LM_LB_',
       'LM_RM_CAM_', 'LM_RM_CAM_CM_', 'LM_RW_RM_', 'LW_', 'LW_LM_', 'LW_RW_',
       'LW_RW_ST_', 'LW_ST_LM_', 'RM_CM_CAM_', 'RM_LM_', 'RM_LW_ST_LM_', 'RW_',
       'RW_CAM_', 'RW_RM_', 'ST_', 'ST_CAM_', 'ST_CAM_RW_CF_', 'ST_LW_',
       'ST_LW_CF_', 'cluster'],
      dtype='object')

range(0, 4)

In [95]:
# Silence the warnings raised by max_samples=1 (see below).
# With max_samples=1 there is no out-of-bag sample to evaluate performance, so
# it is evaluated on all samples. As there are fewer than 100 samples and this
# is a clustering-oriented task, the risk of overfitting is not dramatic here.
warnings.filterwarnings('ignore')

# The feature matrix is identical for every cluster, so build it once.
X_train = data.drop(['Name', 'Preferred_Positions', 'cluster'], axis=1)
# Take the feature names from the matrix itself so they always match its columns,
# even if an earlier cell changed the frame after the names were first computed.
rule_feature_names = X_train.columns.tolist()

# Fit one "this cluster vs. the rest" rule set per cluster label present in the data.
for i_cluster in sorted(int(label) for label in data['cluster'].unique()):
    # Binary target: 1 for members of the current cluster, 0 otherwise.
    y_train = (data['cluster'] == i_cluster).astype(int)
    skope_rules_clf = SkopeRules(feature_names=rule_feature_names, random_state=42, n_estimators=5,
                                 recall_min=0.5, precision_min=0.5, max_depth_duplication=0,
                                 max_samples=1., max_depth=3)
    skope_rules_clf.fit(X_train, y_train)
    print(f'Cluster {i_cluster}:')
    print(skope_rules_clf.rules_)

Cluster 0:
[('Agility <= 81.5 and Free_kick_accuracy > 56.0 and Heading_accuracy > 58.5', (0.9354838709677419, 0.8529411764705882, 10))]
Cluster 1:
[('Aggression <= 76.5 and Agility > 81.5 and ST_LW_ <= 0.5', (1.0, 0.7741935483870968, 2))]
Cluster 2:
[('CB_ > 0.5 and Free_kick_accuracy <= 64.0', (1.0, 0.9285714285714286, 10))]
Cluster 3:
[('GK_reflexes > 59.0', (1.0, 1.0, 2))]


In [96]:
# Print up to five randomly drawn player names per cluster (fixed seed, so the
# draw is reproducible).
for i_cluster in sorted(int(label) for label in data['cluster'].unique()):
    cluster_names = data.loc[data['cluster'] == i_cluster, 'Name']
    # sample() raises ValueError when asked for more rows than exist, so cap
    # the draw at the cluster size.
    n_shown = min(5, len(cluster_names))
    print(f'{n_shown} players from cluster {i_cluster}:')
    print(cluster_names.sample(n_shown, random_state=42).tolist())
    print()

5 players from cluster 0:
['M. Hamšík', 'Alex Sandro', 'Casemiro', 'K. Benzema', 'Z. Ibrahimović']

5 players from cluster 1:
['H. Mkhitaryan', 'David Silva', 'F. Ribéry', 'J. Rodríguez', 'P. Dybala']

5 players from cluster 2:
['Pepe', 'K. Glik', 'G. Chiellini', 'V. Kompany', 'Piqué']

5 players from cluster 3:
['M. ter Stegen', 'D. Subašić', 'M. Neuer', 'K. Navas', 'H. Lloris']

