# Feature selection

Here we use recursive feature elimination (RFE) from scikit-learn to select the features that are relevant for classifying a player based on their skills. As its name suggests, RFE works recursively: it fits a model, removes the least important feature(s) according to that model, and repeats the process on the remaining attributes until the requested number of features is left.

In [1]:
# Libraries
import matplotlib.pyplot as plt
import pandas as pd
import math
import numpy as np
import datetime as DT
from sklearn import svm
from sklearn import datasets, linear_model
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from dataloader import loader


In [2]:
# loader() comes from the project's dataloader module; X and y are handed to
# the selector below and featurenames is indexed by feature position later on.
X, y, featurenames = loader()

# Linear-kernel SVM whose fitted weights RFE uses to decide what to drop.
svc = SVC(C=1, kernel="linear")

# Keep eliminating one feature per round (step=1) until a single feature is
# left, so that ranking_ orders every feature rather than just a subset.
rfe = RFE(svc, step=1, n_features_to_select=1)
rfe.fit(X, y)

RFE(estimator=SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
  n_features_to_select=1, step=1, verbose=0)

In [3]:
# Turn the RFE ranking into a list of feature names, best feature first.
# rfe.ranking_[j] is the rank assigned to feature j (1 = most important).
rank = list(rfe.ranking_)

# Sort the feature positions by their rank instead of looking up each rank
# with rank.index(i) over a hard-coded range(1, 36): this adapts to however
# many features the loader returns and, because sorted() is stable, also
# copes with tied ranks (ties keep their original column order).
order = sorted(range(len(rank)), key=rank.__getitem__)
res = [featurenames[position] for position in order]

### Features in order of importance according to RFE

In [4]:
# Show the feature names collected in `res`, ordered by their RFE rank.
res

['Vision',
 'StandingTackle',
 'ShortPassing',
 'Crossing',
 'Positioning',
 'GKHandling',
 'BallControl',
 'Aggression',
 'Marking',
 'Strength',
 'GKReflexes',
 'SlidingTackle',
 'height',
 'Acceleration',
 'Agility',
 'Volleys',
 'Balance',
 'Composure',
 'Jumping',
 'Dribbling',
 'LongPassing',
 'Interceptions',
 'FKAccuracy',
 'SprintSpeed',
 'LongShots',
 'GKPositioning',
 'HeadingAccuracy',
 'Finishing',
 'Stamina',
 'weight',
 'Penalties',
 'GKDiving',
 'GKKicking',
 'Curve',
 'ShotPower']

### Relative feature importance of the selected features with SelectKBest

Here we extract a score for each feature, which reflects how useful that feature is on its own for classification. We use scikit-learn's `SelectKBest` feature selector for this because `RFE` only exposes a ranking of the features, not per-feature scores that could be used to compare their relative importance.

In [5]:
# Fit a univariate selector to obtain one score per feature. No score_func is
# passed, so scikit-learn's default scoring function is used.
Kbest = SelectKBest(k=8).fit(X, y)
scores = np.array(Kbest.scores_)

# Map every feature name to its score; scores are in feature (column) order.
skills_dict = {featurenames[idx]: score for idx, score in enumerate(scores)}

In [6]:
# Number of top-ranked RFE features to show in the score plot.
N_TOP_FEATURES = 8

# Take the best features straight from the RFE ordering in `res` instead of
# copying their names by hand from the printed output, so the selection
# cannot go stale when the data or the model changes.
names = [str(name) for name in res[:N_TOP_FEATURES]]

# Look the scores up from `names` so both lists are always aligned.
scoreslist = [skills_dict[name] for name in names]

### Bar plot of attributes vs. score with SelectKBest

In [7]:
# Horizontal bar plot of the score (x axis) of each selected attribute (y axis).
import bokeh
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, ranges, LabelSet

import matplotlib.pyplot as plt
import numpy as np


# Render Bokeh figures inline in the notebook.
output_notebook()

# Column "x" holds the attribute names and column "y" holds their scores.
source = ColumnDataSource(dict(x=names,y=scoreslist))

# NOTE(review): plot_width / plot_height were removed in Bokeh 3.0 in favour
# of width / height; rename them when the notebook moves to a newer Bokeh.
p = figure(y_range=names, plot_width=700, plot_height=400, title='Attribute Scores')
p.yaxis.major_label_orientation = np.pi/4

# Draw the bars from `source` (previously built but never used) so the glyph
# and the data source stay in sync: names on the categorical y axis, scores
# as the bar length.
p.hbar(y="x", height=0.5, left=0, right="y", source=source, color="firebrick")
p.xaxis.axis_label = 'Score'
p.yaxis.axis_label = 'Attribute'
show(p)