### Dataset Used

FIFA 19 Player Skills : https://github.com/SukhsimarSingh/GBC-ML1/blob/main/fifa19.csv

Raw file link : https://raw.githubusercontent.com/SukhsimarSingh/GBC-ML1/main/fifa19.csv

Attributes: FIFA 2019 players attributes like Age, Nationality, Overall, Potential, Club, Value, Wage, Preferred Foot, International Reputation, Weak Foot, Skill Moves, Work Rate, Position, Jersey Number, Joined, Loaned From, Contract Valid Until, Height, Weight, LS, ST, RS, LW, LF, CF, RF, RW, LAM, CAM, RAM, LM, LCM, CM, RCM, RM, LWB, LDM, CDM, RDM, RWB, LB, LCB, CB, RCB, RB, Crossing, Finishing, Heading, Accuracy, ShortPassing, Volleys, Dribbling, Curve, FKAccuracy, LongPassing, BallControl, Acceleration, SprintSpeed, Agility, Reactions, Balance, ShotPower, Jumping, Stamina, Strength, LongShots, Aggression, Interceptions, Positioning, Vision, Penalties, Composure, Marking, StandingTackle, SlidingTackle, GKDiving, GKHandling, GKKicking, GKPositioning, GKReflexes, and Release Clause.

As of now the Feature selector works only for fifa19.csv dataset, but the aim is to make it generalized for any dataset 

### Importing Libraries

In [9]:
%matplotlib inline
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as ss
from collections import Counter
import math
from scipy import stats
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectFromModel

from lightgbm import LGBMClassifier

### Building Functions for Feature Selection 

In [10]:
def cor_selector(X, y,num_feats):

    cor_list = []
    global feature_name 
    feature_name = X.columns.tolist()
    # calculate the correlation with for each feature
    for i in X.columns.tolist():
        cor  = np.corrcoef(X[i], y)[0,1]
        cor_list.append(cor)

    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # feature name
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-num_feats:]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]

    return cor_support, cor_feature

def chi_squared_selector(X, y, num_feats):

    X_norm = MinMaxScaler().fit_transform(X)
    chi_selector = SelectKBest(chi2, k = num_feats)
    chi_selector.fit(X_norm, y)
    chi_support = chi_selector.get_support()
    chi_feature = X.loc[:, chi_support].columns.tolist()

    return chi_support, chi_feature

def rfe_selector(X, y, num_feats):

    rfe_selector = RFE(estimator =LogisticRegression(),
                      n_features_to_select = num_feats,
                      step =10, verbose =5
                      )
    X_norm = MinMaxScaler().fit_transform(X)
    rfe_selector.fit(X_norm,y)
    rfe_support = rfe_selector.get_support()
    rfe_feature = X.loc[:, rfe_support].columns.tolist()

    return rfe_support, rfe_feature

def embedded_log_reg_selector(X, y, num_feats):

    X_norm = MinMaxScaler().fit_transform(X)
    embedded_lr_selector = SelectFromModel(LogisticRegression(penalty="l2"), max_features=num_feats)
    embedded_lr_selector.fit(X_norm,y)
    
    embedded_lr_support = embedded_lr_selector.get_support()
    embedded_lr_feature = X.loc[:,embedded_lr_support].columns.tolist()

    return embedded_lr_support, embedded_lr_feature

def embedded_rf_selector(X, y, num_feats):

    embedded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), max_features=num_feats)
    embedded_rf_selector.fit(X,y)
    
    embedded_rf_support = embedded_rf_selector.get_support()
    embedded_rf_feature = X.loc[:, embedded_rf_support].columns.tolist()

    return embedded_rf_support, embedded_rf_feature

def embedded_lgbm_selector(X, y, num_feats):

    lgbc = LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves =32, colsample_bytree=0.2,
                         reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)
    embedded_lgbm_selector = SelectFromModel(lgbc, max_features=num_feats)
    embedded_lgbm_selector.fit(X,y)
    
    embedded_lgbm_support = embedded_lgbm_selector.get_support()
    embedded_lgbm_feature = X.loc[:, embedded_lgbm_support].columns.tolist()

    return embedded_lgbm_support, embedded_lgbm_feature

### Data Preprocessing

In [11]:
def preprocess_dataset(dataset_path):
    # Your code starts here (Multiple lines)
    player_df = pd.read_csv(dataset_path)
    numcols = ['Overall', 'Crossing','Finishing',  'ShortPassing',  'Dribbling','LongPassing', 'BallControl', 'Acceleration','SprintSpeed', 'Agility',  'Stamina','Volleys','FKAccuracy','Reactions','Balance','ShotPower','Strength','LongShots','Aggression','Interceptions']
    catcols = ['Preferred Foot','Position','Body Type','Nationality','Weak Foot']
    player_df = player_df[numcols+catcols]
    traindf = pd.concat([player_df[numcols], pd.get_dummies(player_df[catcols])],axis=1)
    features = traindf.columns

    traindf = traindf.dropna()
    traindf = pd.DataFrame(traindf,columns=features)
    
    y = traindf['Overall']>=87
    X = traindf.copy()
    del X['Overall']
    
    feature_name = list(X.columns)
    
    num_feats = 30

    return X, y, num_feats

### Putting everything together

In [8]:
def autoFeatureSelector(dataset_path, methods=[]):
    # Parameters
    # data - dataset to be analyzed (csv file)
    # methods - various feature selection methods we outlined before, use them all here (list)
    
    # preprocessing
    X, y, num_feats = preprocess_dataset(dataset_path)
    
    # Run every method we outlined above from the methods list and collect returned best features from every method
    if 'pearson' in methods:
        cor_support, cor_feature = cor_selector(X, y,num_feats)
    if 'chi-square' in methods:
        chi_support, chi_feature = chi_squared_selector(X, y,num_feats)
    if 'rfe' in methods:
        rfe_support, rfe_feature = rfe_selector(X, y,num_feats)
    if 'log-reg' in methods:
        embedded_lr_support, embedded_lr_feature = embedded_log_reg_selector(X, y, num_feats)
    if 'rf' in methods:
        embedded_rf_support, embedded_rf_feature = embedded_rf_selector(X, y, num_feats)
    if 'lgbm' in methods:
        embedded_lgbm_support, embedded_lgbm_feature = embedded_lgbm_selector(X, y, num_feats)
    
    
    # Combining all the above feature list and count the maximum set of features that got selected by all methods

    pd.set_option('display.max_rows', None)
    # put all selection together
    feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support, 'Logistics':embedded_lr_support,
                                    'Random Forest':embedded_rf_support, 'LightGBM':embedded_lgbm_support})
    # Counting the selected times for each feature
    feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
    # Displaying the top 100
    feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
    feature_selection_df.index = range(1, len(feature_selection_df)+1)
    feature_selection_df.head(num_feats)
    
    best_features = list(feature_selection_df["Feature"].head())

    return best_features

if __name__ == '__main__':
    dataset_path = input("Enter the path of Dataset:  ")
    methods = input("Enter the methods used with 1 space between them:  ").lower().split()
    best_features = autoFeatureSelector(dataset_path, methods=['pearson', 'chi-square', 'rfe', 'log-reg', 'rf', 'lgbm'])
    print("The best features are as follows: ",best_features)
    close = input("Press Enter to Close the Program :  ")

Enter the path of Dataset:  https://raw.githubusercontent.com/SukhsimarSingh/GBC-ML1/main/fifa19.csv
Enter the methods used with 1 space between them:  pearson rfe
Fitting estimator with 223 features.
Fitting estimator with 213 features.
Fitting estimator with 203 features.
Fitting estimator with 193 features.
Fitting estimator with 183 features.
Fitting estimator with 173 features.
Fitting estimator with 163 features.
Fitting estimator with 153 features.
Fitting estimator with 143 features.
Fitting estimator with 133 features.
Fitting estimator with 123 features.
Fitting estimator with 113 features.
Fitting estimator with 103 features.
Fitting estimator with 93 features.
Fitting estimator with 83 features.
Fitting estimator with 73 features.
Fitting estimator with 63 features.
Fitting estimator with 53 features.
Fitting estimator with 43 features.
Fitting estimator with 33 features.


  return reduction(axis=axis, out=out, **passkwargs)


The best features are as follows:  ['Volleys', 'ShortPassing', 'Reactions', 'LongPassing', 'Finishing']
Press Enter to Close the Program :  
