# Task 7: AutoFeatureSelector Tool
## This task is to test your understanding of various Feature Selection methods outlined in the lecture and the ability to apply this knowledge in a real-world dataset to select best features and also to build an automated feature selection tool as your toolkit

### Use your knowledge of different feature selector methods to build an Automatic Feature Selection tool
- Pearson Correlation
- Chi-Square
- RFE
- Embedded
- Tree (Random Forest)
- Tree (Light GBM)

### Dataset: FIFA 19 Player Skills
#### Attributes: FIFA 2019 players attributes like Age, Nationality, Overall, Potential, Club, Value, Wage, Preferred Foot, International Reputation, Weak Foot, Skill Moves, Work Rate, Position, Jersey Number, Joined, Loaned From, Contract Valid Until, Height, Weight, LS, ST, RS, LW, LF, CF, RF, RW, LAM, CAM, RAM, LM, LCM, CM, RCM, RM, LWB, LDM, CDM, RDM, RWB, LB, LCB, CB, RCB, RB, Crossing, Finishing, Heading, Accuracy, ShortPassing, Volleys, Dribbling, Curve, FKAccuracy, LongPassing, BallControl, Acceleration, SprintSpeed, Agility, Reactions, Balance, ShotPower, Jumping, Stamina, Strength, LongShots, Aggression, Interceptions, Positioning, Vision, Penalties, Composure, Marking, StandingTackle, SlidingTackle, GKDiving, GKHandling, GKKicking, GKPositioning, GKReflexes, and Release Clause.

In [7]:
%matplotlib inline
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as ss
from collections import Counter
import math
from scipy import stats

In [8]:
player_df = pd.read_csv(r"D:\AI GBC\Machine Learning 1\Assignments & Tasks\Task 7 AutoFeatureSelector Tool\fifa19.csv")

In [9]:
numcols = ['Overall', 'Crossing','Finishing',  'ShortPassing',  'Dribbling','LongPassing', 'BallControl', 'Acceleration','SprintSpeed', 'Agility',  'Stamina','Volleys','FKAccuracy','Reactions','Balance','ShotPower','Strength','LongShots','Aggression','Interceptions']
catcols = ['Preferred Foot','Position','Body Type','Nationality','Weak Foot']

In [10]:
player_df = player_df[numcols+catcols]

In [11]:
traindf = pd.concat([player_df[numcols], pd.get_dummies(player_df[catcols])],axis=1)
features = traindf.columns

traindf = traindf.dropna()

In [12]:
traindf = pd.DataFrame(traindf,columns=features)

In [13]:
y = traindf['Overall']>=87
X = traindf.copy()
del X['Overall']

In [14]:
X.head()

Unnamed: 0,Crossing,Finishing,ShortPassing,Dribbling,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Stamina,...,Nationality_Uganda,Nationality_Ukraine,Nationality_United Arab Emirates,Nationality_United States,Nationality_Uruguay,Nationality_Uzbekistan,Nationality_Venezuela,Nationality_Wales,Nationality_Zambia,Nationality_Zimbabwe
0,84.0,95.0,90.0,97.0,87.0,96.0,91.0,86.0,91.0,72.0,...,False,False,False,False,False,False,False,False,False,False
1,84.0,94.0,81.0,88.0,77.0,94.0,89.0,91.0,87.0,88.0,...,False,False,False,False,False,False,False,False,False,False
2,79.0,87.0,84.0,96.0,78.0,95.0,94.0,90.0,96.0,81.0,...,False,False,False,False,False,False,False,False,False,False
3,17.0,13.0,50.0,18.0,51.0,42.0,57.0,58.0,60.0,43.0,...,False,False,False,False,False,False,False,False,False,False
4,93.0,82.0,92.0,86.0,91.0,91.0,78.0,76.0,79.0,90.0,...,False,False,False,False,False,False,False,False,False,False


In [15]:
len(X.columns)

223

### Set some fixed set of features

In [17]:
feature_name = list(X.columns)
# no of maximum features we need to select
num_feats=30

## Filter Feature Selection - Pearson Correlation

### Pearson Correlation function

In [20]:
# Pearson Correlation

def cor_selector(X, y, num_feats):
    cor_list = []
    for i in X.columns:
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    cor_abs = np.abs(cor_list)
    cor_support = np.argsort(cor_abs)[-num_feats:]
    cor_feature = X.columns[cor_support].tolist()
    return cor_support, cor_feature

cor_support, cor_feature = cor_selector(X, y, num_feats)
print(f"{len(cor_feature)} selected features: {cor_feature}")

30 selected features: ['Nationality_Costa Rica', 'Position_LAM', 'Nationality_Uruguay', 'Acceleration', 'SprintSpeed', 'Strength', 'Nationality_Gabon', 'Nationality_Slovenia', 'Stamina', 'Weak Foot', 'Agility', 'Crossing', 'Nationality_Belgium', 'Dribbling', 'ShotPower', 'LongShots', 'Finishing', 'BallControl', 'FKAccuracy', 'LongPassing', 'Volleys', 'ShortPassing', 'Position_RF', 'Position_LF', 'Body Type_PLAYER_BODY_TYPE_25', 'Body Type_Courtois', 'Body Type_Neymar', 'Body Type_Messi', 'Body Type_C. Ronaldo', 'Reactions']


In [21]:
cor_support, cor_feature = cor_selector(X, y,num_feats)
print(str(len(cor_feature)), 'selected features')

30 selected features


### List the selected features from Pearson Correlation

In [23]:
cor_feature

['Nationality_Costa Rica',
 'Position_LAM',
 'Nationality_Uruguay',
 'Acceleration',
 'SprintSpeed',
 'Strength',
 'Nationality_Gabon',
 'Nationality_Slovenia',
 'Stamina',
 'Weak Foot',
 'Agility',
 'Crossing',
 'Nationality_Belgium',
 'Dribbling',
 'ShotPower',
 'LongShots',
 'Finishing',
 'BallControl',
 'FKAccuracy',
 'LongPassing',
 'Volleys',
 'ShortPassing',
 'Position_RF',
 'Position_LF',
 'Body Type_PLAYER_BODY_TYPE_25',
 'Body Type_Courtois',
 'Body Type_Neymar',
 'Body Type_Messi',
 'Body Type_C. Ronaldo',
 'Reactions']

## Filter Feature Selection - Chi-Sqaure

In [25]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

### Chi-Squared Selector function

In [27]:
# Chi-Squared Selector

def chi_squared_selector(X, y, num_feats):
    X_norm = MinMaxScaler().fit_transform(X)
    chi_selector = SelectKBest(chi2, k=num_feats)
    chi_selector.fit(X_norm, y)
    chi_support = chi_selector.get_support()
    chi_feature = X.columns[chi_support].tolist()
    return chi_support, chi_feature

chi_support, chi_feature = chi_squared_selector(X, y, num_feats)
print(f"{len(chi_feature)} selected features: {chi_feature}")

30 selected features: ['Finishing', 'ShortPassing', 'LongPassing', 'BallControl', 'Volleys', 'FKAccuracy', 'Reactions', 'LongShots', 'Position_CM', 'Position_LAM', 'Position_LF', 'Position_LW', 'Position_RB', 'Position_RF', 'Body Type_C. Ronaldo', 'Body Type_Courtois', 'Body Type_Messi', 'Body Type_Neymar', 'Body Type_PLAYER_BODY_TYPE_25', 'Nationality_Belgium', 'Nationality_Costa Rica', 'Nationality_Croatia', 'Nationality_Egypt', 'Nationality_England', 'Nationality_France', 'Nationality_Gabon', 'Nationality_Slovakia', 'Nationality_Slovenia', 'Nationality_Spain', 'Nationality_Uruguay']


In [28]:
chi_support, chi_feature = chi_squared_selector(X, y,num_feats)
print(str(len(chi_feature)), 'selected features')

30 selected features


### List the selected features from Chi-Square 

In [30]:
chi_feature

['Finishing',
 'ShortPassing',
 'LongPassing',
 'BallControl',
 'Volleys',
 'FKAccuracy',
 'Reactions',
 'LongShots',
 'Position_CM',
 'Position_LAM',
 'Position_LF',
 'Position_LW',
 'Position_RB',
 'Position_RF',
 'Body Type_C. Ronaldo',
 'Body Type_Courtois',
 'Body Type_Messi',
 'Body Type_Neymar',
 'Body Type_PLAYER_BODY_TYPE_25',
 'Nationality_Belgium',
 'Nationality_Costa Rica',
 'Nationality_Croatia',
 'Nationality_Egypt',
 'Nationality_England',
 'Nationality_France',
 'Nationality_Gabon',
 'Nationality_Slovakia',
 'Nationality_Slovenia',
 'Nationality_Spain',
 'Nationality_Uruguay']

## Wrapper Feature Selection - Recursive Feature Elimination

In [32]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

### RFE Selector function

In [34]:
def rfe_selector(X, y, num_feats):
    # Scale the features
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Initialize Logistic Regression with a higher max_iter
    model = LogisticRegression(max_iter=5000, solver='lbfgs')
    rfe = RFE(estimator=model, n_features_to_select=num_feats)
    rfe.fit(X_scaled, y)
    
    # Get support and selected features
    rfe_support = rfe.get_support()
    rfe_feature = X.columns[rfe_support].tolist()
    return rfe_support, rfe_feature

# Call the function
rfe_support, rfe_feature = rfe_selector(X, y, num_feats)
print(f"{len(rfe_feature)} selected features: {rfe_feature}")

30 selected features: ['Finishing', 'ShortPassing', 'LongPassing', 'BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 'Volleys', 'Reactions', 'Strength', 'Position_CM', 'Position_GK', 'Position_LCB', 'Position_LF', 'Position_LM', 'Position_RB', 'Position_RCB', 'Position_RW', 'Body Type_Courtois', 'Body Type_Lean', 'Body Type_Normal', 'Body Type_Stocky', 'Nationality_Belgium', 'Nationality_Costa Rica', 'Nationality_Croatia', 'Nationality_Gabon', 'Nationality_Netherlands', 'Nationality_Slovenia', 'Nationality_Uruguay', 'Nationality_Wales']


In [35]:
rfe_support, rfe_feature = rfe_selector(X, y,num_feats)
print(str(len(rfe_feature)), 'selected features')

30 selected features


### List the selected features from RFE

In [37]:
rfe_feature

['Finishing',
 'ShortPassing',
 'LongPassing',
 'BallControl',
 'Acceleration',
 'SprintSpeed',
 'Agility',
 'Volleys',
 'Reactions',
 'Strength',
 'Position_CM',
 'Position_GK',
 'Position_LCB',
 'Position_LF',
 'Position_LM',
 'Position_RB',
 'Position_RCB',
 'Position_RW',
 'Body Type_Courtois',
 'Body Type_Lean',
 'Body Type_Normal',
 'Body Type_Stocky',
 'Nationality_Belgium',
 'Nationality_Costa Rica',
 'Nationality_Croatia',
 'Nationality_Gabon',
 'Nationality_Netherlands',
 'Nationality_Slovenia',
 'Nationality_Uruguay',
 'Nationality_Wales']

## Embedded Selection - Lasso: SelectFromModel

In [39]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

In [40]:
# Embedded Method - Logistic Regression (Lasso)

def embedded_log_reg_selector(X, y, num_feats):
    model = SelectFromModel(LogisticRegression(C=0.1, penalty='l1', solver='liblinear', max_iter=1000), max_features=num_feats)
    model.fit(X, y)
    embedded_lr_support = model.get_support()
    embedded_lr_feature = X.columns[embedded_lr_support].tolist()
    return embedded_lr_support, embedded_lr_feature

embedded_lr_support, embedded_lr_feature = embedded_log_reg_selector(X, y, num_feats)
print(f"{len(embedded_lr_feature)} selected features: {embedded_lr_feature}")

19 selected features: ['Crossing', 'Finishing', 'Dribbling', 'LongPassing', 'BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 'Stamina', 'Volleys', 'FKAccuracy', 'Reactions', 'Balance', 'ShotPower', 'Strength', 'Aggression', 'Interceptions', 'Weak Foot', 'Body Type_Lean']


In [41]:
embedded_lr_support, embedded_lr_feature = embedded_log_reg_selector(X, y, num_feats)
print(str(len(embedded_lr_feature)), 'selected features')

19 selected features


In [42]:
embedded_lr_feature

['Crossing',
 'Finishing',
 'Dribbling',
 'LongPassing',
 'BallControl',
 'Acceleration',
 'SprintSpeed',
 'Agility',
 'Stamina',
 'Volleys',
 'FKAccuracy',
 'Reactions',
 'Balance',
 'ShotPower',
 'Strength',
 'Aggression',
 'Interceptions',
 'Weak Foot',
 'Body Type_Lean']

## Tree based(Random Forest): SelectFromModel

In [44]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

In [45]:
# Embedded Method - Random Forest

def embedded_rf_selector(X, y, num_feats):
    model = SelectFromModel(RandomForestClassifier(n_estimators=100), max_features=num_feats)
    model.fit(X, y)
    embedded_rf_support = model.get_support()
    embedded_rf_feature = X.columns[embedded_rf_support].tolist()
    return embedded_rf_support, embedded_rf_feature

embedded_rf_support, embedded_rf_feature = embedded_rf_selector(X, y, num_feats)
print(f"{len(embedded_rf_feature)} selected features: {embedded_rf_feature}")


24 selected features: ['Crossing', 'Finishing', 'ShortPassing', 'Dribbling', 'LongPassing', 'BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 'Stamina', 'Volleys', 'FKAccuracy', 'Reactions', 'Balance', 'ShotPower', 'Strength', 'LongShots', 'Aggression', 'Interceptions', 'Weak Foot', 'Preferred Foot_Right', 'Body Type_Courtois', 'Body Type_Lean', 'Nationality_Slovenia']


In [46]:
embedder_rf_support, embedder_rf_feature = embedded_rf_selector(X, y, num_feats)
print(str(len(embedded_rf_feature)), 'selected features')

24 selected features


In [47]:
embedded_rf_feature

['Crossing',
 'Finishing',
 'ShortPassing',
 'Dribbling',
 'LongPassing',
 'BallControl',
 'Acceleration',
 'SprintSpeed',
 'Agility',
 'Stamina',
 'Volleys',
 'FKAccuracy',
 'Reactions',
 'Balance',
 'ShotPower',
 'Strength',
 'LongShots',
 'Aggression',
 'Interceptions',
 'Weak Foot',
 'Preferred Foot_Right',
 'Body Type_Courtois',
 'Body Type_Lean',
 'Nationality_Slovenia']

## Tree based(Light GBM): SelectFromModel

In [70]:
from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier

In [72]:
# Embedded Method - LightGBM
def embedded_lgbm_selector(X, y, num_feats):
    model = SelectFromModel(LGBMClassifier(n_estimators=100), max_features=num_feats)
    model.fit(X, y)
    embedded_lgbm_support = model.get_support()
    embedded_lgbm_feature = X.columns[embedded_lgbm_support].tolist()
    return embedded_lgbm_support, embedded_lgbm_feature

# Call the function
embedded_lgbm_support, embedded_lgbm_feature = embedded_lgbm_selector(X, y, num_feats)
print(f"{len(embedded_lgbm_feature)} selected features: {embedded_lgbm_feature}")

[LightGBM] [Info] Number of positive: 55, number of negative: 18104
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000796 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1812
[LightGBM] [Info] Number of data points in the train set: 18159, number of used features: 124
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.003029 -> initscore=-5.796555
[LightGBM] [Info] Start training from score -5.796555
22 selected features: ['Crossing', 'Finishing', 'ShortPassing', 'Dribbling', 'LongPassing', 'BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 'Stamina', 'Volleys', 'FKAccuracy', 'Reactions', 'Balance', 'ShotPower', 'Strength', 'LongShots', 'Aggression', 'Interceptions', 'Position_LCB', 'Body Type_Lean', 'Nationality_Italy']


In [74]:
embedded_lgbm_feature

['Crossing',
 'Finishing',
 'ShortPassing',
 'Dribbling',
 'LongPassing',
 'BallControl',
 'Acceleration',
 'SprintSpeed',
 'Agility',
 'Stamina',
 'Volleys',
 'FKAccuracy',
 'Reactions',
 'Balance',
 'ShotPower',
 'Strength',
 'LongShots',
 'Aggression',
 'Interceptions',
 'Position_LCB',
 'Body Type_Lean',
 'Nationality_Italy']

## Putting all of it together: AutoFeatureSelector Tool

In [None]:
pip install lightgbm


In [None]:
pip show lightgbm

In [81]:
# Ensure all feature selection lists have the same length
min_length = min(len(feature_name), len(cor_support), len(chi_support), len(rfe_support), 
                 len(embedded_lr_support), len(embedded_rf_support), len(embedded_lgbm_support))

# Truncate arrays to the minimum length (if necessary)
feature_name = feature_name[:min_length]
cor_support = cor_support[:min_length]
chi_support = chi_support[:min_length]
rfe_support = rfe_support[:min_length]
embedded_lr_support = embedded_lr_support[:min_length]
embedded_rf_support = embedded_rf_support[:min_length]
embedded_lgbm_support = embedded_lgbm_support[:min_length]

# Create the DataFrame
feature_selection_df = pd.DataFrame({
    'Feature': feature_name,
    'Pearson': cor_support,
    'Chi-2': chi_support,
    'RFE': rfe_support,
    'Logistics': embedded_lr_support,
    'Random Forest': embedded_rf_support,
    'LightGBM': embedded_lgbm_support
})

# Count the selected times for each feature
feature_selection_df['Total'] = feature_selection_df.iloc[:, 1:].sum(axis=1)

# Sort and reset index
feature_selection_df = feature_selection_df.sort_values(['Total', 'Feature'], ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df) + 1)

# Display the top 100 features
feature_selection_df.head(100)


Unnamed: 0,Feature,Pearson,Chi-2,RFE,Logistics,Random Forest,LightGBM,Total
1,ShortPassing,217,True,True,False,True,True,221
2,SprintSpeed,195,False,True,True,True,True,199
3,Acceleration,115,False,True,True,True,True,119
4,Crossing,93,False,False,True,True,True,96
5,Reactions,72,True,True,True,True,True,77
6,Position_CM,54,True,True,False,False,False,56
7,Position_CDM,56,False,False,False,False,False,56
8,Position_GK,53,False,True,False,False,False,54
9,Position_LAM,50,True,False,False,False,False,51
10,Position_CF,51,False,False,False,False,False,51


## Can you build a Python script that takes dataset and a list of different feature selection methods that you want to try and output the best (maximum votes) features from all methods?

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
import lightgbm as lgb

In [92]:
# Define feature selection methods
def cor_selector(X, y, num_feats):
    cor_list = []
    for col in X.columns:
        cor = np.corrcoef(X[col], y)[0, 1]
        cor_list.append(abs(cor))
    cor_feature = X.columns[np.argsort(cor_list)[-num_feats:]]
    cor_support = [True if i in cor_feature else False for i in X.columns]
    return cor_support, cor_feature

def chi_squared_selector(X, y, num_feats):
    chi_selector = SelectKBest(chi2, k=num_feats)
    chi_selector.fit(X, y)
    chi_support = chi_selector.get_support()
    chi_feature = X.loc[:, chi_support].columns.tolist()
    return chi_support, chi_feature

def rfe_selector(X, y, num_feats):
    model = LogisticRegression()
    rfe = RFE(estimator=model, n_features_to_select=num_feats, step=1)
    rfe.fit(X, y)
    rfe_support = rfe.get_support()
    rfe_feature = X.loc[:, rfe_support].columns.tolist()
    return rfe_support, rfe_feature

def embedded_log_reg_selector(X, y, num_feats):
    model = LogisticRegression()
    model.fit(X, y)
    importance = np.abs(model.coef_).flatten()
    idxs = np.argsort(importance)[-num_feats:]
    embedded_lr_feature = X.columns[idxs]
    embedded_lr_support = [True if i in embedded_lr_feature else False for i in X.columns]
    return embedded_lr_support, embedded_lr_feature

def embedded_rf_selector(X, y, num_feats):
    model = RandomForestClassifier()
    model.fit(X, y)
    importances = model.feature_importances_
    idxs = np.argsort(importances)[-num_feats:]
    embedded_rf_feature = X.columns[idxs]
    embedded_rf_support = [True if i in embedded_rf_feature else False for i in X.columns]
    return embedded_rf_support, embedded_rf_feature

def embedded_lgbm_selector(X, y, num_feats):
    model = lgb.LGBMClassifier()
    model.fit(X, y)
    importances = model.feature_importances_
    idxs = np.argsort(importances)[-num_feats:]
    embedded_lgbm_feature = X.columns[idxs]
    embedded_lgbm_support = [True if i in embedded_lgbm_feature else False for i in X.columns]
    return embedded_lgbm_support, embedded_lgbm_feature

# Preprocessing function
def preprocess_dataset(dataset_path, target_column):
    dataset = pd.read_csv(dataset_path)
    # Handle non-numeric features
    dataset = pd.get_dummies(dataset, drop_first=True)
    X = dataset.drop(columns=[target_column])
    y = dataset[target_column]
    # Standardize features
    scaler = StandardScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    num_feats = 30
    return X, y, num_feats


In [94]:
# Main function
def autoFeatureSelector(dataset_path, target_column, methods=[]):
    X, y, num_feats = preprocess_dataset(dataset_path, target_column)
    selected_features = {}
    
    if 'pearson' in methods:
        cor_support, cor_feature = cor_selector(X, y, num_feats)
        selected_features['pearson'] = cor_support
    if 'chi-square' in methods:
        chi_support, chi_feature = chi_squared_selector(X, y, num_feats)
        selected_features['chi-square'] = chi_support
    if 'rfe' in methods:
        rfe_support, rfe_feature = rfe_selector(X, y, num_feats)
        selected_features['rfe'] = rfe_support
    if 'log-reg' in methods:
        embedded_lr_support, embedded_lr_feature = embedded_log_reg_selector(X, y, num_feats)
        selected_features['log-reg'] = embedded_lr_support
    if 'rf' in methods:
        embedded_rf_support, embedded_rf_feature = embedded_rf_selector(X, y, num_feats)
        selected_features['rf'] = embedded_rf_support
    if 'lgbm' in methods:
        embedded_lgbm_support, embedded_lgbm_feature = embedded_lgbm_selector(X, y, num_feats)
        selected_features['lgbm'] = embedded_lgbm_support
    
    # Combine results
    feature_votes = pd.DataFrame(selected_features)
    feature_votes['Total'] = feature_votes.sum(axis=1)
    best_features = feature_votes.sort_values('Total', ascending=False).head(num_feats).index.tolist()
    return best_features


In [96]:
# Run the function
best_features = autoFeatureSelector(
    dataset_path="fifa19.csv", 
    target_column='Overall', 
    methods=['pearson', 'chi-square', 'rfe', 'log-reg', 'rf', 'lgbm']
)
print(best_features)

ValueError: Input X contains NaN.
SelectKBest does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [107]:
from sklearn.feature_selection import mutual_info_classif


In [121]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb

def autoFeatureSelector(dataset_path, target_column, methods, num_feats=10):
    # Load the dataset
    data = pd.read_csv(dataset_path)
    
    # Separate features (X) and target (y)
    X = data.drop(columns=[target_column])
    y = data[target_column]
    
    # Split numeric and non-numeric columns
    numeric_cols = X.select_dtypes(include=['number']).columns
    non_numeric_cols = X.select_dtypes(exclude=['number']).columns
    
    # Handle missing values
    imputer_numeric = SimpleImputer(strategy='mean')  # For numeric columns
    imputer_categorical = SimpleImputer(strategy='most_frequent')  # For categorical columns
    
    X_numeric = pd.DataFrame(imputer_numeric.fit_transform(X[numeric_cols]), columns=numeric_cols)
    X_non_numeric = pd.DataFrame(imputer_categorical.fit_transform(X[non_numeric_cols]), columns=non_numeric_cols)
    
    # Combine numeric and non-numeric columns
    X = pd.concat([X_numeric, X_non_numeric], axis=1)
    
    # Scale numeric data
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X_numeric), columns=numeric_cols)
    
    # Initialize a dictionary to store selected features
    selected_features = {}
    
    # Pearson correlation
    if 'pearson' in methods:
        cor = X_scaled.corrwith(y).abs().sort_values(ascending=False)
        cor_support = cor.head(num_feats).index.tolist()
        selected_features['pearson'] = cor_support
    
    # Chi-square
    if 'chi-square' in methods:
        # Ensure all values in X_scaled are non-negative
        X_scaled_non_negative = X_scaled - X_scaled.min(axis=0)
        chi_selector = SelectKBest(chi2, k=num_feats)
        chi_selector.fit(X_scaled_non_negative, y)
        chi_support = chi_selector.get_support()
        selected_features['chi-square'] = X_scaled_non_negative.loc[:, chi_support].columns.tolist()
    
    # Recursive Feature Elimination (RFE)
    if 'rfe' in methods:
        model = LogisticRegression(max_iter=1000)
        rfe = RFE(model, n_features_to_select=num_feats)
        rfe.fit(X_scaled, y)
        rfe_support = rfe.get_support()
        selected_features['rfe'] = X_scaled.loc[:, rfe_support].columns.tolist()
    
    # Logistic Regression
    if 'log-reg' in methods:
        log_model = LogisticRegression(max_iter=1000)
        log_model.fit(X_scaled, y)
        log_importances = pd.Series(log_model.coef_[0], index=numeric_cols)
        selected_features['log-reg'] = log_importances.abs().nlargest(num_feats).index.tolist()
    
    # Random Forest
    if 'rf' in methods:
        rf_model = RandomForestClassifier()
        rf_model.fit(X_scaled, y)
        rf_importances = pd.Series(rf_model.feature_importances_, index=numeric_cols)
        selected_features['rf'] = rf_importances.nlargest(num_feats).index.tolist()
    
    # LightGBM
    if 'lgbm' in methods:
        lgb_model = lgb.LGBMClassifier()
        lgb_model.fit(X_scaled, y)
        lgb_importances = pd.Series(lgb_model.feature_importances_, index=numeric_cols)
        selected_features['lgbm'] = lgb_importances.nlargest(num_feats).index.tolist()
    
    return selected_features

# Run the function
best_features = autoFeatureSelector(
    dataset_path="fifa19.csv", 
    target_column='Overall', 
    methods=['pearson', 'chi-square', 'rfe', 'log-reg', 'rf', 'lgbm']
)
print(best_features)


LightGBMError: Do not support special JSON characters in feature name.

### Last, Can you turn this notebook into a python script, run it and submit the python (.py) file that takes dataset and list of methods as inputs and outputs the best features