[Reference blog post: feature selection with real and categorical data](https://machinelearningmastery.com/feature-selection-with-real-and-categorical-data/)

## Imports

In [5]:
# data manipulation
import pandas as pd
import numpy as np

# visualization
import seaborn as sns
import matplotlib.pyplot as plt

# statistical testing
from scipy import stats

# train test split
from sklearn.model_selection import train_test_split, cross_val_score

# models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression

# metrics
from sklearn.metrics import f1_score

# general
import os
import warnings
warnings.filterwarnings('ignore')

## Read Data

In [10]:
# Relative path of the transformed training CSV that is loaded into `df`.
# NOTE(review): despite its name, DATA_DIR holds a file path rather than a
# directory; the name is kept because other cells may reference it.
# NOTE(review): the file name is spelled "iteratoin4" - confirm it matches the
# file on disk before correcting the spelling.
DATA_DIR = "Dataset/clean_data/Transformed_data/train_iteratoin4.csv"
df = pd.read_csv(DATA_DIR)

In [11]:
# Show the size of the loaded frame as (rows, columns).
df.shape

(891, 33)

In [12]:
# Preview the first rows of the loaded frame (head() defaults to five rows).
df.head()

Unnamed: 0,Survived,Name_Words,Name_Length,Name_Init_labelencode,Name_Init_ordinalencode,Name_Init_Capt,Name_Init_Col,Name_Init_Countess,Name_Init_Don,Name_Init_Dr,...,Embraked_labelencoded,Embarked_ordinalencode,Embarked_C,Embarked_Q,Embarked_S,Pclass,SibSp,Parch,Age_MinMax,Fare_MinMax
0,0,4,23,12,0.0,0.0,0.0,0.0,0.0,0.0,...,2,0.0,0.0,0.0,1.0,3,1,0,0.271174,0.014151
1,1,7,51,13,2.0,0.0,0.0,0.0,0.0,0.0,...,0,1.0,1.0,0.0,0.0,1,1,0,0.472229,0.139136
2,1,3,22,9,1.0,0.0,0.0,0.0,0.0,0.0,...,2,0.0,0.0,0.0,1.0,3,0,0,0.321438,0.015469
3,1,7,44,13,2.0,0.0,0.0,0.0,0.0,0.0,...,2,0.0,0.0,0.0,1.0,1,1,0,0.434531,0.103644
4,0,4,24,12,0.0,0.0,0.0,0.0,0.0,0.0,...,2,0.0,0.0,0.0,1.0,3,0,0,0.434531,0.015713


In [None]:
# Name of the `df` column treated as the target (label).
# NOTE(review): not referenced by any of the cells visible in this section.
TARGET_COL = "Survived"


# Feature Selection

## Filter Methods

In [17]:
# Background palette indexed by highlight_scores: entries 0-3 (blues) shade
# positive values, entries 4-7 (yellow to orange) shade zero and negative values.
colors = [
    '#0D47A1',  # colors[0]: value >= 0.75
    '#1976D2',  # colors[1]: 0.50 <= value < 0.75
    '#64B5F6',  # colors[2]: 0.25 <= value < 0.50
    '#90CAF9',  # colors[3]: 0 < value < 0.25
    '#FFEB3B',  # colors[4]: -0.25 <= value <= 0
    '#FFC107',  # colors[5]: -0.50 <= value < -0.25
    '#FF9800',  # colors[6]: -0.75 <= value < -0.50
    '#F57C00',  # colors[7]: value < -0.75
]

In [18]:
def highlight_scores(value):
    """Map one correlation coefficient to a CSS declaration for a Styler cell.

    Positive values use ``colors[0]``-``colors[3]`` and zero or negative
    values use ``colors[4]``-``colors[7]``. Within each sign the band is
    chosen by magnitude in steps of 0.25. The strongest band gets green text,
    the weakest band red text, and the two middle bands black text.

    Parameters
    ----------
    value : float
        A single cell value, expected to lie in [-1, 1].

    Returns
    -------
    str
        ``'background-color: <hex>; color: <name>'``, or ``''`` (no styling)
        when ``value`` is missing (NaN).

    Notes
    -----
    Reads the module-level ``colors`` list, which must hold at least eight
    entries. A value of exactly ``0`` is grouped with the weakest negative
    band.
    """
    # NaN fails every comparison below, so it used to fall through to the
    # strongest-negative style; leave missing cells unstyled instead.
    if pd.isna(value):
        return ''

    if value > 0:
        if value >= 0.75:     # [0.75, 1.0]
            background, text = colors[0], 'green'
        elif value >= 0.50:   # [0.50, 0.75)
            background, text = colors[1], 'black'
        elif value >= 0.25:   # [0.25, 0.50)
            background, text = colors[2], 'black'
        else:                 # (0.0, 0.25)
            background, text = colors[3], 'red'
    else:
        if value >= -0.25:    # [-0.25, 0.0]
            background, text = colors[4], 'red'
        elif value >= -0.50:  # [-0.50, -0.25)
            background, text = colors[5], 'black'
        elif value >= -0.75:  # [-0.75, -0.50)
            background, text = colors[6], 'black'
        else:                 # below -0.75
            background, text = colors[7], 'green'

    return f'background-color: {background}; color: {text}'

### Correlation Coefficient

In [19]:
# Pairwise correlation matrices of `df`, one per method, computed in this order:
#   pearson  - standard (linear) correlation coefficient
#   kendall  - Kendall Tau rank correlation coefficient
#   spearman - Spearman rank correlation (captures monotonic relationships)
# The first name is spelled "person" (not "pearson"); it is kept because other
# cells reference it.
df_corr_person, df_corr_kendall, df_corr_spearman = (
    df.corr(method=method_name)
    for method_name in ("pearson", "kendall", "spearman")
)


In [20]:
# Render the Pearson matrix with each cell styled by highlight_scores.
# NOTE(review): `subset=df.columns` uses the source frame's column labels; this
# assumes every column of `df` also appears in the correlation matrix - confirm.
# NOTE(review): `Styler.applymap` is deprecated since pandas 2.1 in favour of
# `Styler.map`; switch once the minimum pandas version allows it.
df_corr_person.style.applymap(highlight_scores, subset=df.columns)

Unnamed: 0,Survived,Name_Words,Name_Length,Name_Init_labelencode,Name_Init_ordinalencode,Name_Init_Capt,Name_Init_Col,Name_Init_Countess,Name_Init_Don,Name_Init_Dr,Name_Init_Jonkheer,Name_Init_Lady,Name_Init_Major,Name_Init_Master,Name_Init_Miss,Name_Init_Mlle,Name_Init_Mme,Name_Init_Mr,Name_Init_Mrs,Name_Init_Ms,Name_Init_Rev,Name_Init_Sir,Sex_labelencode,Embraked_labelencoded,Embarked_ordinalencode,Embarked_C,Embarked_Q,Embarked_S,Pclass,SibSp,Parch,Age_MinMax,Fare_MinMax
Survived,1.0,0.276512,0.33235,-0.201345,0.294715,-0.026456,0.011329,0.04247,-0.026456,0.008185,-0.026456,0.04247,0.011329,0.085221,0.327093,0.060095,0.04247,-0.549199,0.33904,0.04247,-0.064988,0.04247,-0.543351,-0.167675,0.106811,0.16824,0.00365,-0.149683,-0.338481,-0.035322,0.081629,-0.06491,0.257307
Name_Words,0.276512,1.0,0.922586,0.193669,0.295847,-0.001932,-0.023035,0.112842,-0.001932,-0.037775,-0.001932,0.112842,-0.002734,-0.007858,-0.124525,0.037868,-0.001932,-0.353938,0.637876,-0.030626,-0.016493,0.084148,-0.375778,0.031348,-0.117474,0.021336,-0.147658,0.07423,-0.221348,0.141757,0.215977,0.064964,0.152981
Name_Length,0.33235,0.922586,1.0,0.099694,0.365015,0.003739,-0.010048,0.104917,-0.010715,-0.027074,0.01458,0.137438,0.023187,0.053399,-0.042814,0.03597,0.007353,-0.45648,0.638699,-0.007101,-0.008567,0.061555,-0.448759,0.022885,-0.107749,0.027481,-0.140941,0.06462,-0.220001,0.165019,0.252282,0.039702,0.155832
Name_Init_labelencode,-0.201345,0.193669,0.099694,1.0,-0.405843,-0.201798,-0.260099,-0.165831,-0.147847,-0.34475,-0.111879,-0.093895,-0.107416,-0.374668,-0.603751,-0.031074,-0.003976,0.491325,0.38554,0.049975,0.166935,0.085943,0.256407,0.066462,-0.122448,-0.023314,-0.122203,0.097327,0.045541,-0.191786,-0.119721,0.272384,-0.086359
Name_Init_ordinalencode,0.294715,0.295847,0.365015,-0.405843,1.0,0.192735,0.21315,0.171685,0.297986,0.176364,0.319036,0.255886,0.183364,0.293539,0.049636,0.153578,0.276936,-0.623161,0.293258,0.213786,0.214897,0.234836,-0.339311,-0.093935,0.050181,0.09948,-0.012511,-0.079275,-0.21264,0.155206,0.175888,0.000764,0.098668
Name_Init_Capt,-0.026456,-0.001932,0.003739,-0.201798,0.192735,1.0,-0.00159,-0.001124,-0.001124,-0.002983,-0.001124,-0.001124,-0.00159,-0.007267,-0.016983,-0.00159,-0.001124,-0.039411,-0.013541,-0.001124,-0.00276,-0.001124,0.024728,0.019641,-0.019067,-0.016158,-0.01031,0.020643,-0.052496,0.014507,0.025731,0.104685,0.026184
Name_Init_Col,0.011329,-0.023035,-0.010048,-0.260099,0.21315,-0.00159,1.0,-0.00159,-0.00159,-0.004221,-0.00159,-0.00159,-0.00225,-0.010283,-0.024031,-0.00225,-0.00159,-0.055767,-0.01916,-0.00159,-0.003905,-0.00159,0.03499,-0.032167,0.010348,0.037766,-0.014588,-0.023904,-0.074282,-0.022508,-0.022467,0.104389,-0.001126
Name_Init_Countess,0.04247,0.112842,0.104917,-0.165831,0.171685,-0.001124,-0.00159,1.0,-0.001124,-0.002983,-0.001124,-0.001124,-0.00159,-0.007267,-0.016983,-0.00159,-0.001124,-0.039411,-0.013541,-0.001124,-0.00276,-0.001124,-0.045439,0.019641,-0.019067,-0.016158,-0.01031,0.020643,-0.052496,-0.015907,-0.015878,0.009373,0.036645
Name_Init_Don,-0.026456,-0.001932,-0.010715,-0.147847,0.297986,-0.001124,-0.00159,-0.001124,1.0,-0.002983,-0.001124,-0.001124,-0.00159,-0.007267,-0.016983,-0.00159,-0.001124,-0.039411,-0.013541,-0.001124,-0.00276,-0.001124,0.024728,-0.065106,0.033694,0.069538,-0.01031,-0.05443,-0.052496,-0.015907,-0.015878,0.027405,-0.003026
Name_Init_Dr,0.008185,-0.037775,-0.027074,-0.34475,0.176364,-0.002983,-0.004221,-0.002983,-0.002983,1.0,-0.002983,-0.002983,-0.004221,-0.019292,-0.045085,-0.004221,-0.002983,-0.104624,-0.035947,-0.002983,-0.007327,-0.002983,0.039034,-0.028208,0.029419,0.022104,0.017874,-0.030612,-0.108935,0.00391,-0.04215,0.072752,0.030395


In [21]:
# Render the Kendall matrix with each cell styled by highlight_scores.
# NOTE(review): `subset=df.columns` uses the source frame's column labels; this
# assumes every column of `df` also appears in the correlation matrix - confirm.
# NOTE(review): `Styler.applymap` is deprecated since pandas 2.1 in favour of
# `Styler.map`; switch once the minimum pandas version allows it.
df_corr_kendall.style.applymap(highlight_scores, subset=df.columns)

Unnamed: 0,Survived,Name_Words,Name_Length,Name_Init_labelencode,Name_Init_ordinalencode,Name_Init_Capt,Name_Init_Col,Name_Init_Countess,Name_Init_Don,Name_Init_Dr,Name_Init_Jonkheer,Name_Init_Lady,Name_Init_Major,Name_Init_Master,Name_Init_Miss,Name_Init_Mlle,Name_Init_Mme,Name_Init_Mr,Name_Init_Mrs,Name_Init_Ms,Name_Init_Rev,Name_Init_Sir,Sex_labelencode,Embraked_labelencoded,Embarked_ordinalencode,Embarked_C,Embarked_Q,Embarked_S,Pclass,SibSp,Parch,Age_MinMax,Fare_MinMax
Survived,1.0,0.218077,0.254681,-0.081571,0.483828,-0.026456,0.011329,0.04247,-0.026456,0.008185,-0.026456,0.04247,0.011329,0.085221,0.327093,0.060095,0.04247,-0.549199,0.33904,0.04247,-0.064988,0.04247,-0.543351,-0.155747,0.132087,0.16824,0.00365,-0.149683,-0.323533,0.085915,0.133933,-0.031055,0.266229
Name_Words,0.218077,1.0,0.77203,0.256211,0.31183,0.006992,-0.021896,0.056846,0.006992,-0.032559,0.006992,0.056846,0.009894,0.021367,-0.086387,0.011223,0.006992,-0.281975,0.483035,-0.03794,-0.006759,0.053803,-0.285649,0.104597,-0.132372,-0.029916,-0.154178,0.123232,-0.195132,0.253047,0.220443,0.022368,0.221703
Name_Length,0.254681,0.77203,1.0,0.123827,0.376856,0.01445,-0.009416,0.047586,-0.006953,-0.013144,0.025911,0.048129,0.037856,0.082794,0.020996,0.014451,0.018524,-0.361098,0.417389,-0.00163,0.006271,0.041013,-0.331131,0.069712,-0.09741,-0.008044,-0.126905,0.086908,-0.171896,0.243909,0.22129,-0.010721,0.20644
Name_Init_labelencode,-0.081571,0.256211,0.123827,1.0,-0.20103,-0.061143,-0.086226,-0.06073,-0.060593,-0.159398,-0.059494,-0.059356,-0.083698,-0.363913,-0.61992,-0.040148,-0.028167,0.260245,0.620941,0.060181,0.149009,0.061143,0.086121,0.063639,-0.095023,0.005422,-0.138661,0.082509,-0.058717,-0.010587,-0.115206,0.289514,-0.007265
Name_Init_ordinalencode,0.483828,0.31183,0.376856,-0.20103,1.0,0.060318,0.084865,0.060181,0.061005,0.153927,0.061143,0.06073,0.084476,0.354138,0.337477,0.084087,0.060868,-0.901218,0.523246,0.060456,0.144622,0.060593,-0.694563,-0.083016,0.079331,0.07038,0.036184,-0.084426,-0.156041,0.307525,0.359764,-0.087882,0.275251
Name_Init_Capt,-0.026456,0.006992,0.01445,-0.061143,0.060318,1.0,-0.00159,-0.001124,-0.001124,-0.002983,-0.001124,-0.001124,-0.00159,-0.007267,-0.016983,-0.00159,-0.001124,-0.039411,-0.013541,-0.001124,-0.00276,-0.001124,0.024728,0.019848,-0.019848,-0.016158,-0.01031,0.020643,-0.046561,0.041114,0.049335,0.048357,0.036608
Name_Init_Col,0.011329,-0.021896,-0.009416,-0.086226,0.084865,-0.00159,1.0,-0.00159,-0.00159,-0.004221,-0.00159,-0.00159,-0.00225,-0.010283,-0.024031,-0.00225,-0.00159,-0.055767,-0.01916,-0.00159,-0.003905,-0.00159,0.03499,-0.027397,0.01857,0.037766,-0.014588,-0.023904,-0.065884,-0.030831,-0.025505,0.064611,0.03174
Name_Init_Countess,0.04247,0.056846,0.047586,-0.06073,0.060181,-0.001124,-0.00159,1.0,-0.001124,-0.002983,-0.001124,-0.001124,-0.00159,-0.007267,-0.016983,-0.00159,-0.001124,-0.039411,-0.013541,-0.001124,-0.00276,-0.001124,-0.045439,0.019848,-0.019848,-0.016158,-0.01031,0.020643,-0.046561,-0.021789,-0.018025,0.020575,0.040842
Name_Init_Don,-0.026456,0.006992,-0.006953,-0.060593,0.061005,-0.001124,-0.00159,-0.001124,1.0,-0.002983,-0.001124,-0.001124,-0.00159,-0.007267,-0.016983,-0.00159,-0.001124,-0.039411,-0.013541,-0.001124,-0.00276,-0.001124,0.024728,-0.058571,0.046095,0.069538,-0.01031,-0.05443,-0.046561,-0.021789,-0.018025,0.031798,0.019403
Name_Init_Dr,0.008185,-0.032559,-0.013144,-0.159398,0.153927,-0.002983,-0.004221,-0.002983,-0.002983,1.0,-0.002983,-0.002983,-0.004221,-0.019292,-0.045085,-0.004221,-0.002983,-0.104624,-0.035947,-0.002983,-0.007327,-0.002983,0.039034,-0.029003,0.029863,0.022104,0.017874,-0.030612,-0.102678,0.003708,-0.047851,0.05944,0.052383


In [23]:
# Render the Spearman matrix with each cell styled by highlight_scores.
# NOTE(review): `subset=df.columns` uses the source frame's column labels; this
# assumes every column of `df` also appears in the correlation matrix - confirm.
# NOTE(review): `Styler.applymap` is deprecated since pandas 2.1 in favour of
# `Styler.map`; switch once the minimum pandas version allows it.
df_corr_spearman.style.applymap(highlight_scores, subset=df.columns)

Unnamed: 0,Survived,Name_Words,Name_Length,Name_Init_labelencode,Name_Init_ordinalencode,Name_Init_Capt,Name_Init_Col,Name_Init_Countess,Name_Init_Don,Name_Init_Dr,Name_Init_Jonkheer,Name_Init_Lady,Name_Init_Major,Name_Init_Master,Name_Init_Miss,Name_Init_Mlle,Name_Init_Mme,Name_Init_Mr,Name_Init_Mrs,Name_Init_Ms,Name_Init_Rev,Name_Init_Sir,Sex_labelencode,Embraked_labelencoded,Embarked_ordinalencode,Embarked_C,Embarked_Q,Embarked_S,Pclass,SibSp,Parch,Age_MinMax,Fare_MinMax
Survived,1.0,0.234616,0.305809,-0.086869,0.515248,-0.026456,0.011329,0.04247,-0.026456,0.008185,-0.026456,0.04247,0.011329,0.085221,0.327093,0.060095,0.04247,-0.549199,0.33904,0.04247,-0.064988,0.04247,-0.543351,-0.160196,0.135861,0.16824,0.00365,-0.149683,-0.339668,0.088879,0.138266,-0.037004,0.323736
Name_Words,0.234616,1.0,0.880472,0.297565,0.356879,0.007523,-0.023556,0.061157,0.007523,-0.035028,0.007523,0.061157,0.010645,0.022987,-0.092938,0.012074,0.007523,-0.303359,0.519667,-0.040817,-0.007272,0.057883,-0.307312,0.114058,-0.148167,-0.032185,-0.16587,0.132578,-0.220829,0.285157,0.244629,0.031629,0.289798
Name_Length,0.305809,0.880472,1.0,0.177046,0.475457,0.01735,-0.011306,0.057139,-0.008349,-0.015782,0.031113,0.057791,0.045456,0.099416,0.025211,0.017352,0.022242,-0.433589,0.501181,-0.001957,0.007531,0.049246,-0.397607,0.086044,-0.12036,-0.009659,-0.152381,0.104356,-0.216458,0.306576,0.273374,-0.011747,0.298848
Name_Init_labelencode,-0.086869,0.297565,0.177046,1.0,-0.16103,-0.065113,-0.091825,-0.064674,-0.064528,-0.16975,-0.063357,-0.063211,-0.089134,-0.387546,-0.660178,-0.042755,-0.029996,0.277145,0.661266,0.064089,0.158686,0.065113,0.091714,0.069137,-0.104657,0.005774,-0.147665,0.087868,-0.064992,-0.019445,-0.127648,0.352187,-0.01127
Name_Init_ordinalencode,0.515248,0.356879,0.475457,-0.16103,1.0,0.064235,0.090376,0.064089,0.064967,0.163923,0.065113,0.064674,0.089962,0.377136,0.359393,0.089548,0.064821,-0.959744,0.557227,0.064382,0.154014,0.064528,-0.739669,-0.090005,0.087825,0.07495,0.038534,-0.089909,-0.174481,0.33873,0.397939,-0.10667,0.357841
Name_Init_Capt,-0.026456,0.007523,0.01735,-0.065113,0.064235,1.0,-0.00159,-0.001124,-0.001124,-0.002983,-0.001124,-0.001124,-0.00159,-0.007267,-0.016983,-0.00159,-0.001124,-0.039411,-0.013541,-0.001124,-0.00276,-0.001124,0.024728,0.020415,-0.020415,-0.016158,-0.01031,0.020643,-0.048883,0.042532,0.050931,0.057622,0.044515
Name_Init_Col,0.011329,-0.023556,-0.011306,-0.091825,0.090376,-0.00159,1.0,-0.00159,-0.00159,-0.004221,-0.00159,-0.00159,-0.00225,-0.010283,-0.024031,-0.00225,-0.00159,-0.055767,-0.01916,-0.00159,-0.003905,-0.00159,0.03499,-0.02818,0.019101,0.037766,-0.014588,-0.023904,-0.06917,-0.031895,-0.02633,0.07699,0.038596
Name_Init_Countess,0.04247,0.061157,0.057139,-0.064674,0.064089,-0.001124,-0.00159,1.0,-0.001124,-0.002983,-0.001124,-0.001124,-0.00159,-0.007267,-0.016983,-0.00159,-0.001124,-0.039411,-0.013541,-0.001124,-0.00276,-0.001124,-0.045439,0.020415,-0.020415,-0.016158,-0.01031,0.020643,-0.048883,-0.02254,-0.018608,0.024517,0.049664
Name_Init_Don,-0.026456,0.007523,-0.008349,-0.064528,0.064967,-0.001124,-0.00159,-0.001124,1.0,-0.002983,-0.001124,-0.001124,-0.00159,-0.007267,-0.016983,-0.00159,-0.001124,-0.039411,-0.013541,-0.001124,-0.00276,-0.001124,0.024728,-0.060245,0.047412,0.069538,-0.01031,-0.05443,-0.048883,-0.02254,-0.018608,0.03789,0.023594
Name_Init_Dr,0.008185,-0.035028,-0.015782,-0.16975,0.163923,-0.002983,-0.004221,-0.002983,-0.002983,1.0,-0.002983,-0.002983,-0.004221,-0.019292,-0.045085,-0.004221,-0.002983,-0.104624,-0.035947,-0.002983,-0.007327,-0.002983,0.039034,-0.029831,0.030716,0.022104,0.017874,-0.030612,-0.107798,0.003836,-0.049399,0.070829,0.063698


### Statistical Tests: 
Select features based on statistical significance

## Wrapper Methods

### Forward Selection: 
Add one feature at a time based on model performance.

### Backward Elimination: 
Start with all features and eliminate one at a time based on model performance.

## Embedded Methods

### Lasso Regression (L1 regularization): 
Penalize coefficients to zero, effectively performing feature selection.

### Decision Trees/Random Forests: 
Feature importance from ensemble methods.

## Dimensionality Reduction