In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFE

In [2]:
filepath2 = "/kaggle/input/scs-train/scs_train2.csv"
df2 = pd.read_csv(filepath2)
print(df2.shape)

(53214, 14)


In [3]:
x_train2 , x_test2 , y_train2 , y_test2 = train_test_split(
    df2.drop(labels=['TARGET'],axis=1),
    df2['TARGET'],
    test_size=0.3,
    random_state=0
)
print(x_train2.shape)
print(x_test2.shape)

(37249, 13)
(15965, 13)


In [4]:
sfs1 = SFS(RandomForestRegressor(n_jobs=4),
           k_features = 5,
           forward = True,
           floating = False,
           verbose = 2,
           scoring = 'r2',
           cv = 3)
sfs1 = sfs1.fit(np.array(x_train2), y_train2)
x_train2.columns[list(sfs1.k_feature_idx_)]

[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:   14.9s finished

[2025-08-20 08:48:21] Features: 1/5 -- score: 0.021097760901693168[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:   13.9s finished

[2025-08-20 08:48:35] Features: 2/5 -- score: 0.03673369893829548[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:   17.6s finished

[2025-08-20 08:48:52] Features: 3/5 -- score: 0.03274898439646775[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   17.4s finished

[2025-08-20 08:49:10] Features: 4/5 -- score: 0.030178603043354218[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   16.5s finished

[2025-08-20 08:49:26] Features: 5/5 -- score: 0.028032453376166815

Index(['var15', 'ind_var30', 'num_var4', 'num_var30', 'num_var42'], dtype='object')

In [5]:
sfs1 = SFS(RandomForestRegressor(n_jobs=4),
           k_features = 5,
           forward = False,
           floating = False,
           verbose = 2,
           scoring = 'r2',
           cv = 3)
sfs1 = sfs1.fit(np.array(x_train2), y_train2)
x_train2.columns[list(sfs1.k_feature_idx_)]

[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:  1.1min finished

[2025-08-20 08:50:41] Features: 12/5 -- score: 0.020070302107834375[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:   51.4s finished

[2025-08-20 08:51:32] Features: 11/5 -- score: 0.021890400224188273[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:   45.8s finished

[2025-08-20 08:52:18] Features: 10/5 -- score: 0.02075153578075987[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   40.5s finished

[2025-08-20 08:52:59] Features: 9/5 -- score: 0.02146476812803862[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   35.7s finished

[2025-08-20 08:53:34] Features: 8/5 -- score: 0.02042234429916523[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   30.7s finished

[2025-08-20 08:54:05] Features: 7/5 -- score: 0.018270417010266587[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   25.5s finished

[2025-08-20 08:54:31] Features: 6/5 -- score: 0.018762400118806182[Parallel(n_jobs=1)]: Done   6 out of   6

Index(['var15', 'saldo_var30', 'var36', 'saldo_medio_var5_hace2',
       'saldo_medio_var5_ult1'],
      dtype='object')

In [6]:
lr = make_pipeline(StandardScaler(), LogisticRegression(max_iter=500, solver='lbfgs'))
efs = EFS(estimator=lr,
          min_features=1,
          max_features=5,
          scoring='accuracy',
          print_progress=True,
          cv=5
)

efs = efs.fit(x_train2, y_train2)

print("Best score:", efs.best_score_)
print("Best feature indices:", efs.best_idx_)
print("Best feature names:", efs.best_feature_names_)

Features: 2379/2379

Best score: 0.9610996296066047
Best feature indices: (0,)
Best feature names: ('var15',)


In [7]:
lr = LogisticRegression(max_iter=500, solver='lbfgs')
rfe = RFE(estimator=lr, n_features_to_select=5)
pipeline = make_pipeline(StandardScaler(), rfe)
pipeline.fit(x_train2, y_train2)
selected_mask = pipeline.named_steps['rfe'].support_
rankings = pipeline.named_steps['rfe'].ranking_

print("Selected Features:", x_train2.columns[selected_mask].tolist())
print("Feature Ranking:", dict(zip(x_train2.columns, rankings)))

Selected Features: ['var15', 'ind_var30', 'saldo_var5', 'saldo_var30', 'saldo_medio_var5_ult3']
Feature Ranking: {'var15': 1, 'ind_var5': 4, 'ind_var30': 1, 'num_var4': 3, 'num_var30_0': 6, 'num_var30': 2, 'num_var42': 9, 'saldo_var5': 1, 'saldo_var30': 1, 'var36': 7, 'saldo_medio_var5_hace2': 5, 'saldo_medio_var5_ult1': 8, 'saldo_medio_var5_ult3': 1}
