# Customer Satisfication Dataset Feature Extraction

In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Lasso  
from sklearn.linear_model import Ridge       
from sklearn.linear_model import ElasticNet

**Importing the dataset where Feature selection was made**

In [3]:
df = pd.read_csv("/kaggle/input/customer-satisfication-dataset/customer_satisfaction_fs.csv")
print(df)

       ind_var30  saldo_var5  saldo_var30  saldo_var42  num_meses_var5_ult3  \
0              0        0.00         0.00         0.00                    0   
1              1        0.00       300.00         0.00                    1   
2              1        3.00         3.00         3.00                    3   
3              1       70.62        70.62        70.62                    2   
4              1        0.00    135003.00    135003.00                    3   
...          ...         ...          ...          ...                  ...   
76015          0        0.00         0.00         0.00                    0   
76016          1        0.00     48191.22     48191.22                    1   
76017          1        3.00         3.00         3.00                    2   
76018          1        3.00         3.00         3.00                    3   
76019          0        0.00         0.00         0.00                    0   

       saldo_medio_var5_ult1  saldo_medio_var5_ult3

In [4]:
X = df.drop('target', axis = 1)
y = df['target']

# Wrapper Methods

**Forward Selection**

In [7]:


sfs1 = SFS(RandomForestClassifier(n_jobs=4, random_state=42),
           k_features=5,
           forward=True,
           floating=False,
           verbose=2,
           scoring='accuracy', 
           cv=3)

sfs1 = sfs1.fit(X, y)

print('Best accuracy score: %.4f' % sfs1.k_score_)
print('Best subset (indices):', sfs1.k_feature_idx_)
print('Best subset (names):', sfs1.k_feature_names_)

x_forward = X[list(sfs1.k_feature_names_)]



[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   31.9s finished

[2025-08-27 11:48:35] Features: 1/5 -- score: 0.9604314654038411[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   29.9s finished

[2025-08-27 11:49:04] Features: 2/5 -- score: 0.9604314654038411[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   24.7s finished

[2025-08-27 11:49:29] Features: 3/5 -- score: 0.9564982899237043[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   21.2s finished

[2025-08-27 11:49:50] Features: 4/5 -- score: 0.9594448829255459

Best accuracy score: 0.9591
Best subset (indices): (0, 1, 4, 5, 6)
Best subset (names): ('ind_var30', 'saldo_var5', 'num_meses_var5_ult3', 'saldo_medio_var5_ult1', 'saldo_medio_var5_ult3')


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   17.0s finished

[2025-08-27 11:50:07] Features: 5/5 -- score: 0.9591423309655354

**Backward Selection**

In [8]:
sfs1 = SFS(RandomForestClassifier(n_jobs=4, random_state=42),
           k_features=5,
           forward=False,
           floating=False,
           verbose=2,
           scoring='accuracy',
           cv=3)

sfs1 = sfs1.fit(X, y)

print('Best precision score: %.4f' % sfs1.k_score_)
print('Best subset (indices):', sfs1.k_feature_idx_)
print('Best subset (names):', sfs1.k_feature_names_)
x_backward = X[list(sfs1.k_feature_names_)]


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   45.7s finished

[2025-08-27 11:51:23] Features: 6/5 -- score: 0.9578531965272297

Best precision score: 0.9580
Best subset (indices): (0, 2, 3, 4, 5)
Best subset (names): ('ind_var30', 'saldo_var30', 'saldo_var42', 'num_meses_var5_ult3', 'saldo_medio_var5_ult1')


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   38.7s finished

[2025-08-27 11:52:02] Features: 5/5 -- score: 0.9580242041568008

**Exhaustive Search**

In [14]:
# Initialize the model
clf = RandomForestClassifier(n_jobs=4, n_estimators=50 , random_state=42)

# Run Exhaustive Feature Selection
efs = EFS(clf,
          min_features=2,           
          max_features=5,            
          scoring='f1',        
          print_progress=True, 
          cv=3,                     
          n_jobs=1)                  

efs = efs.fit(X, y)

print('Best precision score: %.4f' % efs.best_score_)
print('Best subset (indices):', efs.best_idx_)
print('Best subset (names):', efs.best_feature_names_)

X_exhaustive = X[list(efs.best_feature_names_)]



Features: 112/112

Best precision score: 0.0231
Best subset (indices): (1, 2)
Best subset (names): ('saldo_var5', 'saldo_var30')


**Recursive Feature Elimination**

In [15]:

# Step 1: Create the model
clf = RandomForestClassifier(n_jobs=4, random_state=42)

# Step 2: Initialize RFECV
rfecv = RFECV(estimator=clf,
              step=1,
              cv=StratifiedKFold(3),
              scoring='precision',
              n_jobs=4,
              verbose=2)

# Step 3: Fit the selector
rfecv.fit(X, y)

print("Optimal number of features: %d" % rfecv.n_features_)
print("Selected feature indices:", rfecv.support_.nonzero()[0])
print("Selected feature names:", X.columns[rfecv.support_].tolist())
X_selected_rs = X.loc[:, rfecv.support_]


Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Optimal number of features: 2
Selected feature indices: [2 6]
Selected feature names: ['saldo_var30', 'saldo_medio_var5_ult3']


# Embedded Methods

**Lasso L1 Regularization**

In [18]:
lasso = Lasso(alpha=0.01)
lasso.fit(X, y)

selected_lasso = X.columns[(lasso.coef_ != 0)]
print("LASSO selected features:", list(selected_lasso))


LASSO selected features: ['saldo_var5', 'saldo_var30', 'saldo_var42', 'num_meses_var5_ult3', 'saldo_medio_var5_ult1', 'saldo_medio_var5_ult3']


**L2 Ridge Regularization**

In [19]:
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=0.01)
ridge.fit(X, y)

selected_ridge = X.columns[ridge.coef_ > 0]

print("RIDGE selected features: ", list(selected_ridge))


RIDGE selected features:  ['saldo_var42', 'saldo_medio_var5_ult3']


**L1/L2 Regularization Elastic Net**

In [20]:
elastic_net = ElasticNet(alpha=0.02, l1_ratio=0.5, random_state=42)

elastic_net.fit(X, y)

selected_elastic = X.columns[elastic_net.coef_ != 0]

print("Elastic Net selected features:", list(selected_elastic))


Elastic Net selected features: ['saldo_var5', 'saldo_var30', 'saldo_var42', 'num_meses_var5_ult3', 'saldo_medio_var5_ult1', 'saldo_medio_var5_ult3']
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
