# Automated feature selection
- In this notebook we run automated feature selection to identify the most relevant features for training our model.

In [1]:
import pandas as pd
import numpy as np

In [8]:
# Read the training feature table from CSV (path is relative to the notebook's directory).
train_csv_path = '../data/features/train_final.csv'
train = pd.read_csv(train_csv_path)

In [18]:
# Show the column names of the loaded training table; left as a bare
# expression so the notebook renders it as the cell output.
train.columns

Index(['CustomerID', 'days_since_last_purchase', 'customer_age_days',
       'active_days', 'total_orders', 'purchase_frequency',
       'avg_items_per_order', 'avg_days_between_orders', 'avg_order_value',
       'avg_item_value', 'COUNT(transactions)', 'MAX(transactions.Quantity)',
       'MAX(transactions.Revenue)', 'MAX(transactions.UnitPrice)',
       'MEAN(transactions.Quantity)', 'MEAN(transactions.Revenue)',
       'MEAN(transactions.UnitPrice)', 'MIN(transactions.Quantity)',
       'MIN(transactions.Revenue)', 'MIN(transactions.UnitPrice)',
       'SUM(transactions.Quantity)', 'SUM(transactions.Revenue)',
       'COUNT(transactions) + MAX(transactions.Quantity)',
       'COUNT(transactions) + MAX(transactions.Revenue)',
       'MAX(transactions.Quantity) + MEAN(transactions.Revenue)',
       'MAX(transactions.UnitPrice) + MIN(transactions.Quantity)',
       'MEAN(transactions.UnitPrice) + MIN(transactions.UnitPrice)',
       'CLV_Target'],
      dtype='object')

In [None]:
# Target vector: the CLV_Target column.
y_train = train.loc[:, 'CLV_Target']
# Predictor matrix: everything except the customer identifier and the target.
X_train = train.drop(['CustomerID', 'CLV_Target'], axis=1)



In [16]:
from sklearn.feature_selection import mutual_info_regression

# Estimate the mutual information between each predictor and the target;
# random_state is fixed so the estimate is repeatable.
mi_scores = mutual_info_regression(X_train, y_train, random_state=42)

# Pair every score with its column name, then rank from most to least informative.
mi_table = pd.DataFrame({'Feature': X_train.columns, 'MI_Score': mi_scores})
mi_results = mi_table.sort_values('MI_Score', ascending=False)

print(mi_results)


                                              Feature  MI_Score
20                          SUM(transactions.Revenue)  0.243239
19                         SUM(transactions.Quantity)  0.226035
21   COUNT(transactions) + MAX(transactions.Quantity)  0.167715
3                                        total_orders  0.165782
22    COUNT(transactions) + MAX(transactions.Revenue)  0.159390
9                                 COUNT(transactions)  0.143423
4                                  purchase_frequency  0.130179
2                                         active_days  0.128075
6                             avg_days_between_orders  0.114082
5                                 avg_items_per_order  0.113136
7                                     avg_order_value  0.112213
0                            days_since_last_purchase  0.091131
11                          MAX(transactions.Revenue)  0.083917
10                         MAX(transactions.Quantity)  0.076366
23  MAX(transactions.Quantity) + MEAN(tr

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet, ElasticNetCV

# Columns of X_train with an int64/float64 dtype; only these are fed to the model.
num_features = X_train.select_dtypes(include=['int64', 'float64']).columns

# Impute first, then standardise. With the steps the other way round the scaler's
# mean/std are computed while ignoring missing rows, and the values filled in
# afterwards leave the column with variance below 1. ElasticNet's penalty and the
# coefficient magnitudes compared later both assume every feature is on the same
# unit-variance scale, so the scaler has to see the fully imputed column.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Apply the numeric transformer to num_features; any other column is dropped
# (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features)
    ]
)

# Preprocessing followed by an elastic-net regression whose regularisation
# strength is chosen by 5-fold cross-validation.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', ElasticNetCV(cv=5, random_state=42))
])

# Bare expression: fits the pipeline and lets the notebook display it.
pipeline.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,l1_ratio,0.5
,eps,0.001
,n_alphas,'deprecated'
,alphas,'warn'
,fit_intercept,True
,precompute,'auto'
,max_iter,1000
,tol,0.0001
,cv,5
,copy_X,True


In [30]:
# Label each coefficient with the column the model actually received. The
# ColumnTransformer forwards only num_features (in that order) and drops the rest,
# so num_features - not X_train.columns - is what lines up with coef_. If X_train
# ever gains a non-numeric column, X_train.columns would no longer match in length.
Elastic_results = pd.DataFrame({
    'Feature': num_features,
    'Coefficient': pipeline.named_steps['model'].coef_
}).sort_values(by='Coefficient', key=abs, ascending=False)  # rank by |coefficient|

print(Elastic_results)

                                              Feature  Coefficient
20                          SUM(transactions.Revenue)   485.393268
19                         SUM(transactions.Quantity)   352.656750
22    COUNT(transactions) + MAX(transactions.Revenue)   213.032354
11                          MAX(transactions.Revenue)   198.291977
3                                        total_orders   157.929951
7                                     avg_order_value   135.541408
9                                 COUNT(transactions)   115.021711
21   COUNT(transactions) + MAX(transactions.Quantity)   114.525768
4                                  purchase_frequency    75.766788
14                         MEAN(transactions.Revenue)    69.555851
23  MAX(transactions.Quantity) + MEAN(transactions...    67.372955
5                                 avg_items_per_order    61.263668
10                         MAX(transactions.Quantity)    47.512375
17                          MIN(transactions.Revenue)   -40.22

In [37]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance


# Numeric preprocessing for the forest: median imputation only, no scaling step.
num_transformer_rf = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

# Columns outside num_features are dropped; use remainder='passthrough' instead
# if other columns should reach the model.
preprocessor_rf = ColumnTransformer(
    [('num', num_transformer_rf, num_features)],
    remainder='drop',
)

# Random forest with fixed hyper-parameters and seed, using all available cores.
rf_estimator = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1,
)

pipeline_rf = Pipeline([
    ('preprocessor', preprocessor_rf),
    ('model', rf_estimator),
])

# fit() returns the pipeline itself, so rf_model and pipeline_rf name the same fitted object.
rf_model = pipeline_rf.fit(X_train, y_train)

In [39]:
# Load the validation feature table and split it the same way as the training data.
val_set = pd.read_csv('../data/features/val_final.csv')
X_val = val_set.drop(columns=['CustomerID', 'CLV_Target'])
y_val = val_set['CLV_Target']

# Shuffle each column of X_val 10 times and record how much the fitted pipeline's
# score degrades. Scoring is negative MSE, so a larger importance means a larger
# increase in squared error when that column is scrambled.
permutation_importances = permutation_importance(
    rf_model,
    X_val,
    y_val,
    n_repeats=10,
    random_state=42,
    scoring='neg_mean_squared_error'
)

# The importance arrays are ordered by the columns of the frame that was permuted
# (X_val), so label them with X_val.columns. Using X_train.columns would silently
# mislabel features if the validation file's column order ever differed.
perm_results = pd.DataFrame({
    'Feature': X_val.columns,
    'Importance_Mean': permutation_importances.importances_mean,
    'Importance_Std': permutation_importances.importances_std
}).sort_values(by='Importance_Mean', ascending=False)

print(perm_results)

                                              Feature  Importance_Mean  \
20                          SUM(transactions.Revenue)     5.608237e+06   
19                         SUM(transactions.Quantity)     3.147260e+05   
22    COUNT(transactions) + MAX(transactions.Revenue)     8.563844e+04   
7                                     avg_order_value     7.359179e+04   
24  MAX(transactions.UnitPrice) + MIN(transactions...     3.549490e+04   
15                       MEAN(transactions.UnitPrice)     1.889360e+04   
12                        MAX(transactions.UnitPrice)     1.593624e+04   
5                                 avg_items_per_order     1.508264e+04   
21   COUNT(transactions) + MAX(transactions.Quantity)     1.395387e+04   
25  MEAN(transactions.UnitPrice) + MIN(transaction...     7.341292e+03   
2                                         active_days     7.158323e+03   
18                        MIN(transactions.UnitPrice)     4.487976e+03   
13                        MEAN(transac