In [5]:
# Uncomment to install the dependency into the kernel's environment (run once):
# %pip install feature_engine

In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import (
    StandardScaler, 
    LabelEncoder, 
    OneHotEncoder,
    MinMaxScaler)
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer

from feature_engine.encoding import (
    RareLabelEncoder,
    MeanEncoder,
    CountFrequencyEncoder,
    # OneHotEncoder
)


In [2]:
# pip show feature-engine

In [7]:
# Display settings: never truncate the column list when rendering a DataFrame.
pd.options.display.max_columns = None

In [10]:
# Ask scikit-learn transformers to return pandas DataFrames (with column
# names) from transform / fit_transform.
sklearn.set_config(transform_output="pandas")

In [12]:
warnings.filterwarnings('ignore')

In [14]:
# Read the data
#
# The CSV location can be overridden with the CHURN_TRAIN_CSV environment
# variable so the notebook runs on machines other than the author's; when the
# variable is unset, the original local path is used unchanged.
file_path = os.environ.get(
    "CHURN_TRAIN_CSV",
    r"E:\1. Projects\ML-internship\customer-churn-prediction\Research\data\train.csv",
)

train = pd.read_csv(file_path)

In [16]:
# Preview the first five rows of the raw training frame.
train.head(n=5)

Unnamed: 0,rownumber,customerid,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited
0,9470,15674573,Gearhart,713,France,Female,25.0,4,121172.97,1,1.0,1.0,56268.98,0
1,4798,15666173,Chidumaga,793,Germany,Female,32.0,1,96408.98,1,1.0,1.0,138191.81,0
2,943,15804586,Lin,376,France,Female,46.0,6,0.0,1,1.0,0.0,157333.69,1
3,9466,15815259,Fang,835,France,Female,56.0,2,0.0,2,1.0,1.0,39820.13,0
4,4147,15698246,Gordon,658,France,Female,24.0,2,0.0,2,1.0,1.0,84694.49,0


In [18]:
# Separate the target column ("exited") from the feature columns.
y_train = train["exited"].copy()
x_train = train.drop(columns=["exited"])

In [20]:
# One-hot encoding for gender: dense output, and categories not seen during
# fit are ignored at transform time instead of raising.
gender_transformer = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

In [22]:
# Sanity check: fit the gender encoder on its column and show the result.
gender_transformer.fit_transform(x_train[["gender"]])

Unnamed: 0,gender_Female,gender_Male
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0
...,...,...
6391,1.0,0.0
6392,0.0,1.0
6393,1.0,0.0
6394,0.0,1.0


In [24]:
# Preview the feature frame (target column already removed).
x_train.head(n=5)

Unnamed: 0,rownumber,customerid,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary
0,9470,15674573,Gearhart,713,France,Female,25.0,4,121172.97,1,1.0,1.0,56268.98
1,4798,15666173,Chidumaga,793,Germany,Female,32.0,1,96408.98,1,1.0,1.0,138191.81
2,943,15804586,Lin,376,France,Female,46.0,6,0.0,1,1.0,0.0,157333.69
3,9466,15815259,Fang,835,France,Female,56.0,2,0.0,2,1.0,1.0,39820.13
4,4147,15698246,Gordon,658,France,Female,24.0,2,0.0,2,1.0,1.0,84694.49


In [26]:
# How often each surname occurs.
x_train["surname"].value_counts()

surname
Smith        20
Wilson       19
Trevisani    18
Thompson     18
Lu           17
             ..
Menkens       1
David         1
Ingrassia     1
Spyer         1
Cullen        1
Name: count, Length: 2300, dtype: int64

In [28]:
# Group infrequent labels under a single "others" category.
#   tol          - minimum relative frequency for a label to be kept as-is
#   n_categories - minimum number of distinct labels a column needs before
#                  grouping is applied
rare_label_encoder = RareLabelEncoder(
    tol=0.0026,
    n_categories=70,
    replace_with="others",
)

In [30]:
# Check which surnames survive the rare-label grouping and how many rows
# fall into "others".
(
    rare_label_encoder
    .fit_transform(x_train.loc[:, ["surname"]])
    .value_counts()
)

surname  
others       6253
Smith          20
Wilson         19
Thompson       18
Trevisani      18
Chu            17
Maclean        17
Genovese       17
Lu             17
Name: count, dtype: int64

In [32]:
# Surname pipeline: collapse rare surnames into "others", then one-hot encode
# the remaining labels.
surname = Pipeline([
    (
        "label_encoder",
        RareLabelEncoder(tol=0.0026, n_categories=70, replace_with="others"),
    ),
    (
        "encoding",
        OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    ),
])

In [34]:
# Geography: dense one-hot encoding; unseen categories are ignored at
# transform time.
geo_transformer = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

In [36]:
# Look at the features again before defining the numeric transformers.
x_train.head(n=5)

Unnamed: 0,rownumber,customerid,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary
0,9470,15674573,Gearhart,713,France,Female,25.0,4,121172.97,1,1.0,1.0,56268.98
1,4798,15666173,Chidumaga,793,Germany,Female,32.0,1,96408.98,1,1.0,1.0,138191.81
2,943,15804586,Lin,376,France,Female,46.0,6,0.0,1,1.0,0.0,157333.69
3,9466,15815259,Fang,835,France,Female,56.0,2,0.0,2,1.0,1.0,39820.13
4,4147,15698246,Gordon,658,France,Female,24.0,2,0.0,2,1.0,1.0,84694.49


In [38]:
# Balance: standardize with StandardScaler.
balance_transformer = Pipeline([("scaling", StandardScaler())])

In [40]:
# Estimated salary: standardize with StandardScaler.
est_sal_transformer = Pipeline([("scaling", StandardScaler())])

In [42]:
# Age: rescale with MinMaxScaler.
# (The variable name keeps its original spelling because later cells use it.)
age_tranformer = Pipeline([("min_max_scaling", MinMaxScaler())])

In [44]:
# Credit score: rescale with MinMaxScaler.
credit_score_transformer = Pipeline([("scaling", MinMaxScaler())])

In [46]:

# age_tranformer.fit_transform(x_train[['age']])

In [48]:
# Feature frame before the identifier columns are removed.
x_train.head(n=5)

Unnamed: 0,rownumber,customerid,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary
0,9470,15674573,Gearhart,713,France,Female,25.0,4,121172.97,1,1.0,1.0,56268.98
1,4798,15666173,Chidumaga,793,Germany,Female,32.0,1,96408.98,1,1.0,1.0,138191.81
2,943,15804586,Lin,376,France,Female,46.0,6,0.0,1,1.0,0.0,157333.69
3,9466,15815259,Fang,835,France,Female,56.0,2,0.0,2,1.0,1.0,39820.13
4,4147,15698246,Gordon,658,France,Female,24.0,2,0.0,2,1.0,1.0,84694.49


In [50]:
# Remove the target and the pure identifier columns in one step, starting from
# the untouched `train` frame. Deriving x_train from `train` (rather than from
# x_train itself) keeps this cell idempotent: re-running it no longer raises a
# KeyError because the identifier columns were already dropped.
x_train = train.drop(columns=["exited", "rownumber", "customerid"])

In [52]:
# Confirm that rownumber and customerid are gone.
x_train.head(n=5)

Unnamed: 0,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary
0,Gearhart,713,France,Female,25.0,4,121172.97,1,1.0,1.0,56268.98
1,Chidumaga,793,Germany,Female,32.0,1,96408.98,1,1.0,1.0,138191.81
2,Lin,376,France,Female,46.0,6,0.0,1,1.0,0.0,157333.69
3,Fang,835,France,Female,56.0,2,0.0,2,1.0,1.0,39820.13
4,Gordon,658,France,Female,24.0,2,0.0,2,1.0,1.0,84694.49


# Column transformer

In [55]:
# Route each column to its dedicated transformer. Columns that are not listed
# here are passed through unchanged (remainder="passthrough").
column_transformer = ColumnTransformer(
    [
        ("surname", surname, ["surname"]),
        ("credits", credit_score_transformer, ["creditscore"]),
        ("geo", geo_transformer, ["geography"]),
        ("gender", gender_transformer, ["gender"]),
        ("age", age_tranformer, ["age"]),
        ("balance", balance_transformer, ["balance"]),
        ("estimated_salary", est_sal_transformer, ["estimatedsalary"]),
    ],
    remainder="passthrough",
)

# Feature Selection

In [58]:

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from feature_engine.selection import SelectBySingleFeaturePerformance


In [60]:
# Small, shallow random forest used as the estimator for feature selection;
# random_state is fixed so repeated runs give the same result.
clf = RandomForestClassifier(
    n_estimators=50,
    max_depth=5,
    random_state=42,
)

In [62]:
# Score each feature on its own with `clf` and keep only the features whose
# score reaches the threshold.
# NOTE(review): in the recorded feature_performance_ output many features
# score exactly 0.0 under 'precision' — presumably their single-feature models
# never predict the positive class. Confirm that 'precision' is the intended
# metric (feature_engine's default is 'roc_auc').
selector = SelectBySingleFeaturePerformance(
    estimator=clf,
    scoring="precision",
    threshold=0.05,
)

In [64]:
# Full preprocessing pipeline: per-column transforms first, then
# single-feature performance selection on the transformed columns.
preprocessor = Pipeline([
    ("ct", column_transformer),
    ("selector", selector),
])

In [66]:
# Fit the preprocessing pipeline on the training data and show the selected,
# transformed features.
preprocessor.fit_transform(X=x_train, y=y_train)

Unnamed: 0,credits__creditscore,age__age,balance__balance,remainder__numofproducts
0,0.726,0.094595,0.711826,1
1,0.886,0.189189,0.315563,1
2,0.052,0.378378,-1.227134,1
3,0.970,0.513514,-1.227134,2
4,0.616,0.081081,-1.227134,2
...,...,...,...,...
6391,0.458,0.202703,-0.176345,2
6392,0.828,0.081081,0.343399,1
6393,0.664,0.554054,0.735643,1
6394,0.474,0.391892,-0.090593,1


In [67]:
# Original (untransformed) features, for comparison with the pipeline output.
x_train.head(n=5)

Unnamed: 0,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary
0,Gearhart,713,France,Female,25.0,4,121172.97,1,1.0,1.0,56268.98
1,Chidumaga,793,Germany,Female,32.0,1,96408.98,1,1.0,1.0,138191.81
2,Lin,376,France,Female,46.0,6,0.0,1,1.0,0.0,157333.69
3,Fang,835,France,Female,56.0,2,0.0,2,1.0,1.0,39820.13
4,Gordon,658,France,Female,24.0,2,0.0,2,1.0,1.0,84694.49


In [72]:

# Per-feature scores recorded by the fitted selector step (feature -> score).
feature_performances = preprocessor["selector"].feature_performance_

In [74]:

# Show the score each feature obtained during single-feature selection.
feature_performances

{'surname__surname_Chu': np.float64(0.0),
 'surname__surname_Genovese': np.float64(0.0),
 'surname__surname_Lu': np.float64(0.0),
 'surname__surname_Maclean': np.float64(0.0),
 'surname__surname_Smith': np.float64(0.0),
 'surname__surname_Thompson': np.float64(0.0),
 'surname__surname_Trevisani': np.float64(0.0),
 'surname__surname_Wilson': np.float64(0.0),
 'surname__surname_others': np.float64(0.0),
 'credits__creditscore': np.float64(0.8857142857142857),
 'geo__geography_France': np.float64(0.0),
 'geo__geography_Germany': np.float64(0.0),
 'geo__geography_Spain': np.float64(0.0),
 'gender__gender_Female': np.float64(0.0),
 'gender__gender_Male': np.float64(0.0),
 'age__age': np.float64(0.5609155739590522),
 'balance__balance': np.float64(0.4904761904761905),
 'estimated_salary__estimatedsalary': np.float64(0.0),
 'remainder__tenure': np.float64(0.0),
 'remainder__numofproducts': np.float64(0.8694647588765236),
 'remainder__hascrcard': np.float64(0.0),
 'remainder__isactivemember': 