In [100]:
import os
import warnings

import numpy as np
import pandas as pd
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import (
    StandardScaler,
    LabelEncoder,
    OneHotEncoder,
    MinMaxScaler)

from feature_engine.encoding import (
    RareLabelEncoder,
    MeanEncoder,
    CountFrequencyEncoder,
    # OneHotEncoder
)


In [2]:
# Display settings: never truncate columns when rendering wide DataFrames.

pd.options.display.max_columns = None

In [3]:
# Ask scikit-learn transformers to return pandas DataFrames, so the
# transformed outputs in later cells keep readable column names.
sklearn.set_config(transform_output="pandas")

In [4]:
warnings.filterwarnings('ignore')

In [27]:
# Read the data
#
# The CSV location defaults to the original author's local path, but it can
# be overridden with the CHURN_TRAIN_CSV environment variable so the notebook
# can be re-run on another machine without editing this cell.

file_path = os.environ.get(
    "CHURN_TRAIN_CSV",
    r"E:\1. Projects\ML-internship\customer-churn-prediction\Research\data\train.csv",
)

train = pd.read_csv(file_path)

In [6]:
# Preview the first five rows of the raw training frame.
train.head(5)

Unnamed: 0,rownumber,customerid,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited
0,9470,15674573,Gearhart,713,France,Female,25.0,4,121172.97,1,1.0,1.0,56268.98,0
1,4798,15666173,Chidumaga,793,Germany,Female,32.0,1,96408.98,1,1.0,1.0,138191.81,0
2,943,15804586,Lin,376,France,Female,46.0,6,0.0,1,1.0,0.0,157333.69,1
3,9466,15815259,Fang,835,France,Female,56.0,2,0.0,2,1.0,1.0,39820.13,0
4,4147,15698246,Gordon,658,France,Female,24.0,2,0.0,2,1.0,1.0,84694.49,0


In [35]:
# Separate the target column ("exited") from the predictors.
y_train = train["exited"].copy()
x_train = train.drop(columns=["exited"])

In [13]:
# One-hot encoding for the gender column.
# handle_unknown="ignore" encodes categories unseen during fit as all zeros
# instead of raising; sparse_output=False yields a dense result.

gender_transformer = Pipeline(
    [("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))]
)

In [14]:
# Sanity-check the gender encoder on its own (a single-column DataFrame in).
gender_transformer.fit_transform(x_train[["gender"]])

Unnamed: 0,gender_Female,gender_Male
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0
...,...,...
6391,1.0,0.0
6392,0.0,1.0
6393,1.0,0.0
6394,0.0,1.0


In [36]:
# Preview the first five rows of the feature frame.
x_train.head(5)

Unnamed: 0,rownumber,customerid,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary
0,9470,15674573,Gearhart,713,France,Female,25.0,4,121172.97,1,1.0,1.0,56268.98
1,4798,15666173,Chidumaga,793,Germany,Female,32.0,1,96408.98,1,1.0,1.0,138191.81
2,943,15804586,Lin,376,France,Female,46.0,6,0.0,1,1.0,0.0,157333.69
3,9466,15815259,Fang,835,France,Female,56.0,2,0.0,2,1.0,1.0,39820.13
4,4147,15698246,Gordon,658,France,Female,24.0,2,0.0,2,1.0,1.0,84694.49


In [16]:
# Frequency of each surname, to judge how high-cardinality the column is.
x_train["surname"].value_counts()

surname
Smith        20
Wilson       19
Trevisani    18
Thompson     18
Lu           17
             ..
Menkens       1
David         1
Ingrassia     1
Spyer         1
Cullen        1
Name: count, Length: 2300, dtype: int64

In [82]:
# Group infrequent surnames under a single "others" label.
#   tol          - minimum relative frequency a label needs to be kept.
#   n_categories - minimum number of distinct labels a column must have
#                  before any grouping is applied.
rare_label_encoder = RareLabelEncoder(
    n_categories=70,
    tol=0.0026,
    replace_with="others",
)

In [83]:
# Check which surnames survive the rare-label grouping and how many rows
# end up in the "others" bucket.
(
    rare_label_encoder
    .fit_transform(x_train.loc[:, ["surname"]])
    .value_counts()
)

surname  
others       6253
Smith          20
Wilson         19
Thompson       18
Trevisani      18
Chu            17
Maclean        17
Genovese       17
Lu             17
Name: count, dtype: int64

In [84]:
# Surname pipeline: first collapse rare surnames into "others" (same settings
# as the stand-alone rare_label_encoder cell), then one-hot encode the result.
surname = Pipeline(
    [
        (
            "label_encoder",
            RareLabelEncoder(n_categories=70, tol=0.0026, replace_with="others"),
        ),
        (
            "encoding",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
        ),
    ]
)

In [85]:
# One-hot encoding for the geography column (dense output; categories not
# seen during fit are encoded as all zeros).
geo_transformer = Pipeline(
    [("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))]
)

In [93]:
# Look at the feature frame again before defining the numeric transformers.
x_train.head(5)

Unnamed: 0,rownumber,customerid,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary
0,9470,15674573,Gearhart,713,France,Female,25.0,4,121172.97,1,1.0,1.0,56268.98
1,4798,15666173,Chidumaga,793,Germany,Female,32.0,1,96408.98,1,1.0,1.0,138191.81
2,943,15804586,Lin,376,France,Female,46.0,6,0.0,1,1.0,0.0,157333.69
3,9466,15815259,Fang,835,France,Female,56.0,2,0.0,2,1.0,1.0,39820.13
4,4147,15698246,Gordon,658,France,Female,24.0,2,0.0,2,1.0,1.0,84694.49


In [95]:
# Standardise the account balance (zero mean, unit variance).
balance_transformer = Pipeline([("scaling", StandardScaler())])

In [99]:
# Standardise the estimated salary (zero mean, unit variance).
est_sal_transformer = Pipeline([("scaling", StandardScaler())])

In [101]:
# Rescale age with MinMaxScaler (default feature range).
# NOTE: the variable name is spelled "age_tranformer"; it is kept as-is
# because the ColumnTransformer cell refers to it by this name.
age_tranformer = Pipeline([("min_max_scaling", MinMaxScaler())])

In [105]:
# Rescale the credit score with MinMaxScaler (default feature range).
credit_score_transformer = Pipeline([("scaling", MinMaxScaler())])

In [103]:

# age_tranformer.fit_transform(x_train[['age']])

In [108]:
# Inspect the feature frame before removing the identifier columns.
x_train.head(5)

Unnamed: 0,rownumber,customerid,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary
0,9470,15674573,Gearhart,713,France,Female,25.0,4,121172.97,1,1.0,1.0,56268.98
1,4798,15666173,Chidumaga,793,Germany,Female,32.0,1,96408.98,1,1.0,1.0,138191.81
2,943,15804586,Lin,376,France,Female,46.0,6,0.0,1,1.0,0.0,157333.69
3,9466,15815259,Fang,835,France,Female,56.0,2,0.0,2,1.0,1.0,39820.13
4,4147,15698246,Gordon,658,France,Female,24.0,2,0.0,2,1.0,1.0,84694.49


In [109]:
# Remove the row/customer identifier columns from the feature frame.
# This cell rebinds x_train to its own output, so a plain drop would raise
# KeyError when the cell is executed a second time; errors="ignore" makes it
# safe to re-run (columns that are already gone are simply skipped).
x_train = x_train.drop(columns=["rownumber", "customerid"], errors="ignore")

In [110]:
# Confirm that the identifier columns are no longer present.
x_train.head(5)

Unnamed: 0,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary
0,Gearhart,713,France,Female,25.0,4,121172.97,1,1.0,1.0,56268.98
1,Chidumaga,793,Germany,Female,32.0,1,96408.98,1,1.0,1.0,138191.81
2,Lin,376,France,Female,46.0,6,0.0,1,1.0,0.0,157333.69
3,Fang,835,France,Female,56.0,2,0.0,2,1.0,1.0,39820.13
4,Gordon,658,France,Female,24.0,2,0.0,2,1.0,1.0,84694.49


# Column transformer

In [111]:
# Route each column to its dedicated pipeline. Entries are
# (name, transformer, columns); the name becomes the prefix of the output
# feature names. Columns not listed here are passed through unchanged.
column_transformer = ColumnTransformer(
    remainder="passthrough",
    transformers=[
        ("surname", surname, ["surname"]),
        ("credits", credit_score_transformer, ["creditscore"]),
        ("geo", geo_transformer, ["geography"]),
        ("gender", gender_transformer, ["gender"]),
        ("age", age_tranformer, ["age"]),
        ("balance", balance_transformer, ["balance"]),
        ("estimated_salary", est_sal_transformer, ["estimatedsalary"]),
    ],
)

In [112]:
# Fit all per-column pipelines on the training data and display the
# transformed feature frame.
column_transformer.fit_transform(X=x_train, y=y_train)

Unnamed: 0,surname__surname_Chu,surname__surname_Genovese,surname__surname_Lu,surname__surname_Maclean,surname__surname_Smith,surname__surname_Thompson,surname__surname_Trevisani,surname__surname_Wilson,surname__surname_others,credits__creditscore,geo__geography_France,geo__geography_Germany,geo__geography_Spain,gender__gender_Female,gender__gender_Male,age__age,balance__balance,estimated_salary__estimatedsalary,remainder__tenure,remainder__numofproducts,remainder__hascrcard,remainder__isactivemember
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.726,1.0,0.0,0.0,1.0,0.0,0.094595,0.711826,-0.765071,4,1,1.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.886,0.0,1.0,0.0,1.0,0.0,0.189189,0.315563,0.652938,1,1,1.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.052,1.0,0.0,0.0,1.0,0.0,0.378378,-1.227134,0.984266,6,1,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.970,1.0,0.0,0.0,1.0,0.0,0.513514,-1.227134,-1.049786,2,2,1.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.616,1.0,0.0,0.0,1.0,0.0,0.081081,-1.227134,-0.273052,2,2,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6391,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.458,1.0,0.0,0.0,1.0,0.0,0.202703,-0.176345,1.110194,1,2,0.0,0.0
6392,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.828,1.0,0.0,0.0,0.0,1.0,0.081081,0.343399,-1.274395,7,1,1.0,0.0
6393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.664,0.0,0.0,1.0,1.0,0.0,0.554054,0.735643,-0.271161,0,1,0.0,1.0
6394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.474,1.0,0.0,0.0,0.0,1.0,0.391892,-0.090593,-0.735760,6,1,1.0,0.0


In [104]:
# NOTE(review): the stored output of this cell carries execution count 104,
# i.e. it was captured before the identifier columns were dropped in cell 109,
# which is why it still lists rownumber and customerid. On a fresh
# top-to-bottom run this preview shows the reduced column set instead.
x_train.head()

Unnamed: 0,rownumber,customerid,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary
0,9470,15674573,Gearhart,713,France,Female,25.0,4,121172.97,1,1.0,1.0,56268.98
1,4798,15666173,Chidumaga,793,Germany,Female,32.0,1,96408.98,1,1.0,1.0,138191.81
2,943,15804586,Lin,376,France,Female,46.0,6,0.0,1,1.0,0.0,157333.69
3,9466,15815259,Fang,835,France,Female,56.0,2,0.0,2,1.0,1.0,39820.13
4,4147,15698246,Gordon,658,France,Female,24.0,2,0.0,2,1.0,1.0,84694.49
