# Predict Customer Churn - Create data pipeline for baseline models

This project builds a predictive machine learning model using the customer churn data available on the Kaggle website. <br>
This notebook creates data pipelines that transform features to improve model performance.<br>

The pipelines to be considered:
1. Imputing missing values for categorical variables
2. Imputing missing values for numerical variables
3. Categorical variables: one-hot encoding and ordinal encoding
4. Numerical variables: log transformation and standard scaling

## Global functions

In [9]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from customfunctions import combine_cat, log_transform, impute_balance, impute_credit_debit

## Read in data and split into train and test
### Read in data

In [3]:
## Read in data
# Load the customer-churn training set from the local data folder.
data_path = './data/visathon_train_data.csv'
df0 = pd.read_csv(data_path)
print('Shape: {}'.format(df0.shape))
# Display five randomly chosen rows as a quick sanity check of the columns.
df0.sample(5)

Shape: (17654, 20)


Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,customer_nw_category,branch_code,days_since_last_transaction,current_balance,previous_month_end_balance,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn
5409,7379.0,1486.0,31.0,Female,0.0,Engineer,High,127.0,26.0,2297.61,2654.75,2889.52,3391.77,428.67,428.67,714.39,714.39,2361.65,3193.92,Yes
4183,5696.0,3342.0,54.0,Female,0.0,Engineer,Medium,47.0,336.0,5834.52,5834.52,5834.52,5694.04,0.31,0.31,0.31,0.31,5834.52,5834.52,Yes
10331,14170.0,1349.0,49.0,Male,3.0,Engineer,Low,713.0,103.0,3792.74,4291.36,4156.39,2630.9,0.47,0.47,499.09,499.09,3844.32,4339.61,Yes
5898,8035.0,2585.0,47.0,Male,0.0,Accountant,High,6.0,26.0,166.01,1111.41,1492.46,1976.41,38.03,0.59,6509.14,556.63,1058.88,1210.27,Yes
6452,8800.0,602.0,30.0,Male,3.0,Engineer,Low,296.0,153.0,3325.03,3727.15,3767.88,5300.66,0.01,0.01,234.64,234.64,3387.44,3772.56,Yes


### Group features

In [4]:
# Column groups consumed by the preprocessing pipelines.

# Label column.
target = 'churn'

# Categorical features.
depend = ['dependents']
cat_vars = [
    'gender',
    'occupation',
]  # 'branch_code' is not one-hot encoded; it is listed in days_br_vars instead
nw_cat = ['customer_nw_category']

# Monetary features: balances, then debits and credits.
bal_vars = [
    'current_balance',
    'current_month_balance',
    'previous_month_end_balance',
    'previous_month_balance',
    'average_monthly_balance_prevQ',
    'average_monthly_balance_prevQ2',
]
debit_vars = [
    'current_month_debit',
    'previous_month_debit',
]
credit_vars = [
    'current_month_credit',
    'previous_month_credit',
]

# Other numeric features.
days_br_vars = [
    'days_since_last_transaction',
    'branch_code',
]
vin_age_vars = [
    'vintage',
    'age',
]

# Column pairs that are not referenced by the pipelines built in this notebook;
# presumably reserved for interaction features -- TODO confirm before removing.
vin_day_vars = [
    'days_since_last_transaction',
    'vintage',
]
vin_dep_vars = [
    'dependents',
    'vintage',
]

### Split data

In [5]:
# Separate the label column (named by `target`) from the predictor columns.
# Copies are taken so later changes to X / y cannot touch df0.
y = df0[target].copy()
X = df0.drop(columns=[target]).copy()

In [6]:
# Hold out 25% of the rows for validation; the fixed seed keeps the split reproducible.
splits = train_test_split(X, y, test_size=0.25, random_state=10086)
X_train, X_valid, y_train, y_valid = splits

# Report the shape of every piece of the split.
shapes = [part.shape for part in (X_train, X_valid, y_train, y_valid)]
print('X_train shape: {}\nX_valid shape: {}\ny_train shape: {}\ny_valid shape: {}'.format(*shapes))

X_train shape: (13240, 19)
X_valid shape: (4414, 19)
y_train shape: (13240,)
y_valid shape: (4414,)


## Make pipelines
### 1) Convert custom functions into transformers

In [11]:
# Original features
# Wrap the helper functions imported from `customfunctions` as scikit-learn
# transformers so they can be used as pipeline steps.
impute_bal = FunctionTransformer(impute_balance)
impute_crdt_dbt = FunctionTransformer(impute_credit_debit)
log_tsfm = FunctionTransformer(log_transform)
# combine_cat receives cutoff=6 and replace=6 as keyword arguments on every call;
# input validation is switched off explicitly for this transformer.
comb_cat_depend = FunctionTransformer(
    combine_cat,
    validate=False,
    kw_args={'cutoff': 6, 'replace': 6},
)

### 2) Create pipelines

In [12]:
# cat_vars
import inspect

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name entirely. Pick whichever spelling the installed
# version accepts so the encoders return dense arrays on old and new releases.
if 'sparse_output' in inspect.signature(OneHotEncoder).parameters:
    _ohe_dense = {'sparse_output': False}
else:
    _ohe_dense = {'sparse': False}

# dependents: apply the combine_cat transformer, fill missing values with the
# constant 1000 (so missing entries form their own category), then one-hot encode.
pipe_depend = Pipeline([('comb_depend', comb_cat_depend),
                        ('imput_depend', SimpleImputer(strategy='constant', fill_value=1000)),
                        ('ohe_depend', OneHotEncoder(handle_unknown='ignore', **_ohe_dense))
                       ])
# Other categorical columns: fill missing values with the label 'other', then
# one-hot encode. Categories unseen during fit are encoded as all zeros.
pipe_cat = Pipeline([('imput_cat', SimpleImputer(strategy='constant', fill_value='other')),
                     ('ohe_cat', OneHotEncoder(handle_unknown='ignore', **_ohe_dense))
                    ])

In [13]:
# num_vars
# vintage, age: median imputation followed by standard scaling (no log step).
pipe_vin_age = Pipeline(steps=[
    ('imput_base', SimpleImputer(strategy='median')),
    ('scal_base', StandardScaler()),
])

# Applied to days_br_vars (days_since_last_transaction and branch_code):
# median imputation, log transform, standard scaling.
pipe_days_br = Pipeline(steps=[
    ('imput_days_br', SimpleImputer(strategy='median')),
    ('log_days_br', log_tsfm),
    ('scal_days_br', StandardScaler()),
])

# Balance columns: custom balance imputer, log transform, standard scaling.
pipe_bal = Pipeline(steps=[
    ('imput_bal', impute_bal),
    ('log_bal', log_tsfm),
    ('scal_bal', StandardScaler()),
])

# Credit columns: custom credit/debit imputer, log transform, standard scaling.
pipe_credit = Pipeline(steps=[
    ('imput_credit', impute_crdt_dbt),
    ('log_credit', log_tsfm),
    ('scal_credit', StandardScaler()),
])

# Debit columns: same recipe as the credit columns.
pipe_debit = Pipeline(steps=[
    ('imput_debit', impute_crdt_dbt),
    ('log_debit', log_tsfm),
    ('scal_debit', StandardScaler()),
])

### 3) Assemble pipelines

In [14]:
# Explicit category order for the net-worth column: the ordinal encoder maps
# 'Low' -> 0, 'Medium' -> 1, 'High' -> 2.
nw_categories = [['Low', 'Medium', 'High']]

# Route each column group to its own preprocessing pipeline. Columns that are
# not listed in any group are dropped (ColumnTransformer's default remainder).
pipeline_data = ColumnTransformer(transformers=[
    # categorical vars
    ('depend', pipe_depend, depend),            # 7 cols
    ('cat_vars', pipe_cat, cat_vars),           # 8 cols
    ('ode_nw', OrdinalEncoder(categories=nw_categories), nw_cat),  # 1 col
    # numeric vars
    ('vin_age', pipe_vin_age, vin_age_vars),    # 2 cols
    ('days_br', pipe_days_br, days_br_vars),    # 2 cols
    ('num_bal', pipe_bal, bal_vars),            # 6 cols
    ('num_debit', pipe_debit, debit_vars),      # 2 cols
    ('num_credit', pipe_credit, credit_vars),   # 2 cols
])  # total cols = 30 (one-hot widths depend on the categories seen during fit)

## Apply pipeline

In [15]:
# Fit on the training split only and transform it in the same pass.
# ColumnTransformer.fit already runs fit_transform internally, so a separate
# fit() followed by transform() would push the training data through every
# transformer twice.
X_train_transformed = pipeline_data.fit_transform(X_train)
# The validation split is transformed with the parameters learned from the
# training split, so nothing from the validation rows influences the fit.
X_valid_transformed = pipeline_data.transform(X_valid)

# Both splits must end up with the same number of columns.
X_train_transformed.shape, X_valid_transformed.shape

((13240, 30), (4414, 30))

## Save pipeline

In [17]:
# Persist the fitted preprocessing pipeline so later notebooks can reuse it.
# Reload with: pipeline_data = joblib.load('pipeline_data0.joblib')
# NOTE(review): loading presumably requires `customfunctions` to be importable,
# since the FunctionTransformer steps wrap functions from it -- confirm.
joblib.dump(pipeline_data, filename='pipeline_data0.joblib')

['pipeline_data0.joblib']

## Next step
The next step is to feed the transformed data into models.