# Machine learning project -- Part 3: Create data pipeline

This project builds a predictive machine learning model using the customer churn data available on the Kaggle website. <br>
Part 3 looks at creating data pipelines for transforming existing features and engineering new ones to improve model performance.<br>

The transformations to be considered:
1. Imputing missing values for categorical variables
2. Imputing missing values for numerical variables
3. Categorical variables: one-hot encoding and ordinal encoding
4. Numerical variables: log transformation and standard scaling
5. Feature engineering: binning/bucketing, percent change for features of consecutive periods

## Global functions

In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from customfunctions import combine_cat, log_transform, impute_balance, impute_credit_debit
from customfunctions import calculate_pct_change, calculate_ratio 
from customfunctions import calculate_vintage_age_score_eqdist, calculate_vintage_age_score_eqdens

## Read in data and split into train and test
### Read in data

In [2]:
# Load the raw training set into `df0`. `customer_id` is left as a regular
# column rather than being used as the index.
df0 = pd.read_csv('./data/visathon_train_data.csv')
print(f'Shape: {df0.shape}')
# Last expression of the cell: display five randomly sampled rows.
df0.sample(5)

Shape: (17654, 20)


Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,customer_nw_category,branch_code,days_since_last_transaction,current_balance,previous_month_end_balance,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn
14312,19609.0,938.0,46.0,Female,0.0,Engineer,High,136.0,217.0,6272.55,6272.55,6272.55,4600.54,0.26,0.26,0.26,0.26,6272.55,6272.55,Yes
5836,7953.0,2402.0,29.0,Male,0.0,Doctor,Low,641.0,303.0,2449.16,2449.16,2448.85,2417.84,0.03,0.03,0.03,325.73,2449.16,2479.67,Yes
9318,12779.0,4063.0,50.0,Male,0.0,Engineer,Low,419.0,49.0,9673.66,1662.99,3863.46,7407.13,0.51,643.37,561.27,286.23,2862.84,1547.78,Yes
15678,21478.0,4863.0,48.0,Male,0.0,Engineer,High,115.0,35.0,7357.85,8934.46,6536.3,4621.96,0.27,8463.13,642.09,3214.56,7933.18,6787.25,Yes
7520,10273.0,2596.0,90.0,Male,0.0,Engineer,Medium,60.0,54.0,4596.36,5612.16,4914.64,3328.07,0.24,1616.81,1016.04,,4915.99,5470.39,Yes


### Group features

In [3]:
# Name of the label column.
target = 'churn'

# --- categorical inputs ---
# 'dependents' sits in its own group because it is given a dedicated pipeline.
depend = ['dependents']
# 'branch_code' is not part of this group.
cat_vars = [
    'gender',
    'occupation',
]
# Ordered category, encoded ordinally rather than one-hot.
nw_cat = ['customer_nw_category']

# --- numeric inputs ---
bal_vars = [
    'current_balance',
    'current_month_balance',
    'previous_month_end_balance',
    'previous_month_balance',
    'average_monthly_balance_prevQ',
    'average_monthly_balance_prevQ2',
]
credit_vars = [
    'current_month_credit',
    'previous_month_credit',
]
debit_vars = [
    'current_month_debit',
    'previous_month_debit',
]

# --- column pairs fed to the engineered-feature pipelines ---
vin_age_vars = ['vintage', 'age']
vin_day_vars = ['days_since_last_transaction', 'vintage']
vin_dep_vars = ['dependents', 'vintage']

### Split data

In [4]:
# Separate the label from the predictors. Both are copied so that later
# changes to X or y cannot touch df0.
target = 'churn'
y = df0[target].copy()
X = df0.drop(columns=[target]).copy()

In [5]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=10086,
)
# One line per split, in the order X_train, X_test, y_train, y_test.
print(
    f'X_train shape: {X_train.shape}',
    f'X_test shape: {X_test.shape}',
    f'y_train shape: {y_train.shape}',
    f'y_test shape: {y_test.shape}',
    sep='\n',
)

X_train shape: (13240, 19)
X_test shape: (4414, 19)
y_train shape: (13240,)
y_test shape: (4414,)


## Make pipelines
### 1) Convert custom functions into transformers

In [6]:
# Wrap the helpers imported from `customfunctions` in FunctionTransformer so
# they can be used as scikit-learn pipeline steps.

# --- transformers for the original features ---
# combine_cat is called with cutoff=6 and replace=6 as keyword arguments.
comb_cat_depend = FunctionTransformer(
    combine_cat,
    validate=False,
    kw_args={'cutoff': 6, 'replace': 6},
)
impute_bal = FunctionTransformer(impute_balance)
impute_crdt_dbt = FunctionTransformer(impute_credit_debit)
log_tsfm = FunctionTransformer(log_transform)

# --- transformers for engineered features ---
pct_tsfm = FunctionTransformer(calculate_pct_change)
ratio_tsfm = FunctionTransformer(calculate_ratio)
# Two variants of the vintage/age score; going by the helper names these are
# presumably equal-distance vs. equal-density binning -- TODO confirm in
# customfunctions.
seniority_tsfm_dist = FunctionTransformer(calculate_vintage_age_score_eqdist)
seniority_tsfm_dens = FunctionTransformer(calculate_vintage_age_score_eqdens)

### 2) Create pipelines

In [7]:
import inspect

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name. Pick whichever keyword the
# installed version accepts so this cell runs on both; either way the encoder
# returns a dense array.
_ohe_params = inspect.signature(OneHotEncoder).parameters
if 'sparse_output' in _ohe_params:
    _ohe_dense = {'sparse_output': False}
else:
    _ohe_dense = {'sparse': False}

# ---------------------------------------------------------------------------
# Categorical variables
# ---------------------------------------------------------------------------
# dependents: combine categories (cutoff=6, replace=6), fill missing values
# with the sentinel 1000, then one-hot encode.
pipe_depend = Pipeline([
    ('comb_depend', comb_cat_depend),
    ('imput_depend', SimpleImputer(strategy='constant', fill_value=1000)),
    ('ohe_depend', OneHotEncoder(handle_unknown='ignore', **_ohe_dense)),
])
# Other categoricals: missing values become their own 'other' category.
pipe_cat = Pipeline([
    ('imput_cat', SimpleImputer(strategy='constant', fill_value='other')),
    ('ohe_cat', OneHotEncoder(handle_unknown='ignore', **_ohe_dense)),
])

# ---------------------------------------------------------------------------
# Numeric variables
# ---------------------------------------------------------------------------
# vintage, age
pipe_vin_age = Pipeline([
    ('imput_base', SimpleImputer(strategy='median')),
    ('scal_base', StandardScaler()),
])
# Fed days_since_last_transaction and branch_code by the ColumnTransformer.
# Unlike the other numeric pipelines, no StandardScaler step is applied here
# (the original step was commented out).
pipe_days_br = Pipeline([
    ('imput_days_br', SimpleImputer(strategy='median')),
    ('log_days_br', log_tsfm),
])
# Balance columns. NOTE(review): impute_bal appears to have replaced an
# earlier AmongFeaturesMeanImputer(features=bal_vars) step -- confirm.
pipe_bal = Pipeline([
    ('imput_bal', impute_bal),
    ('log_bal', log_tsfm),
    ('scal_bal', StandardScaler()),
])
# Credit columns
pipe_credit = Pipeline([
    ('imput_credit', impute_crdt_dbt),
    ('log_credit', log_tsfm),
    ('scal_credit', StandardScaler()),
])
# Debit columns
pipe_debit = Pipeline([
    ('imput_debit', impute_crdt_dbt),
    ('log_debit', log_tsfm),
    ('scal_debit', StandardScaler()),
])

# ---------------------------------------------------------------------------
# Engineered features
# ---------------------------------------------------------------------------
# Percent change between consecutive balance columns
pipe_pct = Pipeline([
    ('imput_bal_pct', impute_bal),
    ('pct_chg', pct_tsfm),
    ('log_bal_pct', log_tsfm),
    ('scal_bal_pct', StandardScaler()),
])
# Ratio of vintage and days_since_last_transaction;
# input columns: ['days_since_last_transaction', 'vintage']
pipe_vin_days = Pipeline([
    ('imput_vinday', SimpleImputer(strategy='median')),
    ('ratio_vinday', ratio_tsfm),
    ('scal_vinday', StandardScaler()),
])
# Balance per person; input columns: ['dependents', <balance columns>]
pipe_bal_person = Pipeline([
    ('imput_bal_pers', impute_bal),
    ('ratio_bal_pers', ratio_tsfm),
    ('log_bal_pers', log_tsfm),
    ('scal_bal_pers', StandardScaler()),
])
# Credit per person; input columns: ['dependents', <credit columns>]
pipe_credit_person = Pipeline([
    ('imput_credit_pers', impute_crdt_dbt),
    ('ratio_credit_pers', ratio_tsfm),
    ('log_credit_pers', log_tsfm),
    ('scal_credit_pers', StandardScaler()),
])
# Debit per person; input columns: ['dependents', <debit columns>].
# The first step keeps its original name 'imput_debit' so parameter paths
# addressing it stay valid.
pipe_debit_person = Pipeline([
    ('imput_debit', impute_crdt_dbt),
    ('ratio_debit_pers', ratio_tsfm),
    ('log_debit_pers', log_tsfm),
    ('scal_debit_pers', StandardScaler()),
])
# Seniority score from vintage and age, using the "eqdens" scoring variant.
pipe_seniority = Pipeline([
    ('imput_vinage', SimpleImputer(strategy='median')),
    ('sr_tsfm', seniority_tsfm_dens),
    ('scal_vinage', StandardScaler()),
])

### 3) Assemble pipelines

In [8]:
# Explicit category order used for the ordinal encoding of
# customer_nw_category.
nw_categories = [['Low', 'Medium', 'High']]

# Route every column group through its pipeline. The trailing comment on each
# entry is the number of output columns it contributes; the total is 47.
pipeline_data = ColumnTransformer(transformers=[
    # --- categorical variables ---
    ('depend', pipe_depend, depend),                                    # 7 cols
    ('cat_vars', pipe_cat, cat_vars),                                   # 8 cols
    ('ode_nw', OrdinalEncoder(categories=nw_categories), nw_cat),       # 1 col
    # --- numeric variables ---
    ('vin_age', pipe_vin_age, vin_age_vars),                            # 2 cols
    ('days_br', pipe_days_br,
     ['days_since_last_transaction', 'branch_code']),                   # 2 cols
    ('num_bal', pipe_bal, bal_vars),                                    # 6 cols
    ('num_debit', pipe_debit, debit_vars),                              # 2 cols
    ('num_credit', pipe_credit, credit_vars),                           # 2 cols
    # --- engineered features ---
    ('eng_pct', pipe_pct, bal_vars),                                    # 5 cols
    ('eng_vinday', pipe_vin_days, vin_day_vars),                        # 1 col
    ('eng_balpers', pipe_bal_person, depend + bal_vars),                # 6 cols
    ('eng_crdpers', pipe_credit_person, depend + credit_vars),          # 2 cols
    ('eng_dbtpers', pipe_debit_person, depend + debit_vars),            # 2 cols
    ('eng_srscore', pipe_seniority, vin_age_vars),  # [vintage, age]    # 1 col
])

## Apply pipeline

In [13]:
# Fit on the training split only, then apply the fitted transformer to both
# splits so the test data never influences the fitted statistics.
X_train_transformed = pipeline_data.fit(X_train).transform(X_train)
X_test_transformed = pipeline_data.transform(X_test)

# Last expression of the cell: display both output shapes.
(X_train_transformed.shape, X_test_transformed.shape)

((13240, 47), (4414, 47))

## Save pipeline

In [15]:
# Persist the fitted ColumnTransformer for reuse in later notebooks.
# NOTE(review): the recorded cell output shows 'pipeline_data.joblib' while
# the code writes 'pipeline_dat.joblib' -- confirm which name downstream
# code expects.
# Loading presumably requires `customfunctions` to be importable, since the
# FunctionTransformer steps reference its functions -- TODO confirm.
# To restore: pipeline_data = joblib.load('pipeline_dat.joblib')
joblib.dump(pipeline_data, filename='pipeline_dat.joblib')

['pipeline_data.joblib']

## Next step
The next step is to feed the transformed data into models.