# Machine learning project -- Part 2: Build data pipelines

This project builds a predictive machine learning model using the customer churn data available on the Kaggle website. <br>
Part 2 builds the data pipelines, including the steps that engineer new features.

## Import packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


## Read in data and split into train and test
### Read in data

In [3]:
## Read in data
# Load the raw training set; the path is relative to the working directory.
# NOTE(review): the sample output lists an 'Unnamed: 0' column, which suggests
# the CSV was written with its index -- consider index_col=0; TODO confirm.
df0 = pd.read_csv('./data/visathon_train_data.csv')#,index=customer_id)
print('Shape: {}'.format(df0.shape))
# Show five random rows; no random_state is passed, so they differ per run.
df0.sample(5)

Shape: (17654, 20)


Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,customer_nw_category,branch_code,days_since_last_transaction,current_balance,previous_month_end_balance,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn
16405,22495.0,2690.0,60.0,Male,2.0,Engineer,Low,1006.0,56.0,5389.28,5524.81,6441.43,1849.92,0.53,29.1,78.96,0.53,5450.94,5527.21,Yes
7032,9585.0,1537.0,37.0,Female,0.0,Engineer,Medium,236.0,26.0,23112.39,22701.04,22629.35,21324.33,411.84,411.84,0.49,,,22555.07,Yes
13313,18232.0,1212.0,86.0,Female,0.0,Engineer,Medium,17.0,73.0,2771.3,4585.75,3204.37,3343.61,0.33,0.33,1971.76,0.33,3341.84,3806.77,Yes
10602,14534.0,310.0,43.0,Male,2.0,Engineer,Low,1712.0,35.0,18226.8,18226.8,8140.3,14407.7,0.23,16457.37,0.23,0.23,18226.8,4424.2,Yes
4769,6508.0,1047.0,39.0,Female,0.0,Engineer,Medium,1054.0,230.0,2785.53,2783.58,2784.07,4760.12,0.03,0.03,0.03,0.03,2785.06,2783.58,Yes


### Group features

In [4]:
# Column groups consumed by the ColumnTransformer defined later in the notebook.
# Number of dependents; also used as the denominator of the per-person ratios.
depend = ['dependents']
# Nominal categoricals that get one-hot encoded.
cat_vars = ['gender', 'occupation']#,'branch_code']

# Ordered categorical (Low < Medium < High) that gets ordinal-encoded.
nw_cat = ['customer_nw_category']
# Balance columns. Order matters: percent changes are computed between
# consecutive entries of this list.
bal_vars = ['current_balance', 'current_month_balance', 
            'previous_month_end_balance','previous_month_balance',
            'average_monthly_balance_prevQ', 'average_monthly_balance_prevQ2']
debit_vars = ['current_month_debit','previous_month_debit']
credit_vars = ['current_month_credit', 'previous_month_credit']
# Order matters: the seniority score reads column 0 as vintage, column 1 as age.
vin_age_vars = ['vintage', 'age']
# Order matters: calculate_ratio divides the later columns by (first column + 1),
# i.e. vintage / (days_since_last_transaction + 1).
vin_day_vars = ['days_since_last_transaction','vintage']
# NOTE(review): not referenced by any pipeline in the visible notebook.
vin_dep_vars = ['dependents','vintage']

# Name of the label column.
target = 'churn'

### Split data

In [5]:
# Separate the predictors from the label; copies keep df0 untouched.
X = df0.drop(target,axis=1).copy()
y = df0[target].copy()

In [9]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
# No stratify argument is passed, so the split is not stratified on the label.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25, random_state=10086)
print('X_train shape: {}\nX_test shape: {}\ny_train shape: {}\ny_test shape: {}'.format( X_train.shape, X_test.shape, y_train.shape, y_test.shape))

X_train shape: (13240, 19)
X_test shape: (4414, 19)
y_train shape: (13240,)
y_test shape: (4414,)


## Define custom functions
### 1. Use BaseEstimator and TransformerMixin
These transformers have been tested and can be used in the pipelines in place of the corresponding transformers built from the custom functions in section 2 below.

In [10]:
##################################################
from sklearn.base import BaseEstimator, TransformerMixin

## balance features
class AmongFeaturesMeanImputer(BaseEstimator, TransformerMixin):
    """Impute missing values in a group of columns with the row-wise group mean.

    For every row, each NaN found in one of ``features`` is replaced by the
    mean of the non-missing values that row holds across all of ``features``.
    A row in which every listed feature is missing stays NaN.
    """

    def __init__(self, column=[], features=[]):
        """ column: feature to be imputed (stored only; transform does not read it)
            features: names of the columns to average over and to fill
        """
        # The list defaults are kept so the constructor interface is unchanged;
        # they are only stored, never mutated.
        self.features = features
        self.column = column

    def fit(self, X, y = None):
        """Nothing is learned from X; returns self."""
        return self

    def transform(self, X, y = None):
        """Return a copy of DataFrame X with the NaNs in ``features`` filled.

        X itself is left untouched.
        """
        # Work on a copy and assign the filled column back explicitly.
        # Calling X[feature].fillna(..., inplace=True) is chained assignment:
        # it may modify the caller's frame on older pandas and does nothing
        # at all when pandas Copy-on-Write is active.
        X = X.copy()
        features = list(self.features)
        if not features:
            return X
        # Filling a gap with the row mean does not change that mean, so it is
        # computed once instead of once per feature.
        row_mean = X[features].mean(axis=1, skipna=True)
        for feature in features:
            X[feature] = X[feature].fillna(row_mean)
        return X

# debit and credit features
class FromFeatureImputer(BaseEstimator, TransformerMixin):
    """Impute each listed column from the next listed column, randomly scaled.

    The features are treated as a cycle: every feature is filled from its
    successor in the list and the last feature is filled from the first.
    Each donor column is multiplied by ``1 + np.random.randn()`` (one draw
    per feature), so results vary between calls unless NumPy's global
    random state is seeded.
    """

    def __init__(self, features=[]):
        """ features: ordered list of column names to impute
        """
        # The list default is kept so the constructor interface is unchanged;
        # it is only stored, never mutated.
        self.features = features

    def fit(self, X, y = None):
        """Nothing is learned from X; returns self."""
        return self

    def transform(self, X, y = None):
        """Return a copy of DataFrame X with the NaNs in ``features`` filled.

        X itself is left untouched. Columns are processed in list order, so
        a donor column that was filled earlier in the loop is used in its
        filled state. An empty ``features`` list returns the copy unchanged.
        """
        X = X.copy()
        features = list(self.features)
        if not features:
            return X
        # Rotate the list by one so that every feature is paired with its donor.
        donors = features[1:] + features[:1]
        for feature, donor in zip(features, donors):
            scale = 1 + np.random.randn()
            X[feature] = X[feature].fillna(X[donor] * scale)
        return X

### 2. Define custom transformation functions
#### 1) Impute and log transform

In [11]:
## combine non-frequent categories into one
def combine_cat(s, cutoff=6, replace=6):
    """ Collapse every value strictly greater than cutoff into one category
        s: a Pandas series (or dataframe) of numeric category codes
        cutoff: a scalar threshold; values above it are replaced
        replace: the scalar written in place of those values

        return: a copy of s with the replacements applied; s is not modified
    """
    collapsed = s.copy()
    collapsed[s > cutoff] = replace
    return collapsed

# balance, debit, credit, days_since_last_transaction
def log_transform(df):
    """ Apply a sign-preserving log transform, sign(x) * log(|x| + 1), element-wise
        df: a Pandas series/dataframe or a NumPy array of numeric values

        return: a list with one entry per row when the input is 1-D or has a
                single column; otherwise a dataframe of the same shape with
                default integer row and column labels.
    """
    values = np.asarray(df, dtype=float)
    # Working on the raw values makes series, dataframes and arrays behave the
    # same; in particular a single-column dataframe is transformed by value
    # rather than by iterating over its column labels.
    # log1p(|x|) is defined for every finite x, so negative amounts keep their
    # sign; NaN inputs stay NaN.
    transformed = np.sign(values) * np.log1p(np.abs(values))
    if values.ndim < 2 or values.shape[1] == 1:
        return list(transformed)
    return pd.DataFrame(transformed)

## balance features
def impute_balance(df):
    """ Fill the gaps in every column with the mean of the row's non-missing values
        df: a Pandas dataframe holding the balance columns

        return: a dataframe with the same row index and integer column labels
                (0, 1, ...); a row with no values at all stays NaN.
    """
    row_means = df.mean(axis=1, skipna=True)
    filled = [df.iloc[:, pos].fillna(row_means) for pos in range(df.shape[1])]
    # from_records stacks each filled column as a row, hence the transpose
    return pd.DataFrame.from_records(filled).transpose()

# debit and credit features
def impute_credit_debit(df):
    """ Fill the gaps in every column with a randomly scaled row mean
        df: a Pandas dataframe holding the credit (or debit) columns

        return: a dataframe with the same row index and integer column labels.
        The scale factor is 1 + np.random.randn(), drawn once per column from
        NumPy's global random state, so results vary between calls.
    """
    row_means = df.mean(axis=1)
    imputed = []
    for pos in range(df.shape[1]):
        noisy_means = row_means * (1 + np.random.randn())
        imputed.append(df.iloc[:, pos].fillna(noisy_means))
    return pd.DataFrame.from_records(imputed).transpose()

#### 2) Engineer new features

In [12]:
## Engineer new features 
# percentage changes
def calculate_pct_change(df):
    """ Percent change between each pair of neighbouring columns
        df: Pandas dataframe or array. Balance columns ordered so that
            neighbouring columns are consecutive periods

        return: a dataframe with one column per neighbouring pair, holding
                (left - right) / (right + 1) * 100.
    """
    frame = pd.DataFrame(df.copy())
    columns = [frame.iloc[:, pos] for pos in range(frame.shape[1])]
    # right + 1 guards against a zero denominator
    changes = [(left - right) / (right + 1) * 100
               for left, right in zip(columns, columns[1:])]
    return pd.DataFrame.from_records(changes).transpose()

# vintage/(day_since_last_transaction) & per person values in a household
def calculate_ratio(df):
    """ Divide every column after the first by (first column + 1)
        df: Pandas dataframe or array whose first column is the denominator

        return: a dataframe with one ratio column per remaining input column.
    """
    frame = pd.DataFrame(df.copy())
    # + 1 guards against a zero denominator
    denominator = frame.iloc[:, 0] + 1
    ratios = [frame.iloc[:, pos] / denominator for pos in range(1, frame.shape[1])]
    return pd.DataFrame.from_records(ratios).transpose()

# vintage_age score 1 - equal distance
def calculate_vintage_age_score_eqdist(df):
    """ Seniority score from equal-width vintage and age bands
        df: an array or dataframe of shape (*,2)
        column 0: vintage
        column 1: age

        return: a 2D array of shape (*,1) holding vintage_score * age_score.
        vintage_score is vintage // 1000 + 1, capped at 8 (so every vintage
        of 7000 or more scores 8). age_score is the 1-8 label of the age
        band produced by pd.cut over the bins below.
    """
    frame = pd.DataFrame(df)
    vintage = frame.iloc[:, 0]
    age = frame.iloc[:, 1]

    # Vintage: clip applies the cap in one step. The previous
    # df_['vintage_score'][mask] = 8 was chained assignment, which pandas
    # warns about and which has no effect under Copy-on-Write.
    vintage_score = (vintage // 1000 + 1).clip(upper=8)

    # age
    cut_score = [1, 2, 3, 4, 5, 6, 7, 8]
    cut_bins = [0, 10, 19, 29, 39, 49, 59, 69, 100]
    # NOTE(review): an age outside (0, 100] falls in no bin and is expected
    # to make the int cast fail -- confirm the data never contains one.
    age_score = pd.cut(age, bins=cut_bins, labels=cut_score).astype(int)

    vintage_age_score = vintage_score * age_score
    # a 2D array is required by the downstream pipeline steps
    return vintage_age_score.values.reshape(-1,1)

# vintage_age score 2 - equal population
def calculate_vintage_age_score_eqdens(df):
    """ Seniority score from equal-population (octile) vintage and age bands
        df: an array or dataframe of shape (*,2)
        column 0: vintage
        column 1: age

        return: a 2D array of shape (*,1) holding vintage_score * age_score,
        where each score is the 1-8 octile label assigned by pd.qcut. The
        octile edges are computed from the data passed in on each call.
    """
    octile_labels = [1, 2, 3, 4, 5, 6, 7, 8]
    frame = pd.DataFrame(df.copy())
    vintage_octile = pd.qcut(frame.iloc[:, 0], q=8, labels=octile_labels)
    frame['vintage_score'] = vintage_octile.astype(int)
    age_octile = pd.qcut(frame.iloc[:, 1], q=8, labels=octile_labels)
    frame['age_score'] = age_octile.astype(int)
    combined = frame['vintage_score'] * frame['age_score']
    # a 2D array is required by the downstream pipeline steps
    return combined.values.reshape(-1, 1)

#### 3) Convert custom functions into transformers

In [13]:
# Original features
# Wrap the custom functions so they can be used as pipeline steps.
# combine_cat: values above 6 are replaced by 6.
comb_cat_depend = FunctionTransformer(func=combine_cat, kw_args={'cutoff':6, 'replace':6}, validate=False)
# Sign-preserving log transform.
log_tsfm = FunctionTransformer(func=log_transform)
# Random-scaled row-mean imputation; output changes between calls unless
# NumPy's global random state is seeded.
impute_crdt_dbt = FunctionTransformer(func=impute_credit_debit)
# Row-mean imputation across the columns it receives.
impute_bal = FunctionTransformer(func=impute_balance)

# feature engineering
# Percent change between neighbouring columns.
pct_tsfm = FunctionTransformer(func=calculate_pct_change)
# Later columns divided by (first column + 1).
ratio_tsfm = FunctionTransformer(func=calculate_ratio)
# Seniority score with equal-width bands.
seniority_tsfm_dist = FunctionTransformer(func=calculate_vintage_age_score_eqdist)
# Seniority score with octile bands; the octiles are recomputed from whatever
# data each call receives.
seniority_tsfm_dens= FunctionTransformer(func=calculate_vintage_age_score_eqdens)

## Make pipelines

In [14]:
# One pipeline per column group; each is attached to its columns in the
# ColumnTransformer defined in the next cell.

# dependents: cap at 6, mark missing values with the sentinel 1000, one-hot encode.
# NOTE(review): OneHotEncoder's `sparse` argument was renamed `sparse_output`
# in later scikit-learn releases -- confirm against the installed version.
pipe_depend = Pipeline([('comb_depend',comb_cat_depend),
                        ('imput_depend',SimpleImputer(strategy='constant',fill_value=1000)),
                        ('ohe_depend',OneHotEncoder(handle_unknown='ignore',sparse=False))
                       ])
# nominal categoricals: missing values become the category 'other', then one-hot encode.
pipe_cat = Pipeline([('imput_cat',SimpleImputer(strategy='constant',fill_value='other')),
                     ('ohe_cat',OneHotEncoder(handle_unknown='ignore',sparse=False))])
# num_vars
pipe_vin_age = Pipeline([('imput_base',SimpleImputer(strategy='median')),
                      ('scal_base',StandardScaler()) ]) # vintage, age
pipe_days = Pipeline([('imput_days',SimpleImputer(strategy='median')),
                     ('log_days',log_tsfm),
                     ('scal_days',StandardScaler())]) # days_since_last_transaction
# balances: row-mean imputation, signed log, standardise.
pipe_bal = Pipeline([('imput_bal',impute_bal),#AmongFeaturesMeanImputer(features=bal_vars)),
                     ('log_bal',log_tsfm),
                     ('scal_bal',StandardScaler())
                    ])
pipe_credit = Pipeline([('imput_credit', impute_crdt_dbt), #AmongFeaturesMeanImputer(features=credit_vars)),
                     ('log_credit',log_tsfm),
                     ('scal_credit',StandardScaler())]) # credit columns
pipe_debit = Pipeline([('imput_debit',impute_crdt_dbt),
                     ('log_debit',log_tsfm),
                     ('scal_debit',StandardScaler())]) # debit columns
# feature engineering
pipe_pct = Pipeline([('imput_bal_pct',AmongFeaturesMeanImputer(features=bal_vars)),
                     ('pct_chg',pct_tsfm),
                     ('log_bal_pct',log_tsfm),
                     ('scal_bal_pct',StandardScaler()) 
                    ]) # consecutive balance percent change
pipe_vin_days = Pipeline([('imput_vinday',SimpleImputer(strategy='median')),
                          ('ratio_vinday',ratio_tsfm), # columns: ['days_since_last_transaction','vintage']
                          ('scal_vinday',StandardScaler())
                         ]) # vintage per days_since_last_transaction
# The per-person pipelines below impute only the amount columns; the
# 'dependents' denominator is passed to ratio_tsfm as it arrives.
pipe_bal_person = Pipeline([('imput_bal_pers',AmongFeaturesMeanImputer(features=bal_vars)),
                            ('ratio_bal_pers',ratio_tsfm), # df['dependents',balance columns]
                            ('log_bal_pers',log_tsfm),
                            ('scal_bal_pers',StandardScaler()) 
                           ]) # balance per person
pipe_credit_person = Pipeline([('imput_credit_pers',AmongFeaturesMeanImputer(features=credit_vars)),
                               ('ratio_credit_pers',ratio_tsfm), # df['dependents',credit columns]
                               ('log_credit_pers',log_tsfm),
                               ('scal_credit_pers',StandardScaler())]) # credit per person
# NOTE(review): the first step reuses the name 'imput_debit' from pipe_debit,
# and impute_crdt_dbt here also receives the 'dependents' column -- confirm intended.
pipe_debit_person = Pipeline([('imput_debit',impute_crdt_dbt),
                              ('ratio_debit_pers',ratio_tsfm), # df['dependents', debit columns]
                              ('log_debit_pers',log_tsfm),
                              ('scal_debit_pers',StandardScaler())]) # debit per person
pipe_seniority = Pipeline([('imput_vinage',SimpleImputer(strategy='median')),                      
                           ('sr_tsfm', seniority_tsfm_dens),
                           ('scal_vinage',StandardScaler()) 
                          ]) # vintage, age

In [15]:
# Explicit category order for the ordinal encoding of customer_nw_category.
nw_categories = [['Low','Medium','High']]
# Full preprocessing step: each entry is (name, transformer, input columns).
# The outputs are concatenated in the order listed. The per-entry column
# counts below add up to the 47 output columns reported after transforming.
pipeline_full = ColumnTransformer([
            # categorical vars
           ('depend', pipe_depend, depend), # 7 cols
           ('cat_vars', pipe_cat, cat_vars), # 8 cols
           ('ode_nw', OrdinalEncoder(categories=nw_categories), nw_cat), # 1 col
           # numeric vars
           # NOTE(review): branch_code is imputed and scaled as a numeric
           # value here -- confirm that is intended for a branch identifier.
           ('vin_age', pipe_vin_age, vin_age_vars+['branch_code']), # 3 cols
           #('vin_age', pipe_vin_age, vin_age_vars), # 2 cols
           ('days', pipe_days, ['days_since_last_transaction']), # 1 col
           ('num_bal', pipe_bal, bal_vars), # 6 cols
           ('num_debit', pipe_debit, debit_vars), # 2 cols
           ('num_credit', pipe_credit, credit_vars), # 2 cols
           # engineered features
           ('eng_pct',pipe_pct, bal_vars), # 5 cols
           ('eng_vinday', pipe_vin_days, vin_day_vars), # 1 cols
           ('eng_balpers', pipe_bal_person, depend+bal_vars), # 6 cols
           ('eng_crdpers', pipe_credit_person, depend+credit_vars), # 2 cols
           ('eng_dbtpers', pipe_debit_person, depend+debit_vars), # 2 cols
           ('eng_srscore', pipe_seniority, vin_age_vars) #[vintage, age] # 1 cols
  ]) # total cols = 47

## Transform datasets

In [16]:
# Fit every transformer on the training split only, then transform that split.
# NOTE(review): the credit/debit imputation draws random numbers on every
# transform call, so repeated calls can give different values -- seed NumPy
# beforehand if the result must be reproducible.
pipeline_full.fit(X_train)
X_train_transformed = pipeline_full.transform(X_train)

In [17]:
# Apply the transformers fitted on the training split to the test split.
# NOTE(review): the octile seniority score recomputes its bin edges from the
# data it is given, so the test-split bands are not the training-split bands.
X_test_transformed = pipeline_full.transform(X_test)

In [18]:
# Check that both splits come out with the same number of columns.
X_train_transformed.shape, X_test_transformed.shape

((13240, 47), (4414, 47))

## Next step
Next step is to feed the transformed data into models.