# Sklearn Pipeline for Scoring New Data

In [1]:
import pickle
import pandas as pd

# Import Data
The dataset is available on Kaggle.

In [2]:
# Read the customer table from the local CSV, using customer_id as the row
# index. The 'churn' column is dropped right away because it is not used for
# segmentation.
df = pd.read_csv(
    '../data/bank_customers_churn_dataset.csv',
    index_col='customer_id',
).drop(columns='churn')

df.head()

Unnamed: 0_level_0,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
15634602,619,France,Female,42,2,0.0,1,1,1,101348.88
15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57
15701354,699,France,Female,39,1,0.0,2,0,0,93826.63
15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1


# Load Model Artifacts

In [3]:
def _load_artifact(path):
    """Unpickle and return the object stored at ``path``.

    pickle can execute arbitrary code while loading, so only point this at
    artifact files that come from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# trained model to impute missing numeric data
numeric_imputer = _load_artifact('../artifacts/numeric_imputer.pickle')

# trained model to impute missing categorical data
categorical_imputer = _load_artifact('../artifacts/categorical_imputer.pickle')

# trained model to encode rare labels
rare_encoder = _load_artifact('../artifacts/rare_encoder.pickle')

# trained model to cap outliers
capper = _load_artifact('../artifacts/capper.pickle')

# trained one hot encoder
enc = _load_artifact('../artifacts/enc.pickle')

# trained random forest classifier
model = _load_artifact('../artifacts/model.pickle')

# Transform Dataset

In [None]:
# Split the feature names by dtype: columns whose dtype is in `numerics` are
# treated as numeric, every other column is treated as categorical.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

numeric_columns = list(df.select_dtypes(include=numerics).columns)
categorical_columns = list(df.select_dtypes(exclude=numerics).columns)

In [None]:
def _as_frame(values, columns, index):
    """Wrap transformer output in a DataFrame with the given column labels and row index."""
    return pd.DataFrame(values, columns=columns, index=index)


# 1. impute missing values, numeric and categorical features separately
df_numeric = _as_frame(
    numeric_imputer.transform(df[numeric_columns]), numeric_columns, df.index)
df_categorical = _as_frame(
    categorical_imputer.transform(df[categorical_columns]), categorical_columns, df.index)

# 2. concatenate the imputed features into one frame (numeric columns first)
df = pd.concat([df_numeric, df_categorical], axis=1)

# 3. apply the rare-label encoder to the categorical features
df[categorical_columns] = rare_encoder.transform(df[categorical_columns])

# 4. apply the outlier capper to the numeric features
df[numeric_columns] = capper.transform(df[numeric_columns])

# 5. one-hot encode the categorical features; column names come from the encoder
df_cat_hotenc = _as_frame(
    enc.transform(df[categorical_columns]), enc.get_feature_names_out(), df.index)

# 6. model input: numeric features followed by the one-hot encoded categoricals
df_hotenc = pd.concat([df[numeric_columns], df_cat_hotenc], axis=1)

# 7. predict the cluster of every row and store it next to the features
labels = model.predict(df_hotenc)
df['cluster'] = labels

df.head()

Unnamed: 0_level_0,credit_score,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,country,gender,cluster
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
15634602,619.0,42.0,2.0,0.0,1.0,1.0,1.0,101348.88,France,Female,7
15647311,608.0,41.0,1.0,83807.86,1.0,0.0,1.0,112542.58,Spain,Female,0
15619304,502.0,42.0,8.0,159660.8,3.0,1.0,0.0,113931.57,France,Female,0
15701354,699.0,39.0,1.0,0.0,2.0,0.0,0.0,93826.63,France,Female,9
15737888,850.0,43.0,2.0,125510.82,1.0,1.0,1.0,79084.1,Spain,Female,1


In [None]:
# Cluster Counts: number of rows assigned to each cluster, largest cluster first
df['cluster'].value_counts(ascending=False)

0     1414
4     1011
6      947
7      803
10     758
3      509
5      489
13     457
12     455
11     433
16     397
1      388
2      379
14     371
8      360
15     320
9      315
18     112
17      82
Name: cluster, dtype: int64

In [None]:
# Cluster Centers
# Per-cluster profile: the mean of every numeric feature next to the most
# frequent value (mode) of every categorical feature.
#
# The columns are selected *before* aggregating. Calling .mean() on the whole
# frame relies on pandas silently dropping the non-numeric columns, which is
# deprecated and raises a TypeError in pandas >= 2.0. Selecting first also
# avoids computing the mode of the continuous columns only to discard it.
by_cluster = df.groupby(by='cluster')
pd.concat([
    by_cluster[numeric_columns].mean(),
    by_cluster[categorical_columns].aggregate(pd.Series.mode) ],
    axis=1)

  df.groupby(by='cluster').mean(),
  df.groupby(by='cluster').aggregate(pd.Series.mode)[categorical_columns] ],


Unnamed: 0_level_0,credit_score,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,country,gender
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,655.366337,40.833249,4.864215,96223.207482,1.570065,0.046676,0.52546,101502.572765,Spain,Female
1,651.958763,40.005147,4.976804,56574.89817,1.52908,1.0,1.0,99109.426108,Spain,Female
2,653.248021,39.643792,4.76781,120840.295963,1.545775,1.0,1.0,101213.050765,Germany,Female
3,652.300589,39.065265,5.074656,65603.672141,1.483301,1.0,1.0,95539.391591,Spain,Male
4,652.347181,39.125613,5.091988,60009.775242,1.53892,1.0,1.0,99398.792305,France,Male
5,654.233129,39.806855,4.95092,120010.719489,1.558551,1.0,1.0,100452.116708,Germany,Male
6,649.157339,36.931127,5.139388,64589.57038,1.479707,1.0,0.0,100604.350391,France,Male
7,653.882939,39.489958,4.975093,58125.600349,1.522146,1.0,1.0,96253.002304,France,Female
8,648.922222,38.044444,5.0,53953.735278,1.589675,1.0,0.0,97977.700222,Spain,Female
9,645.222222,37.058905,5.260317,59770.595587,1.553279,0.0,0.0,103169.650381,France,Female
