In [116]:
# Lets import some dependencies first
# Importing all the dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

pd.set_option("display.precision",2)
pd.set_option("display.max_columns",None)

# Graphics in SVG format are more sharp and legible
%config InlineBackend.figure_format = 'svg'

import warnings
warnings.filterwarnings("ignore")

import pickle

In [117]:
# load the data
data = pd.read_pickle('./pickled/Train_set.pkl')

In [118]:
data.head(10)

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
1998,5370,1973,PhD,Married,32644.0,1,0,16-01-2013,38,239,3,141,0,7,23,4,7,1,6,8,1,0,0,0,0,0,3,11,0
814,6303,1986,PhD,Together,91820.0,0,0,23-11-2013,72,410,73,747,76,161,30,0,5,5,12,1,0,0,0,0,0,0,3,11,0
1513,9264,1986,Graduation,Married,79529.0,0,0,27-04-2014,1,423,42,706,73,197,197,1,4,8,9,2,0,0,0,0,0,0,3,11,0
1381,7514,1956,2n Cycle,Together,54342.0,1,1,03-07-2013,74,84,10,34,11,10,28,4,3,1,4,6,0,0,0,0,0,0,3,11,0
919,4637,1954,PhD,Single,74637.0,0,0,18-05-2013,73,960,64,464,146,0,16,1,6,9,9,3,0,0,0,1,0,0,3,11,0
902,6445,1967,Graduation,Together,66825.0,0,0,14-07-2013,73,243,101,405,29,40,40,1,4,5,6,2,0,0,0,0,0,0,3,11,0
1205,1118,1956,Master,Married,50965.0,0,1,20-02-2013,87,544,13,85,8,6,29,3,10,4,5,8,1,0,0,0,0,0,3,11,0
173,1880,1959,PhD,Together,53537.0,1,1,30-01-2014,17,81,0,6,0,0,6,2,2,1,3,5,0,0,0,0,0,0,3,11,0
1557,4037,1976,Graduation,Divorced,31859.0,1,0,14-06-2013,77,3,1,3,8,0,5,1,1,0,2,7,0,0,0,0,0,0,3,11,0
1216,232,1965,Graduation,Single,61559.0,0,1,17-07-2013,8,279,83,88,32,14,34,1,4,2,10,3,0,0,0,0,0,0,3,11,0


In [119]:
data.shape

(1770, 29)

**WHAT TO DO:**:
- *Education, Marital_Status, DT_Customer* needs to be converted to numerical features!

#### *Education* : Ordinal Encoding

In [120]:
data.Education.value_counts()

Graduation    890
PhD           393
Master        287
2n Cycle      155
Basic          45
Name: Education, dtype: int64

Note:
- This is an *ORDINAL* categorical variable! So I don't wanna use one hot encoding as I want to preserve the order in some sense!

The order I want to assign is followed:

PhD>2n Cycnle>Master>Graduation>Basic

In [121]:
from sklearn.preprocessing import OrdinalEncoder

oe = OrdinalEncoder(categories=[['Basic','Graduation','Master','2n Cycle','PhD']])
data['Education'] = oe.fit_transform(data[['Education']])

In [122]:
data.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
1998,5370,1973,4.0,Married,32644.0,1,0,16-01-2013,38,239,3,141,0,7,23,4,7,1,6,8,1,0,0,0,0,0,3,11,0
814,6303,1986,4.0,Together,91820.0,0,0,23-11-2013,72,410,73,747,76,161,30,0,5,5,12,1,0,0,0,0,0,0,3,11,0
1513,9264,1986,1.0,Married,79529.0,0,0,27-04-2014,1,423,42,706,73,197,197,1,4,8,9,2,0,0,0,0,0,0,3,11,0
1381,7514,1956,3.0,Together,54342.0,1,1,03-07-2013,74,84,10,34,11,10,28,4,3,1,4,6,0,0,0,0,0,0,3,11,0
919,4637,1954,4.0,Single,74637.0,0,0,18-05-2013,73,960,64,464,146,0,16,1,6,9,9,3,0,0,0,1,0,0,3,11,0


#### Marital Status : OnehotEncoded

In [123]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse=False)
result = pd.DataFrame(ohe.fit_transform(data[['Marital_Status']]))

result.columns = ohe.get_feature_names(['Marital Status'])
data.drop(['Marital_Status'],axis=1,inplace=True)

data = pd.concat([data.reset_index(),pd.DataFrame(result)],axis=1)

In [124]:
data.head()

Unnamed: 0,index,ID,Year_Birth,Education,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,Marital Status_Together,Marital Status_Widow
0,1998,5370,1973,4.0,32644.0,1,0,16-01-2013,38,239,3,141,0,7,23,4,7,1,6,8,1,0,0,0,0,0,3,11,0,0.0,1.0,0.0,0.0,0.0
1,814,6303,1986,4.0,91820.0,0,0,23-11-2013,72,410,73,747,76,161,30,0,5,5,12,1,0,0,0,0,0,0,3,11,0,0.0,0.0,0.0,1.0,0.0
2,1513,9264,1986,1.0,79529.0,0,0,27-04-2014,1,423,42,706,73,197,197,1,4,8,9,2,0,0,0,0,0,0,3,11,0,0.0,1.0,0.0,0.0,0.0
3,1381,7514,1956,3.0,54342.0,1,1,03-07-2013,74,84,10,34,11,10,28,4,3,1,4,6,0,0,0,0,0,0,3,11,0,0.0,0.0,0.0,1.0,0.0
4,919,4637,1954,4.0,74637.0,0,0,18-05-2013,73,960,64,464,146,0,16,1,6,9,9,3,0,0,0,1,0,0,3,11,0,0.0,0.0,1.0,0.0,0.0


#### Dt_Customer : Date

In [125]:
data.Dt_Customer.dtype

dtype('O')

In [126]:
data.Dt_Customer.head(10)

0    16-01-2013
1    23-11-2013
2    27-04-2014
3    03-07-2013
4    18-05-2013
5    14-07-2013
6    20-02-2013
7    30-01-2014
8    14-06-2013
9    17-07-2013
Name: Dt_Customer, dtype: object

So its in the basic object dtype!

The way I wanna encode this to numeric is first converting it to a date, and the counting how many days he's in with the company! The reason being I cant find any cyclical nature in the feature! So its kinda dumb to encode this using sine,cosine transformation!

In [127]:
from datetime import datetime
dates = pd.to_datetime(data['Dt_Customer'].astype('str'))
elapsed_time = (datetime.now() - dates)
elapsed_days = elapsed_time.apply(lambda x: x.days)
data['Dt_Customer'] = elapsed_days

In [128]:
data.Dt_Customer.head(10)

0    3499
1    3188
2    3033
3    3449
4    3377
5    3320
6    3464
7    3120
8    3350
9    3317
Name: Dt_Customer, dtype: int64

Looks good now

In [129]:
data.head(10)

Unnamed: 0,index,ID,Year_Birth,Education,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,Marital Status_Together,Marital Status_Widow
0,1998,5370,1973,4.0,32644.0,1,0,3499,38,239,3,141,0,7,23,4,7,1,6,8,1,0,0,0,0,0,3,11,0,0.0,1.0,0.0,0.0,0.0
1,814,6303,1986,4.0,91820.0,0,0,3188,72,410,73,747,76,161,30,0,5,5,12,1,0,0,0,0,0,0,3,11,0,0.0,0.0,0.0,1.0,0.0
2,1513,9264,1986,1.0,79529.0,0,0,3033,1,423,42,706,73,197,197,1,4,8,9,2,0,0,0,0,0,0,3,11,0,0.0,1.0,0.0,0.0,0.0
3,1381,7514,1956,3.0,54342.0,1,1,3449,74,84,10,34,11,10,28,4,3,1,4,6,0,0,0,0,0,0,3,11,0,0.0,0.0,0.0,1.0,0.0
4,919,4637,1954,4.0,74637.0,0,0,3377,73,960,64,464,146,0,16,1,6,9,9,3,0,0,0,1,0,0,3,11,0,0.0,0.0,1.0,0.0,0.0
5,902,6445,1967,1.0,66825.0,0,0,3320,73,243,101,405,29,40,40,1,4,5,6,2,0,0,0,0,0,0,3,11,0,0.0,0.0,0.0,1.0,0.0
6,1205,1118,1956,2.0,50965.0,0,1,3464,87,544,13,85,8,6,29,3,10,4,5,8,1,0,0,0,0,0,3,11,0,0.0,1.0,0.0,0.0,0.0
7,173,1880,1959,4.0,53537.0,1,1,3120,17,81,0,6,0,0,6,2,2,1,3,5,0,0,0,0,0,0,3,11,0,0.0,0.0,0.0,1.0,0.0
8,1557,4037,1976,1.0,31859.0,1,0,3350,77,3,1,3,8,0,5,1,1,0,2,7,0,0,0,0,0,0,3,11,0,1.0,0.0,0.0,0.0,0.0
9,1216,232,1965,1.0,61559.0,0,1,3317,8,279,83,88,32,14,34,1,4,2,10,3,0,0,0,0,0,0,3,11,0,0.0,0.0,1.0,0.0,0.0


In [130]:
data.dtypes

index                        int64
ID                           int64
Year_Birth                   int64
Education                  float64
Income                     float64
Kidhome                      int64
Teenhome                     int64
Dt_Customer                  int64
Recency                      int64
MntWines                     int64
MntFruits                    int64
MntMeatProducts              int64
MntFishProducts              int64
MntSweetProducts             int64
MntGoldProds                 int64
NumDealsPurchases            int64
NumWebPurchases              int64
NumCatalogPurchases          int64
NumStorePurchases            int64
NumWebVisitsMonth            int64
AcceptedCmp3                 int64
AcceptedCmp4                 int64
AcceptedCmp5                 int64
AcceptedCmp1                 int64
AcceptedCmp2                 int64
Complain                     int64
Z_CostContact                int64
Z_Revenue                    int64
Response            

In [131]:
data.to_pickle('./pickled/Train Set All Numeric')

#### Feature Scaling

There's basically two scaling technique to apply depending on the scenerio. 1. Normalization and Standarization. I'm gonna use Standarization as most of the features are normally distributed as observed in the EDA part of this project!

In sklearn its implemented as Standard Scaler

Also, as not all features not need to be transformed, lets use ColumnTransformer to make our life easier.

In [132]:
data.head(10)

Unnamed: 0,index,ID,Year_Birth,Education,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,Marital Status_Together,Marital Status_Widow
0,1998,5370,1973,4.0,32644.0,1,0,3499,38,239,3,141,0,7,23,4,7,1,6,8,1,0,0,0,0,0,3,11,0,0.0,1.0,0.0,0.0,0.0
1,814,6303,1986,4.0,91820.0,0,0,3188,72,410,73,747,76,161,30,0,5,5,12,1,0,0,0,0,0,0,3,11,0,0.0,0.0,0.0,1.0,0.0
2,1513,9264,1986,1.0,79529.0,0,0,3033,1,423,42,706,73,197,197,1,4,8,9,2,0,0,0,0,0,0,3,11,0,0.0,1.0,0.0,0.0,0.0
3,1381,7514,1956,3.0,54342.0,1,1,3449,74,84,10,34,11,10,28,4,3,1,4,6,0,0,0,0,0,0,3,11,0,0.0,0.0,0.0,1.0,0.0
4,919,4637,1954,4.0,74637.0,0,0,3377,73,960,64,464,146,0,16,1,6,9,9,3,0,0,0,1,0,0,3,11,0,0.0,0.0,1.0,0.0,0.0
5,902,6445,1967,1.0,66825.0,0,0,3320,73,243,101,405,29,40,40,1,4,5,6,2,0,0,0,0,0,0,3,11,0,0.0,0.0,0.0,1.0,0.0
6,1205,1118,1956,2.0,50965.0,0,1,3464,87,544,13,85,8,6,29,3,10,4,5,8,1,0,0,0,0,0,3,11,0,0.0,1.0,0.0,0.0,0.0
7,173,1880,1959,4.0,53537.0,1,1,3120,17,81,0,6,0,0,6,2,2,1,3,5,0,0,0,0,0,0,3,11,0,0.0,0.0,0.0,1.0,0.0
8,1557,4037,1976,1.0,31859.0,1,0,3350,77,3,1,3,8,0,5,1,1,0,2,7,0,0,0,0,0,0,3,11,0,1.0,0.0,0.0,0.0,0.0
9,1216,232,1965,1.0,61559.0,0,1,3317,8,279,83,88,32,14,34,1,4,2,10,3,0,0,0,0,0,0,3,11,0,0.0,0.0,1.0,0.0,0.0


In [133]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
cols = ['Year_Birth','Education', 'Income', 'Kidhome',
        'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
        'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
        'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
        'NumCatalogPurchases', 'NumStorePurchases',
        'NumWebVisitsMonth', 'Z_CostContact', 'Z_Revenue']

# Init the Standard Scaler
scaler = StandardScaler()

# Init the Column Transformer
ct = ColumnTransformer([
    ('standarizer', scaler, cols)
], remainder='passthrough')


In [134]:
data = pd.DataFrame(ct.fit_transform(data),columns=data.columns)
data.head(10)

Unnamed: 0,index,ID,Year_Birth,Education,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,Marital Status_Together,Marital Status_Widow
0,0.34,1.61,-0.88,1.02,-0.93,0.75,-0.38,-0.19,-0.59,-0.1,-0.69,-0.49,-0.41,0.88,1.07,-0.57,0.06,1.11,0.0,0.0,1998.0,5370.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.46,1.61,1.87,-0.82,-0.93,-0.57,0.79,0.33,1.18,2.61,0.71,3.27,-0.27,-1.22,0.34,0.8,1.92,-1.78,0.0,0.0,814.0,6303.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.46,-0.78,1.29,-0.82,-0.93,-1.23,-1.66,0.36,0.4,2.43,0.66,4.15,2.92,-0.7,-0.03,1.82,0.99,-1.37,0.0,0.0,1513.0,9264.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-1.11,0.81,0.12,1.02,0.92,0.54,0.86,-0.66,-0.41,-0.58,-0.48,-0.41,-0.31,0.88,-0.39,-0.57,-0.55,0.28,0.0,0.0,1381.0,7514.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-1.29,1.61,1.07,-0.82,-0.93,0.23,0.83,1.98,0.95,1.34,2.0,-0.66,-0.54,-0.7,0.7,2.16,0.99,-0.96,0.0,0.0,919.0,4637.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,-0.17,-0.78,0.7,-0.82,-0.93,-0.01,0.83,-0.18,1.89,1.08,-0.15,0.32,-0.08,-0.7,-0.03,0.8,0.06,-1.37,0.0,0.0,902.0,6445.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6,-1.11,0.02,-0.03,-0.82,0.92,0.6,1.31,0.73,-0.33,-0.36,-0.54,-0.51,-0.29,0.35,2.16,0.46,-0.24,1.11,0.0,0.0,1205.0,1118.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,-0.86,1.61,0.09,1.02,0.92,-0.86,-1.11,-0.67,-0.66,-0.71,-0.69,-0.66,-0.73,-0.17,-0.76,-0.57,-0.86,-0.13,0.0,0.0,173.0,1880.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8,0.6,-0.78,-0.92,1.02,-0.93,0.12,0.97,-0.9,-0.64,-0.72,-0.54,-0.66,-0.75,-0.7,-1.12,-0.91,-1.17,0.69,0.0,0.0,1557.0,4037.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,-0.34,-0.78,0.46,-0.82,0.92,-0.02,-1.42,-0.07,1.44,-0.34,-0.1,-0.32,-0.2,-0.7,-0.03,-0.22,1.3,-0.96,0.0,0.0,1216.0,232.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [136]:
data.to_pickle('./pickled/Train_Set_Scaled')