**Creating entities and relationships**

In [0]:
# Defining the URL of the raw bank-full.csv dataset file
file_url = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter17/Datasets/bank-full.csv'

In [3]:
# Reading the semicolon-separated CSV into a DataFrame and previewing its
# first rows
import pandas as pd
bankData = pd.read_csv(file_url, sep=';')
bankData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [0]:
# Separating the target variable 'y' from the predictors.
# DataFrame.pop removes the column from bankData in place, so running this
# cell a second time would raise a KeyError. The guard keeps the cell safe to
# re-run: once 'y' has been removed, the previously extracted Y is left as is.
if 'y' in bankData.columns:
    Y = bankData.pop('y')

In [0]:
# Building the key for the Demographic entity: each row label rendered as
# text and prefixed with 'cust' (cust0, cust1, ...)
bankData['custID'] = 'cust' + bankData.index.to_series().astype(str)

In [0]:
# AssetId is 1 for customers whose 'housing' value is 'yes' and 0 otherwise
bankData['AssetId'] = (bankData['housing'] == 'yes').astype('int64')

In [0]:
# LoanId is 1 for customers whose 'loan' value is 'yes' and 0 otherwise
bankData['LoanId'] = (bankData['loan'] == 'yes').astype('int64')

In [0]:
# FinbehId (financial behaviour) is 1 for customers whose 'default' value is
# 'yes' and 0 otherwise
bankData['FinbehId'] = (bankData['default'] == 'yes').astype('int64')

In [0]:
# Importing necessary libraries
import featuretools as ft
import numpy as np

In [0]:
# Initialising an empty entity set with the id 'Bank'; the entities are
# attached to it in the following cells
Bankentities = ft.EntitySet(id='Bank')

In [0]:
# Registering bankData as the parent entity 'Demographic Data', keyed on the
# custID column
Bankentities.entity_from_dataframe(
    entity_id='Demographic Data',
    dataframe=bankData,
    index='custID',
)

Entityset: Bank
  Entities:
    Demographic Data [Rows: 45211, Columns: 20]
  Relationships:
    No relationships

In [0]:
# Splitting an 'Assets' entity out of 'Demographic Data': AssetId becomes its
# index and the 'housing' column moves across with it
Bankentities.normalize_entity(
    base_entity_id='Demographic Data',
    new_entity_id='Assets',
    index='AssetId',
    additional_variables=['housing'],
)

Entityset: Bank
  Entities:
    Demographic Data [Rows: 45211, Columns: 19]
    Assets [Rows: 2, Columns: 2]
  Relationships:
    Demographic Data.AssetId -> Assets.AssetId

In [0]:
# Splitting out the 'Liability' entity (indexed by LoanId, carrying 'loan')
Bankentities.normalize_entity(
    base_entity_id='Demographic Data',
    new_entity_id='Liability',
    index='LoanId',
    additional_variables=['loan'],
)

# Splitting out the 'FinBehaviour' entity (indexed by FinbehId, carrying
# 'default'); as the last expression, its result is what the cell displays
Bankentities.normalize_entity(
    base_entity_id='Demographic Data',
    new_entity_id='FinBehaviour',
    index='FinbehId',
    additional_variables=['default'],
)

Entityset: Bank
  Entities:
    Demographic Data [Rows: 45211, Columns: 17]
    Assets [Rows: 2, Columns: 2]
    Liability [Rows: 2, Columns: 2]
    FinBehaviour [Rows: 2, Columns: 2]
  Relationships:
    Demographic Data.AssetId -> Assets.AssetId
    Demographic Data.LoanId -> Liability.LoanId
    Demographic Data.FinbehId -> FinBehaviour.FinbehId

**Feature Engineering**

In [0]:
# Running Deep Feature Synthesis with the default primitives, targeting the
# 'Demographic Data' entity; returns the feature matrix and the list of
# feature definitions
feature_set, feature_names = ft.dfs(
    entityset=Bankentities,
    target_entity='Demographic Data',
    max_depth=2,
    verbose=1,
    n_jobs=1,
)

Built 196 features
Elapsed: 00:13 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 11/11 chunks


In [0]:
# Putting the feature matrix back into the row order of bankData and exposing
# custID as an ordinary column.
# The guard makes this cell idempotent: ft.dfs returns a matrix indexed by
# custID, but after reset_index() the index is a plain integer range.
# Reindexing that by the custID strings a second time would silently replace
# every row with NaN, so the step is skipped once it has been applied.
if feature_set.index.name == 'custID':
    feature_set = feature_set.reindex(index=bankData['custID']).reset_index()

In [0]:
# Comparing the dimensions of the generated feature set with those of the
# original bank data (one shape per line)
print(feature_set.shape, bankData.shape, sep='\n')

(45211, 197)
(45211, 20)


In [0]:
# Displaying the first rows of the feature set (head() shows five rows by default)
feature_set.head()

Unnamed: 0,custID,age,job,marital,education,balance,contact,day,month,duration,campaign,pdays,previous,poutcome,AssetId,LoanId,FinbehId,Assets.housing,Liability.loan,FinBehaviour.default,Assets.SUM(Demographic Data.age),Assets.SUM(Demographic Data.balance),Assets.SUM(Demographic Data.day),Assets.SUM(Demographic Data.duration),Assets.SUM(Demographic Data.campaign),Assets.SUM(Demographic Data.pdays),Assets.SUM(Demographic Data.previous),Assets.STD(Demographic Data.age),Assets.STD(Demographic Data.balance),Assets.STD(Demographic Data.day),Assets.STD(Demographic Data.duration),Assets.STD(Demographic Data.campaign),Assets.STD(Demographic Data.pdays),Assets.STD(Demographic Data.previous),Assets.MAX(Demographic Data.age),Assets.MAX(Demographic Data.balance),Assets.MAX(Demographic Data.day),Assets.MAX(Demographic Data.duration),Assets.MAX(Demographic Data.campaign),Assets.MAX(Demographic Data.pdays),...,FinBehaviour.MAX(Demographic Data.pdays),FinBehaviour.MAX(Demographic Data.previous),FinBehaviour.SKEW(Demographic Data.age),FinBehaviour.SKEW(Demographic Data.balance),FinBehaviour.SKEW(Demographic Data.day),FinBehaviour.SKEW(Demographic Data.duration),FinBehaviour.SKEW(Demographic Data.campaign),FinBehaviour.SKEW(Demographic Data.pdays),FinBehaviour.SKEW(Demographic Data.previous),FinBehaviour.MIN(Demographic Data.age),FinBehaviour.MIN(Demographic Data.balance),FinBehaviour.MIN(Demographic Data.day),FinBehaviour.MIN(Demographic Data.duration),FinBehaviour.MIN(Demographic Data.campaign),FinBehaviour.MIN(Demographic Data.pdays),FinBehaviour.MIN(Demographic Data.previous),FinBehaviour.MEAN(Demographic Data.age),FinBehaviour.MEAN(Demographic Data.balance),FinBehaviour.MEAN(Demographic Data.day),FinBehaviour.MEAN(Demographic Data.duration),FinBehaviour.MEAN(Demographic Data.campaign),FinBehaviour.MEAN(Demographic Data.pdays),FinBehaviour.MEAN(Demographic Data.previous),FinBehaviour.COUNT(Demographic Data),FinBehaviour.NUM_UNIQUE(Demographic 
Data.job),FinBehaviour.NUM_UNIQUE(Demographic Data.marital),FinBehaviour.NUM_UNIQUE(Demographic Data.education),FinBehaviour.NUM_UNIQUE(Demographic Data.contact),FinBehaviour.NUM_UNIQUE(Demographic Data.month),FinBehaviour.NUM_UNIQUE(Demographic Data.poutcome),FinBehaviour.NUM_UNIQUE(Demographic Data.AssetId),FinBehaviour.NUM_UNIQUE(Demographic Data.LoanId),FinBehaviour.MODE(Demographic Data.job),FinBehaviour.MODE(Demographic Data.marital),FinBehaviour.MODE(Demographic Data.education),FinBehaviour.MODE(Demographic Data.contact),FinBehaviour.MODE(Demographic Data.month),FinBehaviour.MODE(Demographic Data.poutcome),FinBehaviour.MODE(Demographic Data.AssetId),FinBehaviour.MODE(Demographic Data.LoanId)
0,cust0,58,management,married,tertiary,2143,unknown,5,may,261,1,-1,0,unknown,1,0,0,yes,no,no,984475,29530340,391984,6517000,67813,1289483,16502,8.926807,2483.285761,8.026836,258.321907,3.140979,113.862848,2.707428,78,58544,31,4918,63,854,...,871,275,0.687031,8.334719,0.094405,3.152469,4.926324,2.600138,41.986484,18,-4057,1,0,1,-1,0,40.961934,1389.806424,15.795792,258.512749,2.75678,40.604536,0.586044,44396,12,3,4,3,12,4,2,2,blue-collar,married,secondary,cellular,may,unknown,1,0
1,cust1,44,technician,single,secondary,29,unknown,5,may,151,1,-1,0,unknown,1,0,0,yes,no,no,984475,29530340,391984,6517000,67813,1289483,16502,8.926807,2483.285761,8.026836,258.321907,3.140979,113.862848,2.707428,78,58544,31,4918,63,854,...,871,275,0.687031,8.334719,0.094405,3.152469,4.926324,2.600138,41.986484,18,-4057,1,0,1,-1,0,40.961934,1389.806424,15.795792,258.512749,2.75678,40.604536,0.586044,44396,12,3,4,3,12,4,2,2,blue-collar,married,secondary,cellular,may,unknown,1,0
2,cust2,33,entrepreneur,married,secondary,2,unknown,5,may,76,1,-1,0,unknown,1,1,0,yes,yes,no,984475,29530340,391984,6517000,67813,1289483,16502,8.926807,2483.285761,8.026836,258.321907,3.140979,113.862848,2.707428,78,58544,31,4918,63,854,...,871,275,0.687031,8.334719,0.094405,3.152469,4.926324,2.600138,41.986484,18,-4057,1,0,1,-1,0,40.961934,1389.806424,15.795792,258.512749,2.75678,40.604536,0.586044,44396,12,3,4,3,12,4,2,2,blue-collar,married,secondary,cellular,may,unknown,1,0
3,cust3,47,blue-collar,married,unknown,1506,unknown,5,may,92,1,-1,0,unknown,1,0,0,yes,no,no,984475,29530340,391984,6517000,67813,1289483,16502,8.926807,2483.285761,8.026836,258.321907,3.140979,113.862848,2.707428,78,58544,31,4918,63,854,...,871,275,0.687031,8.334719,0.094405,3.152469,4.926324,2.600138,41.986484,18,-4057,1,0,1,-1,0,40.961934,1389.806424,15.795792,258.512749,2.75678,40.604536,0.586044,44396,12,3,4,3,12,4,2,2,blue-collar,married,secondary,cellular,may,unknown,1,0
4,cust4,33,unknown,single,unknown,1,unknown,5,may,198,1,-1,0,unknown,0,0,0,no,no,no,866292,32059342,322640,5154811,57143,527901,9735,12.058696,3613.405339,8.671437,256.529524,3.041508,77.461032,1.660227,95,102127,31,3881,41,871,...,871,275,0.687031,8.334719,0.094405,3.152469,4.926324,2.600138,41.986484,18,-4057,1,0,1,-1,0,40.961934,1389.806424,15.795792,258.512749,2.75678,40.604536,0.586044,44396,12,3,4,3,12,4,2,2,blue-collar,married,secondary,cellular,may,unknown,1,0


In [0]:
# Cross-checking the generated feature Assets.SUM(Demographic Data.balance)
# by summing 'balance' per AssetId directly on the original data
bankData.groupby('AssetId')['balance'].sum()

AssetId
0    32059342
1    29530340
Name: balance, dtype: int64

In [0]:
# Displaying the list of feature definitions returned by ft.dfs
feature_names

[<Feature: age>,
 <Feature: job>,
 <Feature: marital>,
 <Feature: education>,
 <Feature: balance>,
 <Feature: contact>,
 <Feature: day>,
 <Feature: month>,
 <Feature: duration>,
 <Feature: campaign>,
 <Feature: pdays>,
 <Feature: previous>,
 <Feature: poutcome>,
 <Feature: AssetId>,
 <Feature: LoanId>,
 <Feature: FinbehId>,
 <Feature: Assets.housing>,
 <Feature: Liability.loan>,
 <Feature: FinBehaviour.default>,
 <Feature: Assets.SUM(Demographic Data.age)>,
 <Feature: Assets.SUM(Demographic Data.balance)>,
 <Feature: Assets.SUM(Demographic Data.day)>,
 <Feature: Assets.SUM(Demographic Data.duration)>,
 <Feature: Assets.SUM(Demographic Data.campaign)>,
 <Feature: Assets.SUM(Demographic Data.pdays)>,
 <Feature: Assets.SUM(Demographic Data.previous)>,
 <Feature: Assets.STD(Demographic Data.age)>,
 <Feature: Assets.STD(Demographic Data.balance)>,
 <Feature: Assets.STD(Demographic Data.day)>,
 <Feature: Assets.STD(Demographic Data.duration)>,
 <Feature: Assets.STD(Demographic Data.campaign)

**Configuring the primitives**

In [0]:
# Aggregation primitives to apply in place of the featuretools defaults
aggPrimitives = ['std', 'min', 'max', 'mean', 'last', 'count']

# Transformation primitives to apply in place of the featuretools defaults
tranPrimitives = ['percentile', 'subtract', 'divide']

In [0]:
# Re-running Deep Feature Synthesis with the explicitly chosen aggregation and
# transformation primitives; this overwrites feature_set and feature_names
# from the earlier run
feature_set, feature_names = ft.dfs(
    entityset=Bankentities,
    target_entity='Demographic Data',
    agg_primitives=aggPrimitives,
    trans_primitives=tranPrimitives,
    max_depth=2,
    verbose=1,
    n_jobs=1,
)

Built 3420 features
Elapsed: 01:35 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 11/11 chunks


In [0]:
# Displaying the first rows of the new feature set; it is indexed by custID
# and has not been reindexed to the row order of bankData
feature_set.head()

Unnamed: 0_level_0,age,job,marital,education,balance,contact,day,month,duration,campaign,pdays,previous,poutcome,AssetId,LoanId,FinbehId,PERCENTILE(age),PERCENTILE(balance),PERCENTILE(day),PERCENTILE(duration),PERCENTILE(campaign),PERCENTILE(pdays),PERCENTILE(previous),day - balance,campaign - previous,day - campaign,duration - balance,balance - pdays,campaign - age,day - previous,balance - duration,balance - day,duration - campaign,day - pdays,balance - previous,previous - balance,pdays - age,day - duration,balance - age,previous - pdays,...,FinBehaviour.STD(Demographic Data.campaign),FinBehaviour.STD(Demographic Data.pdays),FinBehaviour.STD(Demographic Data.previous),FinBehaviour.MIN(Demographic Data.age),FinBehaviour.MIN(Demographic Data.balance),FinBehaviour.MIN(Demographic Data.day),FinBehaviour.MIN(Demographic Data.duration),FinBehaviour.MIN(Demographic Data.campaign),FinBehaviour.MIN(Demographic Data.pdays),FinBehaviour.MIN(Demographic Data.previous),FinBehaviour.MAX(Demographic Data.age),FinBehaviour.MAX(Demographic Data.balance),FinBehaviour.MAX(Demographic Data.day),FinBehaviour.MAX(Demographic Data.duration),FinBehaviour.MAX(Demographic Data.campaign),FinBehaviour.MAX(Demographic Data.pdays),FinBehaviour.MAX(Demographic Data.previous),FinBehaviour.MEAN(Demographic Data.age),FinBehaviour.MEAN(Demographic Data.balance),FinBehaviour.MEAN(Demographic Data.day),FinBehaviour.MEAN(Demographic Data.duration),FinBehaviour.MEAN(Demographic Data.campaign),FinBehaviour.MEAN(Demographic Data.pdays),FinBehaviour.MEAN(Demographic Data.previous),FinBehaviour.LAST(Demographic Data.age),FinBehaviour.LAST(Demographic Data.job),FinBehaviour.LAST(Demographic Data.marital),FinBehaviour.LAST(Demographic Data.education),FinBehaviour.LAST(Demographic Data.balance),FinBehaviour.LAST(Demographic Data.contact),FinBehaviour.LAST(Demographic Data.day),FinBehaviour.LAST(Demographic Data.month),FinBehaviour.LAST(Demographic Data.duration),FinBehaviour.LAST(Demographic 
Data.campaign),FinBehaviour.LAST(Demographic Data.pdays),FinBehaviour.LAST(Demographic Data.previous),FinBehaviour.LAST(Demographic Data.poutcome),FinBehaviour.LAST(Demographic Data.AssetId),FinBehaviour.LAST(Demographic Data.LoanId),FinBehaviour.COUNT(Demographic Data)
custID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
cust0,58,management,married,tertiary,2143,unknown,5,may,261,1,-1,0,unknown,1,0,0,0.935337,0.822919,0.112683,0.671717,0.194035,0.408695,0.408695,-2138,1,4,-1882,2144,-57,5,1882,2138,260,6,2143,-2143,-59,-256,2085,1,...,3.087038,100.50463,2.313596,18,-4057,1,0,1,-1,0,95,102127,31,4918,63,871,275,40.961934,1389.806424,15.795792,258.512749,2.75678,40.604536,0.586044,42,technician,married,secondary,8036,unknown,9,jun,948,5,-1,0,unknown,0,0,44396
cust1,44,technician,single,secondary,29,unknown,5,may,151,1,-1,0,unknown,1,0,0,0.640983,0.20819,0.112683,0.413373,0.194035,0.408695,0.408695,-24,1,4,122,30,-43,5,-122,24,150,6,29,-29,-45,-146,-15,1,...,3.087038,100.50463,2.313596,18,-4057,1,0,1,-1,0,95,102127,31,4918,63,871,275,40.961934,1389.806424,15.795792,258.512749,2.75678,40.604536,0.586044,42,technician,married,secondary,8036,unknown,9,jun,948,5,-1,0,unknown,0,0,44396
cust10,41,admin.,divorced,secondary,270,unknown,5,may,222,1,-1,0,unknown,1,0,0,0.560992,0.398863,0.112683,0.598549,0.194035,0.408695,0.408695,-265,1,4,-48,271,-40,5,48,265,221,6,270,-270,-42,-217,229,1,...,3.087038,100.50463,2.313596,18,-4057,1,0,1,-1,0,95,102127,31,4918,63,871,275,40.961934,1389.806424,15.795792,258.512749,2.75678,40.604536,0.586044,42,technician,married,secondary,8036,unknown,9,jun,948,5,-1,0,unknown,0,0,44396
cust100,44,blue-collar,married,secondary,-674,unknown,5,may,257,1,-1,0,unknown,1,0,0,0.640983,0.008847,0.112683,0.664993,0.194035,0.408695,0.408695,679,1,4,931,-673,-43,5,-931,-679,256,6,-674,674,-45,-252,-718,1,...,3.087038,100.50463,2.313596,18,-4057,1,0,1,-1,0,95,102127,31,4918,63,871,275,40.961934,1389.806424,15.795792,258.512749,2.75678,40.604536,0.586044,42,technician,married,secondary,8036,unknown,9,jun,948,5,-1,0,unknown,0,0,44396
cust1000,47,admin.,married,unknown,0,unknown,7,may,164,1,-1,0,unknown,1,0,0,0.718465,0.122172,0.196634,0.454358,0.194035,0.408695,0.408695,7,1,6,164,1,-46,7,-164,-7,163,8,0,0,-48,-157,-47,1,...,3.087038,100.50463,2.313596,18,-4057,1,0,1,-1,0,95,102127,31,4918,63,871,275,40.961934,1389.806424,15.795792,258.512749,2.75678,40.604536,0.586044,42,technician,married,secondary,8036,unknown,9,jun,948,5,-1,0,unknown,0,0,44396
