In [1]:
import pandas as pd
import numpy as np
import featuretools as ft

es = ft.EntitySet(id = 'clients')
print(es)

Entityset: clients
  Entities:
  Relationships:
    No relationships


In [10]:
clients = pd.read_csv('data/clients.csv', parse_dates = ['joined'])
loans = pd.read_csv('data/loans.csv', parse_dates = ['loan_start', 'loan_end'])
payments = pd.read_csv('data/payments.csv', parse_dates = ['payment_date'])

clients

Unnamed: 0,client_id,joined,income,credit_score
0,46109,2002-04-16,172677,527
1,49545,2007-11-14,104564,770
2,41480,2013-03-11,122607,585
3,46180,2001-11-06,43851,562
4,25707,2006-10-06,211422,621
5,39505,2011-10-14,153873,610
6,32726,2006-05-01,235705,730
7,35089,2010-03-01,131176,771
8,35214,2003-08-08,95849,696
9,48177,2008-06-09,190632,769


In [3]:
es = ft.EntitySet(id = 'clients')
es = es.entity_from_dataframe(entity_id= 'clients', dataframe= clients, index = 'client_id', time_index='joined')
es = es.entity_from_dataframe(entity_id= 'loans', dataframe= loans, index = 'loan_id')
print(es)

Entityset: clients
  Entities:
    clients [Rows: 25, Columns: 4]
    loans [Rows: 443, Columns: 8]
  Relationships:
    No relationships


In [4]:
es = es.entity_from_dataframe(entity_id = 'payments',
                              dataframe = payments,
                              variable_types = {'missed': ft.variable_types.Categorical},
                              make_index = True,
                              index = 'payment_id',
                              time_index = 'payment_date')

In [5]:
print(es)
# es['payments']

Entityset: clients
  Entities:
    clients [Rows: 25, Columns: 4]
    loans [Rows: 443, Columns: 8]
    payments [Rows: 3456, Columns: 5]
  Relationships:
    No relationships


In [6]:
# Relationship between clients and previous loans
r_client_previous = ft.Relationship(es['clients']['client_id'],
                                    es['loans']['client_id'])

# Add the relationship to the entity set
es = es.add_relationship(r_client_previous)

# Relationship between previous loans and previous payments
r_payments = ft.Relationship(es['loans']['loan_id'],
                                      es['payments']['loan_id'])

# Add the relationship to the entity set
es = es.add_relationship(r_payments)

es

Entityset: clients
  Entities:
    clients [Rows: 25, Columns: 4]
    loans [Rows: 443, Columns: 8]
    payments [Rows: 3456, Columns: 5]
  Relationships:
    loans.client_id -> clients.client_id
    payments.loan_id -> loans.loan_id

In [27]:
# 결과로는 매개변수로 전달된 값에 기반한 새로운 client 데이터프레임이 생성된다.

features, feature_names = ft.dfs(entityset = es, target_entity = 'clients',
                                 agg_primitives = ['min', 'mean', 'max', 'percent_true', 'last'],
                                 trans_primitives = ['year', 'month'])  # 뒤에 subtract, devide가 더 있었는데

# features['MONTH(joined)']
features

  agg_primitives: ['percent_true']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible variable types for the primitive were found in the data.


Unnamed: 0_level_0,income,credit_score,LAST(loans.loan_amount),LAST(loans.loan_id),LAST(loans.loan_type),LAST(loans.rate),LAST(loans.repaid),MAX(loans.loan_amount),MAX(loans.rate),MAX(loans.repaid),...,MEAN(payments.loans.repaid),MIN(payments.loans.loan_amount),MIN(payments.loans.rate),MIN(payments.loans.repaid),MONTH(LAST(loans.loan_end)),MONTH(LAST(loans.loan_start)),MONTH(LAST(payments.payment_date)),YEAR(LAST(loans.loan_end)),YEAR(LAST(loans.loan_start)),YEAR(LAST(payments.payment_date))
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
42320,229481,563,1070,10769,credit,0.95,1,13887,6.74,1,...,0.583333,1070,0.38,0,2,11,4,2014,2011,2013
39384,191204,617,10703,11038,credit,9.01,1,14654,9.23,1,...,0.575342,1770,0.43,0,5,5,3,2004,2002,2015
26945,214516,806,3643,11434,home,0.13,0,14593,5.65,1,...,0.339286,653,0.13,0,12,3,7,2011,2010,2014
41472,152214,638,5554,11792,credit,4.6,1,13657,9.82,1,...,0.533333,986,0.01,0,8,2,3,2008,2006,2015
46180,43851,562,9221,10893,credit,0.88,1,14081,9.26,1,...,0.496644,1607,0.57,0,2,7,2,2004,2002,2015
46109,172677,527,559,10599,credit,4.15,1,14049,9.48,1,...,0.539007,559,0.5,0,11,2,6,2009,2008,2014
32885,58955,642,7914,11006,credit,5.25,1,14162,9.11,1,...,0.547826,3704,0.1,0,7,12,12,2011,2008,2014
29841,38354,523,9394,11617,cash,6.12,1,14837,6.76,1,...,0.56,2778,0.26,0,4,4,1,2013,2011,2015
38537,127183,643,10498,11197,credit,4.09,1,14804,8.01,1,...,0.585366,2396,0.35,0,12,5,9,2009,2007,2014
35214,95849,696,3872,10741,cash,0.57,1,14767,8.44,1,...,0.391566,667,0.16,0,12,2,11,2003,2002,2014


In [33]:
import pandas as pd
import numpy as np
import featuretools as ft

clients = pd.read_csv('data/clients.csv', parse_dates=['joined'])
loans = pd.read_csv('data/loans.csv', parse_dates=['loan_start', 'loan_end'])
payments = pd.read_csv('data/payments.csv', parse_dates=['payment_date'])

es = ft.EntitySet(id='loans')
es = es.entity_from_dataframe(entity_id='loans', dataframe=loans, index='loan_id')

features, feature_names = ft.dfs(entityset=es, target_entity='loans',
                                 trans_primitives=['cum_sum'])
features



Unnamed: 0_level_0,client_id,loan_type,loan_amount,repaid,rate,CUM_SUM(client_id),CUM_SUM(loan_amount),CUM_SUM(rate),CUM_SUM(repaid)
loan_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10243,46109,home,13672,0,2.15,46109,13672,2.15,0
10984,46109,credit,9794,0,1.25,92218,23466,3.40,0
10990,46109,home,12734,1,0.68,138327,36200,4.08,1
10596,46109,cash,12518,1,1.24,184436,48718,5.32,2
11415,46109,credit,14049,1,3.13,230545,62767,8.45,3
...,...,...,...,...,...,...,...,...,...
10330,26945,other,12963,0,2.46,17129820,3517267,1409.65,236
10248,26945,credit,1728,1,5.27,17156765,3518995,1414.92,237
10154,26945,other,9329,0,5.65,17183710,3528324,1420.57,237
10333,26945,home,4197,0,4.50,17210655,3532521,1425.07,237
