In [1]:
 !pip install featuretools

Collecting featuretools
  Downloading featuretools-1.31.0-py3-none-any.whl.metadata (15 kB)
Collecting holidays>=0.17 (from featuretools)
  Downloading holidays-0.89-py3-none-any.whl.metadata (50 kB)
Collecting woodwork>=0.28.0 (from featuretools)
  Downloading woodwork-0.31.0-py3-none-any.whl.metadata (10 kB)
Collecting importlib-resources>=5.10.0 (from woodwork>=0.28.0->featuretools)
  Downloading importlib_resources-6.5.2-py3-none-any.whl.metadata (3.9 kB)
Downloading featuretools-1.31.0-py3-none-any.whl (587 kB)
   ---------------------------------------- 0.0/587.9 kB ? eta -:--:--
   ---------------------------------------- 0.0/587.9 kB ? eta -:--:--
   ----------------- ---------------------- 262.1/587.9 kB ? eta -:--:--
   ---------------------------------------- 587.9/587.9 kB 1.3 MB/s  0:00:00
Downloading holidays-0.89-py3-none-any.whl (1.3 MB)
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
   ------- -------------------------------- 0.3/1.3 MB ? eta -:--

In [5]:
import pandas as pd
import numpy as np

import featuretools as ft

import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 22
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [4]:

app_train = pd.read_csv('./data/application_train.csv').sort_values('SK_ID_CURR').reset_index(drop = True).loc[:1000, :]
app_test = pd.read_csv('./data/application_test.csv').sort_values('SK_ID_CURR').reset_index(drop = True).loc[:1000, :]
bureau = pd.read_csv('./data/bureau.csv').sort_values(['SK_ID_CURR', 'SK_ID_BUREAU']).reset_index(drop = True).loc[:1000, :]
bureau_balance = pd.read_csv('./data/bureau_balance.csv').sort_values('SK_ID_BUREAU').reset_index(drop = True).loc[:1000, :]
cash = pd.read_csv('./data/POS_CASH_balance.csv').sort_values(['SK_ID_CURR', 'SK_ID_PREV']).reset_index(drop = True).loc[:1000, :]
credit = pd.read_csv('./data/credit_card_balance.csv').sort_values(['SK_ID_CURR', 'SK_ID_PREV']).reset_index(drop = True).loc[:1000, :]
previous = pd.read_csv('./data/previous_application.csv').sort_values(['SK_ID_CURR', 'SK_ID_PREV']).reset_index(drop = True).loc[:1000, :]
installments = pd.read_csv('./data/installments_payments.csv').sort_values(['SK_ID_CURR', 'SK_ID_PREV']).reset_index(drop = True).loc[:1000, :]

In [34]:
app_train['set'] = 'train'
app_test['set'] = 'test'
app_test['TARGET'] = np.nan

app = pd.concat([app_train, app_test], ignore_index=True)

In [35]:
es = ft.EntitySet(id = 'clients')


In [37]:
import featuretools as ft

es = ft.EntitySet(id='home_credit')

es = es.add_dataframe(
    dataframe_name='app',
    dataframe=app,
    index='SK_ID_CURR'
)

es = es.add_dataframe(
    dataframe_name='bureau',
    dataframe=bureau,
    index='SK_ID_BUREAU'
)

es = es.add_dataframe(
    dataframe_name='previous',
    dataframe=previous,
    index='SK_ID_PREV'
)

es = es.add_dataframe(
    dataframe_name='bureau_balance',
    dataframe=bureau_balance,
    index='bureaubalance_index',
    make_index=True
)

es = es.add_dataframe(
    dataframe_name='cash',
    dataframe=cash,
    index='cash_index',
    make_index=True
)

es = es.add_dataframe(
    dataframe_name='installments',
    dataframe=installments,
    index='installments_index',
    make_index=True
)

es = es.add_dataframe(
    dataframe_name='credit',
    dataframe=credit,
    index='credit_index',
    make_index=True
)


In [38]:
print('Parent: app, Parent Variable: SK_ID_CURR\n\n', app.iloc[:, 111:115].head())
print('\nChild: bureau, Child Variable: SK_ID_CURR\n\n', bureau.iloc[10:30, :4].head())

Parent: app, Parent Variable: SK_ID_CURR

         FLAG_DOCUMENT_17  FLAG_DOCUMENT_18  FLAG_DOCUMENT_19  FLAG_DOCUMENT_20
100002                 0                 0                 0                 0
100003                 0                 0                 0                 0
100004                 0                 0                 0                 0
100006                 0                 0                 0                 0
100007                 0                 0                 0                 0

Child: bureau, Child Variable: SK_ID_CURR

          SK_ID_CURR  SK_ID_BUREAU CREDIT_ACTIVE CREDIT_CURRENCY
6158905      100002       6158905        Closed      currency 1
6158906      100002       6158906        Closed      currency 1
6158907      100002       6158907        Closed      currency 1
6158908      100002       6158908        Closed      currency 1
6158909      100002       6158909        Active      currency 1


In [41]:

relationships = [
    ft.Relationship(
        es,
        parent_dataframe_name='app',
        parent_column_name='SK_ID_CURR',
        child_dataframe_name='bureau',
        child_column_name='SK_ID_CURR'
    ),

    ft.Relationship(
        es,
        parent_dataframe_name='bureau',
        parent_column_name='SK_ID_BUREAU',
        child_dataframe_name='bureau_balance',
        child_column_name='SK_ID_BUREAU'
    ),

    ft.Relationship(
        es,
        parent_dataframe_name='app',
        parent_column_name='SK_ID_CURR',
        child_dataframe_name='previous',
        child_column_name='SK_ID_CURR'
    ),

    ft.Relationship(
        es,
        parent_dataframe_name='previous',
        parent_column_name='SK_ID_PREV',
        child_dataframe_name='cash',
        child_column_name='SK_ID_PREV'
    ),

    ft.Relationship(
        es,
        parent_dataframe_name='previous',
        parent_column_name='SK_ID_PREV',
        child_dataframe_name='installments',
        child_column_name='SK_ID_PREV'
    ),

    ft.Relationship(
        es,
        parent_dataframe_name='previous',
        parent_column_name='SK_ID_PREV',
        child_dataframe_name='credit',
        child_column_name='SK_ID_PREV'
    )
]

es = es.add_relationships(relationships)


In [40]:
es = es.add_relationships([r_app_bureau, r_bureau_balance, r_app_previous,
                           r_previous_cash, r_previous_installments, r_previous_credit])
es

NameError: name 'r_app_bureau' is not defined

In [None]:
primitives = ft.list_primitives()
pd.options.display.max_colwidth = 100
primitives[primitives['type'] == 'aggregation'].head(10)

In [None]:
primitives[primitives['type'] == 'transform'].head(10)


In [None]:
default_agg_primitives =  ["sum", "std", "max", "skew", "min", "mean", "count", "percent_true", "num_unique", "mode"]
default_trans_primitives =  ["day", "year", "month", "weekday", "haversine", "numwords", "characters"]

feature_names = ft.dfs(entityset = es, target_entity = 'app',
                       trans_primitives = default_trans_primitives,
                       agg_primitives=default_agg_primitives, 
                       max_depth = 2, features_only=True)

print('%d Total Features' % len(feature_names))

In [7]:
feature_matrix, feature_names = ft.dfs(entityset = es, target_entity = 'app',
                                       trans_primitives = default_trans_primitives,
                                       agg_primitives=default_agg_primitives, 
                                        max_depth = 2, features_only=False, verbose = True)

pd.options.display.max_columns = 1700
feature_matrix.head(10)

NameError: name 'es' is not defined

In [14]:
feature_names[-20:]


NameError: name 'feature_names' is not defined

In [15]:
feature_matrix_spec, feature_names_spec = ft.dfs(entityset = es, target_entity = 'app',  
                                                 agg_primitives = ['sum', 'count', 'min', 'max', 'mean', 'mode'], 
                                                 max_depth = 2, features_only = False, verbose = True)

TypeError: dfs() got an unexpected keyword argument 'target_entity'

In [16]:
pd.options.display.max_columns = 1000
feature_matrix_spec.head(10)

NameError: name 'feature_matrix_spec' is not defined

In [17]:
correlations = pd.read_csv('../input/home-credit-default-risk-feature-tools/correlations_spec.csv', index_col = 0)
correlations.index.name = 'Variable'
correlations.head()

FileNotFoundError: [Errno 2] No such file or directory: '../input/home-credit-default-risk-feature-tools/correlations_spec.csv'

In [18]:
correlations_target = correlations.sort_values('TARGET')['TARGET']
correlations_target.head()

NameError: name 'correlations' is not defined

In [19]:
correlations_target.dropna().tail()


NameError: name 'correlations_target' is not defined

In [20]:
features_sample = pd.read_csv('../input/home-credit-default-risk-feature-tools/feature_matrix.csv', nrows = 20000)
features_sample = features_sample[features_sample['set'] == 'train']
features_sample.head()

FileNotFoundError: [Errno 2] No such file or directory: '../input/home-credit-default-risk-feature-tools/feature_matrix.csv'

In [21]:
def kde_target_plot(df, feature):
    """Kernel density estimate plot of a feature colored
    by value of the target."""
    
    # Need to reset index for loc to workBU
    df = df.reset_index()
    plt.figure(figsize = (10, 6))
    plt.style.use('fivethirtyeight')
    
    # plot repaid loans
    sns.kdeplot(df.loc[df['TARGET'] == 0, feature], label = 'target == 0')
    # plot loans that were not repaid
    sns.kdeplot(df.loc[df['TARGET'] == 1, feature], label = 'target == 1')
    
    # Label the plots
    plt.title('Distribution of Feature by Target Value')
    plt.xlabel('%s' % feature); plt.ylabel('Density');
    plt.show()

In [22]:
kde_target_plot(features_sample, feature = 'MAX(previous_app.MEAN(credit.CNT_DRAWINGS_ATM_CURRENT))')

NameError: name 'features_sample' is not defined

In [23]:
threshold = 0.9

correlated_pairs = {}

# Iterate through the columns
for col in correlations:
    # Find correlations above the threshold
    above_threshold_vars = [x for x in list(correlations.index[correlations[col] > threshold]) if x != col]
    correlated_pairs[col] = above_threshold_vars

NameError: name 'correlations' is not defined

In [24]:
correlated_pairs['MEAN(credit.AMT_PAYMENT_TOTAL_CURRENT)']


KeyError: 'MEAN(credit.AMT_PAYMENT_TOTAL_CURRENT)'

In [25]:
correlations['MEAN(credit.AMT_PAYMENT_TOTAL_CURRENT)'].sort_values(ascending=False).head()


NameError: name 'correlations' is not defined

In [26]:
plt.plot(features_sample['MEAN(credit.AMT_PAYMENT_TOTAL_CURRENT)'], features_sample['MEAN(previous_app.MEAN(credit.AMT_PAYMENT_CURRENT))'], 'bo')
plt.title('Highly Correlated Features');

NameError: name 'features_sample' is not defined

In [27]:
fi = pd.read_csv('../input/home-credit-default-risk-feature-tools/spec_feature_importances_ohe.csv', index_col = 0)
fi = fi.sort_values('importance', ascending = False)
fi.head(15)

FileNotFoundError: [Errno 2] No such file or directory: '../input/home-credit-default-risk-feature-tools/spec_feature_importances_ohe.csv'

In [28]:
kde_target_plot(features_sample, feature = 'MAX(bureau.DAYS_CREDIT)')


NameError: name 'features_sample' is not defined

In [29]:
original_features = list(pd.get_dummies(app).columns)

created_features = []

# Iterate through the top 100 features
for feature in fi['feature'][:100]:
    if feature not in original_features:
        created_features.append(feature)
        
print('%d of the top 100 features were made by featuretools' % len(created_features))

NameError: name 'app' is not defined

In [30]:
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 22

def plot_feature_importances(df):
    
    
    # Sort features according to importance
    df = df.sort_values('importance', ascending = False).reset_index()
    
    # Normalize the feature importances to add up to one
    df['importance_normalized'] = df['importance'] / df['importance'].sum()

    # Make a horizontal bar chart of feature importances
    plt.figure(figsize = (14, 10))
    ax = plt.subplot()
    
    # Need to reverse the index to plot most important on top
    ax.barh(list(reversed(list(df.index[:15]))), 
            df['importance_normalized'].head(15), 
            align = 'center', edgecolor = 'k')
    
    # Set the yticks and labels
    ax.set_yticks(list(reversed(list(df.index[:15]))))
    ax.set_yticklabels(df['feature'].head(15))
    
    # Plot labeling
    plt.xlabel('Normalized Importance'); plt.title('Feature Importances')
    plt.show()
    
    return df

In [31]:
print('There are %d features with 0 importance' % sum(fi['importance'] == 0.0))


NameError: name 'fi' is not defined

In [32]:
from featuretools import selection

feature_matrix2 = selection.remove_low_information_features(feature_matrix)

print('Removed %d features' % (feature_matrix.shape[1]- feature_matrix2.shape[1])

_IncompleteInputError: incomplete input (2590107320.py, line 5)

In [33]:
train = feature_matrix2[feature_matrix2['set'] == 'train']
test = feature_matrix2[feature_matrix2['set'] == 'test']

train = pd.get_dummies(train)
test = pd.get_dummies(test)

train, test = train.align(test, join = 'inner', axis = 1)
test = test.drop(columns = ['TARGET'])

print('Final Training Shape: ', train.shape)
print('Final Testing Shape: ', test.shape)

NameError: name 'feature_matrix2' is not defined