In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import featuretools as ft

In [2]:
# let's load the example data set from feature tools

data_dict = ft.demo.load_mock_customer()

data = data_dict["transactions"].merge(
    data_dict["sessions"]).merge(data_dict["customers"])

cols = ['customer_id',
        'transaction_id',
        'transaction_time',
        'amount',
        ]

data = data[cols]

data.head()

Unnamed: 0,customer_id,transaction_id,transaction_time,amount
0,2,298,2014-01-01 00:00:00,127.64
1,2,2,2014-01-01 00:01:05,109.48
2,2,308,2014-01-01 00:02:10,95.06
3,2,116,2014-01-01 00:03:15,78.92
4,2,371,2014-01-01 00:04:20,31.54


In [3]:
print('Number of customers: {}'.format(data['customer_id'].nunique()))
print('Number of transactions: {}'.format(data['transaction_id'].nunique()))

Number of customers: 5
Number of transactions: 500


In [4]:
data.dtypes

customer_id                  int64
transaction_id               int64
transaction_time    datetime64[ns]
amount                     float64
dtype: object

In [5]:
# in order for feature tools to work, we need to create
# entity sets

es = ft.EntitySet(id="customer_data")

# entity set with the transactions
es.entity_from_dataframe(entity_id='transactions',
                         dataframe=data,
                         index="transaction_id",
                         time_index='transaction_time')

# display the entity set
es

Entityset: customer_data
  Entities:
    transactions [Rows: 500, Columns: 4]
  Relationships:
    No relationships

In [6]:
# now we indicate that within the entity set
# there is another table, with customer data,
# each customer identified with a unique
# customer id

es.normalize_entity(base_entity_id="transactions",
                    new_entity_id="customers",
                    index="customer_id")

Entityset: customer_data
  Entities:
    transactions [Rows: 500, Columns: 4]
    customers [Rows: 5, Columns: 2]
  Relationships:
    transactions.customer_id -> customers.customer_id

## Creating new features from the existing data - without aggregations

In [7]:
# featuretools automatically creates more fearures from those present in the data set
# for every single transaction, that is without aggregation

# the names in the below list, indicate featuretools to:
# 1) Create a feature from the datetime variable signalling
# if the transaction occurred on a weekend
# 2) Determines the cumulative transaction amount,
# 3) Determine the cumulative number of transactions,
# 4) Creates a new feature indicating the time since the previous transaction

# all of this operations occur at a transaction level, that is, transaction
# after transaction

transf_operations = ['is_weekend', 'cum_sum', 'cum_count','time_since_previous']

# set up the dfs from featuretools to return the previous features
# it is important to leave agg_primitives as an empty list, so that featuretools does not
# aggregate the data at a customer level as well

feature_matrix, features = ft.dfs(entityset=es,
                      target_entity="transactions",
                      agg_primitives=[],
                      trans_primitives=transf_operations,
                      verbose=True)

feature_matrix.head()

Built 8 features
Elapsed: 00:00 | Progress: 100%|██████████████████████████████████████████████████████████████████


Unnamed: 0_level_0,customer_id,amount,IS_WEEKEND(transaction_time),CUM_SUM(amount),CUM_COUNT(customer_id),TIME_SINCE_PREVIOUS(transaction_time),customers.IS_WEEKEND(first_transactions_time),customers.TIME_SINCE_PREVIOUS(first_transactions_time)
transaction_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
298,2,127.64,False,127.64,1,,False,
2,2,109.48,False,237.12,2,65.0,False,
308,2,95.06,False,332.18,3,65.0,False,
116,2,78.92,False,411.1,4,65.0,False,
371,2,31.54,False,442.64,5,65.0,False,


In [8]:
# let's now extract information about date and time as well

# list with the operations to perform to extract new features from the 
# existing variables
operations = ['year', 'month', 'day', 'hour', 'minute', 'second',
             'is_weekend', 'cum_sum', 'time_since_previous']


# extract tge new features
feature_matrix, features = ft.dfs(entityset=es,
                      target_entity="transactions",
                      agg_primitives=[],
                      trans_primitives=None,
                      verbose=True)

feature_matrix.head()

Built 10 features
Elapsed: 00:00 | Progress: 100%|██████████████████████████████████████████████████████████████████


Unnamed: 0_level_0,customer_id,amount,DAY(transaction_time),YEAR(transaction_time),MONTH(transaction_time),WEEKDAY(transaction_time),customers.DAY(first_transactions_time),customers.YEAR(first_transactions_time),customers.MONTH(first_transactions_time),customers.WEEKDAY(first_transactions_time)
transaction_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
298,2,127.64,1,2014,1,2,1,2014,1,2
2,2,109.48,1,2014,1,2,1,2014,1,2
308,2,95.06,1,2014,1,2,1,2014,1,2
116,2,78.92,1,2014,1,2,1,2014,1,2
371,2,31.54,1,2014,1,2,1,2014,1,2


In [9]:
# original dataframe and derived feature_matrix should have
# same number of observations

data.shape, feature_matrix.shape

((500, 4), (500, 10))

In [10]:
# if we don't indicate the operations to perform to create new features, featuretools will perform
# the default operations, which can be found here:

# https://docs.featuretools.com/en/stable/generated/featuretools.dfs.html#featuretools.dfs

In [11]:
# if we had more than 1 numeric feature in our dataframe, we could create new feautres by addition
# or multiplication as we did in recipe Combining multiple features with statistical operations of Chapter 9,
# Applying Mathematical Computations to Features

# for code on how to do that check this link:
# https://stackoverflow.com/questions/55155371/how-to-use-featuretools-to-create-features-for-a-single-table-with-no-immediate/55172142#55172142

In [12]:
# for more details in the types of features that we can create
# check the supported operations with the code below

primitives = ft.list_primitives()
pd.options.display.max_colwidth = 500
primitives[primitives['type'] == 'transform']

Unnamed: 0,name,type,description
20,absolute,transform,Computes the absolute value of a number.
21,isin,transform,Determines whether a value is present in a provided list.
22,latitude,transform,Returns the first tuple value in a list of LatLong tuples.
23,cum_count,transform,Calculates the cumulative count.
24,and,transform,Element-wise logical AND of two lists.
25,not_equal,transform,Determines if values in one list are not equal to another list.
26,second,transform,Determines the seconds value of a datetime.
27,modulo_by_feature,transform,Return the modulo of a scalar by each element in the list.
28,divide_by_feature,transform,Divide a scalar by each value in the list.
29,greater_than,transform,Determines if values in one list are greater than another list.


## Creating new features from the existing data - with aggregations

These are the features we can create when we want a flattened view of our dataframe, that is one row per customer.

In [13]:
# differently from the previous case, here we want to aggregate
# the information at customer level, so we want a summary view of
# each customer

# first we aggregate only the existing variables
# that is the transaction amount

# we want the mean and maximum transaction amount per customer

feature_matrix, features = ft.dfs(entityset=es,
                      target_entity="customers",
                      agg_primitives=["mean", 'max'],
                      trans_primitives=[],
                      verbose=True)

# dataframe with the new features
feature_matrix

Built 2 features
Elapsed: 00:00 | Progress: 100%|██████████████████████████████████████████████████████████████████


Unnamed: 0_level_0,MEAN(transactions.amount),MAX(transactions.amount)
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,77.422366,146.81
5,80.375443,149.02
4,80.070459,149.95
1,71.631905,139.43
3,67.06043,149.15


In [14]:
# now we will create a new feature as in the first part of the 
# notebook, and more specifically, we will create the feature
# time_since_previous transaction

# and aggregate the new feature together with the pre-existing
# variable transaction amount, all in one go

feature_matrix, features = ft.dfs(entityset=es,
                      target_entity="customers",
                      agg_primitives=["mean", 'max'],
                      trans_primitives=['time_since_previous'],
                      verbose=True)

# dataframe with the new features
feature_matrix

Built 5 features
Elapsed: 00:00 | Progress: 100%|██████████████████████████████████████████████████████████████████


Unnamed: 0_level_0,MEAN(transactions.amount),MAX(transactions.amount),TIME_SINCE_PREVIOUS(first_transactions_time),MEAN(transactions.TIME_SINCE_PREVIOUS(transaction_time)),MAX(transactions.TIME_SINCE_PREVIOUS(transaction_time))
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,77.422366,146.81,,65.0,65.0
5,80.375443,149.02,1040.0,65.0,65.0
4,80.070459,149.95,650.0,65.0,65.0
1,71.631905,139.43,975.0,65.0,65.0
3,67.06043,149.15,3315.0,65.0,65.0


In [15]:
# note how the final dataframe contains as many rows as 
# different customers in the data set

data.shape, feature_matrix.shape

((500, 4), (5, 5))

In [16]:
# and now let's create 2 new features and aggregate both
# together with the transaction amount

feature_matrix, features = ft.dfs(entityset=es,
                      target_entity="customers",
                      agg_primitives=["mean", 'max'],
                      trans_primitives=['cum_sum','time_since_previous'],
                      verbose=True)

# dataframe with the new features
feature_matrix

Built 9 features
Elapsed: 00:00 | Progress: 100%|██████████████████████████████████████████████████████████████████


Unnamed: 0_level_0,MEAN(transactions.amount),MAX(transactions.amount),TIME_SINCE_PREVIOUS(first_transactions_time),MEAN(transactions.CUM_SUM(amount)),MEAN(transactions.TIME_SINCE_PREVIOUS(transaction_time)),MAX(transactions.CUM_SUM(amount)),MAX(transactions.TIME_SINCE_PREVIOUS(transaction_time)),CUM_SUM(MAX(transactions.amount)),CUM_SUM(MEAN(transactions.amount))
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,77.422366,146.81,,18794.182366,65.0,35101.22,65.0,146.81,77.422366
5,80.375443,149.02,1040.0,24258.206582,65.0,34296.39,65.0,295.83,157.797809
4,80.070459,149.95,650.0,12455.024495,65.0,23418.09,65.0,445.78,237.868267
1,71.631905,139.43,975.0,16566.784048,65.0,31291.89,65.0,585.21,309.500172
3,67.06043,149.15,3315.0,25387.640538,65.0,37539.86,65.0,734.36,376.560602


In [17]:
data.shape, feature_matrix.shape

((500, 4), (5, 9))

## Integrating user defined functions

In the remaining lines, instead of using Featuretools pre-coded functions, we will create our own.

In [18]:
from featuretools.primitives import make_trans_primitive, make_agg_primitive
from featuretools.variable_types import Numeric

from scipy.signal import find_peaks


def find_no_peaks(column):
    peaks, _ = find_peaks(column)
    return len(peaks)


def find_no_valleys(column):
    valleys, _ = find_peaks(1 / column)
    return len(valleys)


FindNoPeaks = make_agg_primitive(function=find_no_peaks,
                                 input_types=[Numeric],
                                 return_type=Numeric)

FindNoValleys = make_agg_primitive(function=find_no_valleys,
                                   input_types=[Numeric],
                                   return_type=Numeric)

In [19]:
# and now let's use our 2 new aggregation functions with mean and max

feature_matrix, features = ft.dfs(entityset=es,
                      target_entity="customers",
                      agg_primitives=[FindNoPeaks, FindNoValleys, 'Mean', 'Max'],
                      trans_primitives=[],
                      verbose=True)

# dataframe with the new features
feature_matrix

Built 4 features
Elapsed: 00:00 | Progress: 100%|██████████████████████████████████████████████████████████████████


Unnamed: 0_level_0,FIND_NO_PEAKS(transactions.amount),FIND_NO_VALLEYS(transactions.amount),MEAN(transactions.amount),MAX(transactions.amount)
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,28.0,29.0,77.422366,146.81
5,26.0,26.0,80.375443,149.02
4,37.0,38.0,80.070459,149.95
1,43.0,42.0,71.631905,139.43
3,29.0,29.0,67.06043,149.15
