In [1]:
import pandas as pd
import featuretools as ft
from woodwork.logical_types import Categorical, NaturalLanguage

In [2]:
# Load retail.csv into a DataFrame; invoice_date is parsed as a datetime
# column (it is used as the time index when the data is added to the entity set).

df = pd.read_csv("retail.csv", parse_dates=["invoice_date"])

In [3]:
# Create an entity set with id "data"; dataframes are added to it below.

es = ft.EntitySet(id="data")

In [4]:
# Add the data to the entity set under the name "data".
# The result of add_dataframe is assigned back to `es`, which the
# following cells keep using as the entity set.

es = es.add_dataframe(
    dataframe=df,              # the dataframe with the data
    dataframe_name="data",     # unique name to associate with this dataframe
    index="rows",              # name of the column that uniquely identifies each row
    make_index=True,           # if true, create a new index column ("rows") with unique values
    time_index="invoice_date", # column containing time data
    logical_types={
        "customer_id": Categorical, # the id is numerical, but should be handled as categorical
        "description": NaturalLanguage, # mark as free text so text primitives (e.g. num_words) can use it
    },
)

In [5]:
# Create a new dataframe with invoices (built from the values of the
# "invoice" column) and indicate its relationship to the main data.

es.normalize_dataframe(
    base_dataframe_name="data",     # Dataframe name from which to split.
    new_dataframe_name="invoices",  # Name of the new dataframe.
    index="invoice",                # The relationship is created across this column.
    copy_columns=["customer_id"],   # Columns to copy to the new dataframe; they also stay in the base dataframe.
)


Entityset: data
  DataFrames:
    data [Rows: 741301, Columns: 8]
    invoices [Rows: 40505, Columns: 3]
  Relationships:
    data.invoice -> invoices.invoice

In [6]:
# Create a new "customers" dataframe and indicate its
# relationship to the previous ("invoices") dataframe.

# now we work with customers

es.normalize_dataframe(
    base_dataframe_name="invoices",  # the dataframe created in the previous cell
    new_dataframe_name="customers",  # the name of the new dataframe
    index="customer_id",             # the column across which the relationship is created
)

# display the entity set: its dataframes and relationships
es

Entityset: data
  DataFrames:
    data [Rows: 741301, Columns: 8]
    invoices [Rows: 40505, Columns: 3]
    customers [Rows: 5410, Columns: 2]
  Relationships:
    data.invoice -> invoices.invoice
    invoices.customer_id -> customers.customer_id

In [7]:
# Aggregation primitives to apply together in a single dfs run.

agg_primitives = [
    "mean",
    "max",
    "min",
    "sum",
]

In [8]:
# create all features simultaneously

feature_matrix, feature_defs = ft.dfs(
    entityset=es,                                 # the entity set
    target_dataframe_name="customers",            # the dataframe for which to create the features
    agg_primitives=agg_primitives,                # the aggregation primitives
    trans_primitives=[],                          # empty list to override the default transform primitives
)

# display the names of the created features
feature_defs

[<Feature: MAX(data.price)>,
 <Feature: MAX(data.quantity)>,
 <Feature: MEAN(data.price)>,
 <Feature: MEAN(data.quantity)>,
 <Feature: MIN(data.price)>,
 <Feature: MIN(data.quantity)>,
 <Feature: SUM(data.price)>,
 <Feature: SUM(data.quantity)>,
 <Feature: MAX(invoices.MEAN(data.price))>,
 <Feature: MAX(invoices.MEAN(data.quantity))>,
 <Feature: MAX(invoices.MIN(data.price))>,
 <Feature: MAX(invoices.MIN(data.quantity))>,
 <Feature: MAX(invoices.SUM(data.price))>,
 <Feature: MAX(invoices.SUM(data.quantity))>,
 <Feature: MEAN(invoices.MAX(data.price))>,
 <Feature: MEAN(invoices.MAX(data.quantity))>,
 <Feature: MEAN(invoices.MEAN(data.price))>,
 <Feature: MEAN(invoices.MEAN(data.quantity))>,
 <Feature: MEAN(invoices.MIN(data.price))>,
 <Feature: MEAN(invoices.MIN(data.quantity))>,
 <Feature: MEAN(invoices.SUM(data.price))>,
 <Feature: MEAN(invoices.SUM(data.quantity))>,
 <Feature: MIN(invoices.MAX(data.price))>,
 <Feature: MIN(invoices.MAX(data.quantity))>,
 <Feature: MIN(invoices.MEAN(d

In [9]:
# preview the first rows of the feature matrix (indexed by customer_id)

feature_matrix.head()

Unnamed: 0_level_0,MAX(data.price),MAX(data.quantity),MEAN(data.price),MEAN(data.quantity),MIN(data.price),MIN(data.quantity),SUM(data.price),SUM(data.quantity),MAX(invoices.MEAN(data.price)),MAX(invoices.MEAN(data.quantity)),...,MIN(invoices.MEAN(data.price)),MIN(invoices.MEAN(data.quantity)),MIN(invoices.SUM(data.price)),MIN(invoices.SUM(data.quantity)),SUM(invoices.MAX(data.price)),SUM(invoices.MAX(data.quantity)),SUM(invoices.MEAN(data.price)),SUM(invoices.MEAN(data.quantity)),SUM(invoices.MIN(data.price)),SUM(invoices.MIN(data.quantity))
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13085.0,830.12,48.0,12.413587,9.076087,0.55,-48.0,1142.05,835.0,830.12,20.75,...,1.828571,-15.428571,10.5,-108.0,886.42,209.0,860.134136,78.982882,839.97,-3.0
13078.0,12.75,300.0,3.961193,14.061988,0.19,-14.0,3386.82,12023.0,12.75,61.333333,...,0.19,-14.0,0.19,-15.0,702.52,3696.0,408.733369,843.462091,207.15,45.0
15362.0,9.95,48.0,3.612,9.2,0.21,1.0,144.48,368.0,3.628261,13.117647,...,3.59,6.304348,61.03,145.0,18.9,60.0,7.218261,19.421995,0.86,3.0
18102.0,3580.8,1008.0,10.831367,175.196629,0.27,-324.0,11567.9,187110.0,3580.8,624.0,...,0.48,-324.0,0.48,-432.0,8432.42,35654.0,8048.487489,26429.403963,7858.33,17709.0
18087.0,852.8,3906.0,11.971368,78.189474,0.36,-96.0,1137.28,7428.0,852.8,3906.0,...,0.82,-96.0,0.82,-288.0,960.41,4648.0,912.976463,4374.814472,883.9,4162.0


In [10]:
# Transform primitives, grouped by the kind of column they act on.
date_primitives = ["month", "weekday"]  # for datetime columns
text_primitives = ["num_words"]         # for natural-language columns

# Single combined list to hand to dfs (date primitives first, then text).
trans_primitives = [*date_primitives, *text_primitives]

In [11]:
# Rebind agg_primitives to the mean only for the next dfs run
# (this replaces the four-primitive list defined earlier).
agg_primitives = ["mean"]

In [12]:
# Combine transform and aggregation primitives in one dfs run.
# Note: this overwrites feature_matrix and feature_defs from the earlier run.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,                                 # the entity set
    target_dataframe_name="customers",            # the dataframe for which to create the features
    agg_primitives=agg_primitives,                # the aggregation primitives
    trans_primitives=trans_primitives,            # the transform primitives (date and text)
    max_depth=3,                                  # maximum allowed depth of the created features
)

# display the names of the created features
feature_defs

[<Feature: MEAN(data.price)>,
 <Feature: MEAN(data.quantity)>,
 <Feature: MONTH(first_invoices_time)>,
 <Feature: WEEKDAY(first_invoices_time)>,
 <Feature: MEAN(invoices.MEAN(data.price))>,
 <Feature: MEAN(invoices.MEAN(data.quantity))>,
 <Feature: MEAN(data.NUM_WORDS(description))>,
 <Feature: MEAN(invoices.MEAN(data.NUM_WORDS(description)))>]

In [13]:
# preview the first rows of the new feature matrix
feature_matrix.head()

Unnamed: 0_level_0,MEAN(data.price),MEAN(data.quantity),MONTH(first_invoices_time),WEEKDAY(first_invoices_time),MEAN(invoices.MEAN(data.price)),MEAN(invoices.MEAN(data.quantity)),MEAN(data.NUM_WORDS(description)),MEAN(invoices.MEAN(data.NUM_WORDS(description)))
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
13085.0,12.413587,9.076087,12,1,86.013414,7.898288,4.771739,4.355024
13078.0,3.961193,14.061988,12,1,4.302457,8.878548,4.505263,4.524268
15362.0,3.612,9.2,12,1,3.60913,9.710997,4.525,4.556266
18102.0,10.831367,175.196629,12,1,52.604493,172.741202,4.617978,4.45912
18087.0,11.971368,78.189474,12,1,43.47507,208.324499,4.452632,4.175557
