In [1]:
import pandas as pd
import featuretools as ft
from woodwork.logical_types import Categorical

In [2]:
# let's load the data again

df = pd.read_csv("retail.csv", parse_dates=["invoice_date"])

In [3]:
# create and entity set

es = ft.EntitySet(id="data")

In [4]:
# Add the data to the entity

es = es.add_dataframe(
    dataframe=df,              # the dataframe with the data
    dataframe_name="data",     # unique name to associate with this dataframe
    index="rows",              # column name to index the items
    make_index=True,           # if true, create a new column with unique values
    time_index="invoice_date", # column containing time data
    logical_types={
        "customer_id": Categorical, # the id is numerical, but should be handled as categorical
    },
)

In [5]:
# Create a new dataframe with invoices
# indicating its relationship to the main data

es.normalize_dataframe(
    base_dataframe_name="data",     # Datarame name from which to split.
    new_dataframe_name="invoices",  # Name of the new dataframe.
    index="invoice",                # relationship will be created across this column.
    copy_columns=["customer_id"],   # columns to remove from base_dataframe and move to new dataframe.
)

Entityset: data
  DataFrames:
    data [Rows: 741301, Columns: 8]
    invoices [Rows: 40505, Columns: 3]
  Relationships:
    data.invoice -> invoices.invoice

In [6]:
# cumulative transform primitives

cum_primitives = ["cum_sum", "cum_max", "diff", "time_since_previous"]

general_primitives = ["sine"]

In [7]:
# create all features simultaneously

feature_matrix, feature_defs = ft.dfs(
    entityset=es,                                # the entity set
    target_dataframe_name="data",                # the dataframe for wich to create the features
    agg_primitives=[],                           # empty list to avoid returning the defo parameters
    trans_primitives=general_primitives,         # empty list to avoid returning the defo parameters
    groupby_trans_primitives = cum_primitives,   # the operations to perform by invoice
    ignore_dataframes = ["invoices"],            # columns to ignore when creating features
)

# display name of created features
feature_defs

[<Feature: customer_id>,
 <Feature: invoice>,
 <Feature: stock_code>,
 <Feature: description>,
 <Feature: quantity>,
 <Feature: price>,
 <Feature: SINE(price)>,
 <Feature: SINE(quantity)>,
 <Feature: CUM_MAX(price) by invoice>,
 <Feature: CUM_MAX(quantity) by invoice>,
 <Feature: CUM_SUM(price) by invoice>,
 <Feature: CUM_SUM(quantity) by invoice>,
 <Feature: DIFF(price) by invoice>,
 <Feature: DIFF(quantity) by invoice>,
 <Feature: TIME_SINCE_PREVIOUS(invoice_date) by invoice>]

In [8]:
# Resulting datatable with original 
# and new features

feature_matrix.head()

Unnamed: 0_level_0,customer_id,invoice,stock_code,description,quantity,price,SINE(price),SINE(quantity),CUM_MAX(price) by invoice,CUM_MAX(quantity) by invoice,CUM_SUM(price) by invoice,CUM_SUM(quantity) by invoice,DIFF(price) by invoice,DIFF(quantity) by invoice,TIME_SINCE_PREVIOUS(invoice_date) by invoice
rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,13085.0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,6.95,0.618486,-0.536573,6.95,12.0,6.95,12.0,,,
1,13085.0,489434,79323P,PINK CHERRY LIGHTS,12,6.75,0.450044,-0.536573,6.95,12.0,13.7,24.0,-0.2,0.0,0.0
2,13085.0,489434,79323W,WHITE CHERRY LIGHTS,12,6.75,0.450044,-0.536573,6.95,12.0,20.45,36.0,0.0,0.0,0.0
3,13085.0,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2.1,0.863209,-0.768255,6.95,48.0,22.55,84.0,-4.65,36.0,0.0
4,13085.0,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,1.25,0.948985,-0.905578,6.95,48.0,23.8,108.0,-0.85,-24.0,0.0


In [9]:
# inspect values for 1 invoice

feature_matrix[feature_matrix["invoice"] == "489434"]

Unnamed: 0_level_0,customer_id,invoice,stock_code,description,quantity,price,SINE(price),SINE(quantity),CUM_MAX(price) by invoice,CUM_MAX(quantity) by invoice,CUM_SUM(price) by invoice,CUM_SUM(quantity) by invoice,DIFF(price) by invoice,DIFF(quantity) by invoice,TIME_SINCE_PREVIOUS(invoice_date) by invoice
rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,13085.0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,6.95,0.618486,-0.536573,6.95,12.0,6.95,12.0,,,
1,13085.0,489434,79323P,PINK CHERRY LIGHTS,12,6.75,0.450044,-0.536573,6.95,12.0,13.7,24.0,-0.2,0.0,0.0
2,13085.0,489434,79323W,WHITE CHERRY LIGHTS,12,6.75,0.450044,-0.536573,6.95,12.0,20.45,36.0,0.0,0.0,0.0
3,13085.0,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2.1,0.863209,-0.768255,6.95,48.0,22.55,84.0,-4.65,36.0,0.0
4,13085.0,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,1.25,0.948985,-0.905578,6.95,48.0,23.8,108.0,-0.85,-24.0,0.0
5,13085.0,489434,22064,PINK DOUGHNUT TRINKET POT,24,1.65,0.996865,-0.905578,6.95,48.0,25.45,132.0,0.4,0.0,0.0
6,13085.0,489434,21871,SAVE THE PLANET MUG,24,1.25,0.948985,-0.905578,6.95,48.0,26.7,156.0,-0.4,0.0,0.0
7,13085.0,489434,21523,FANCY FONT HOME SWEET HOME DOORMAT,10,5.95,-0.327055,-0.544021,6.95,48.0,32.65,166.0,4.7,-14.0,0.0


In [10]:
# inspect values for another invoice

feature_matrix[feature_matrix["invoice"] == "489435"]

Unnamed: 0_level_0,customer_id,invoice,stock_code,description,quantity,price,SINE(price),SINE(quantity),CUM_MAX(price) by invoice,CUM_MAX(quantity) by invoice,CUM_SUM(price) by invoice,CUM_SUM(quantity) by invoice,DIFF(price) by invoice,DIFF(quantity) by invoice,TIME_SINCE_PREVIOUS(invoice_date) by invoice
rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
8,13085.0,489435,22350,CAT BOWL,12,2.55,0.557684,-0.536573,2.55,12.0,2.55,12.0,,,
9,13085.0,489435,22349,"DOG BOWL , CHASING BALL DESIGN",12,3.75,-0.571561,-0.536573,3.75,12.0,6.3,24.0,1.2,0.0,0.0
10,13085.0,489435,22195,HEART MEASURING SPOONS LARGE,24,1.65,0.996865,-0.905578,3.75,24.0,7.95,48.0,-2.1,12.0,0.0
11,13085.0,489435,22353,LUNCHBOX WITH CUTLERY FAIRY CAKES,12,2.55,0.557684,-0.536573,3.75,24.0,10.5,60.0,0.9,-12.0,0.0


## In relation to pandas

In [11]:
# load data

df = pd.read_csv("retail.csv", parse_dates=["invoice_date"])

df.head()

Unnamed: 0,customer_id,invoice,invoice_date,stock_code,description,quantity,price
0,13085.0,489434,2009-12-01 07:45:00,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,6.95
1,13085.0,489434,2009-12-01 07:45:00,79323P,PINK CHERRY LIGHTS,12,6.75
2,13085.0,489434,2009-12-01 07:45:00,79323W,WHITE CHERRY LIGHTS,12,6.75
3,13085.0,489434,2009-12-01 07:45:00,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2.1
4,13085.0,489434,2009-12-01 07:45:00,21232,STRAWBERRY CERAMIC TRINKET BOX,24,1.25


In [12]:
# Cumulative sum at invoice level

# numerical variables
numeric_vars = ["quantity", "price"]

# new variable names
new_names = [f"{var}_cumsum" for var in numeric_vars]

new_names

['quantity_cumsum', 'price_cumsum']

In [13]:
# get cumulative sum
df[new_names] = df.groupby("invoice")[numeric_vars].cumsum()

# display input variables and new features
df[numeric_vars + new_names].head()

Unnamed: 0,quantity,price,quantity_cumsum,price_cumsum
0,12,6.95,12,6.95
1,12,6.75,24,13.7
2,12,6.75,36,20.45
3,48,2.1,84,22.55
4,24,1.25,108,23.8


In [14]:
# inspect values for 1 invoice

df[df["invoice"] == "489434" ][numeric_vars + new_names]

Unnamed: 0,quantity,price,quantity_cumsum,price_cumsum
0,12,6.95,12,6.95
1,12,6.75,24,13.7
2,12,6.75,36,20.45
3,48,2.1,84,22.55
4,24,1.25,108,23.8
5,24,1.65,132,25.45
6,24,1.25,156,26.7
7,10,5.95,166,32.65


In [15]:
# inspect values in a different invoice

df[df["invoice"] == "489435" ][numeric_vars + new_names]

Unnamed: 0,quantity,price,quantity_cumsum,price_cumsum
8,12,2.55,12,2.55
9,12,3.75,24,6.3
10,24,1.65,48,7.95
11,12,2.55,60,10.5


In [16]:
# Create features with multiple functions

# functions
func = ["cumsum", "cummax", "diff"]

# new variable names
new_names = [f"{var}_{function}" for function in func for var in numeric_vars]

new_names

['quantity_cumsum',
 'price_cumsum',
 'quantity_cummax',
 'price_cummax',
 'quantity_diff',
 'price_diff']

In [17]:
df[new_names] = df.groupby("invoice")[numeric_vars].agg(func)

df[new_names].head()

Unnamed: 0,quantity_cumsum,price_cumsum,quantity_cummax,price_cummax,quantity_diff,price_diff
0,12,12,,6.95,6.95,
1,24,12,0.0,13.7,6.95,-0.2
2,36,12,0.0,20.45,6.95,0.0
3,84,48,36.0,22.55,6.95,-4.65
4,108,48,-24.0,23.8,6.95,-0.85


In [18]:
# general transformations

new_names = [f"{var}_{function}" for function in ["sin", "cos"] for var in numeric_vars]

new_names

['quantity_sin', 'price_sin', 'quantity_cos', 'price_cos']

In [19]:
import numpy as np

In [20]:
df[new_names] = df[numeric_vars].agg([np.sin, np.cos])

df[new_names].head()

Unnamed: 0,quantity_sin,price_sin,quantity_cos,price_cos
0,-0.536573,0.843854,0.618486,0.785796
1,-0.536573,0.843854,0.450044,0.893006
2,-0.536573,0.843854,0.450044,0.893006
3,-0.768255,-0.640144,0.863209,-0.504846
4,-0.905578,0.424179,0.948985,0.315322
