In [1]:
import pandas as pd
import featuretools as ft
from woodwork.logical_types import Categorical

In [2]:
# Read the retail dataset, parsing invoice_date as datetimes

df = pd.read_csv(
    "retail.csv",
    parse_dates=["invoice_date"],
)

In [3]:
# create an entity set

es = ft.EntitySet(id="data")

In [4]:
# Add the data to the entity set

es = es.add_dataframe(
    dataframe=df,              # the dataframe with the data
    dataframe_name="data",     # unique name to associate with this dataframe
    index="rows",              # name of the column that indexes the rows
    make_index=True,           # if True, create the index column with unique values
    time_index="invoice_date", # column containing time data
    logical_types={
        "customer_id": Categorical, # the id is numerical, but should be handled as categorical
    },
)

In [5]:
# Create a new dataframe with invoices
# indicating its relationship to the main data

es.normalize_dataframe(
    base_dataframe_name="data",     # Dataframe name from which to split.
    new_dataframe_name="invoices",  # Name of the new dataframe.
    index="invoice",                # relationship will be created across this column.
    copy_columns=["customer_id"],   # columns to copy to the new dataframe while keeping them in base_dataframe.
)

Entityset: data
  DataFrames:
    data [Rows: 741301, Columns: 8]
    invoices [Rows: 40505, Columns: 3]
  Relationships:
    data.invoice -> invoices.invoice

In [6]:
# Create a new feature by multiplying price and quantity
# (the transaction amount); it is named "price * quantity".

feature_matrix, feature_defs = ft.dfs(
    entityset=es,                           # the entity set
    target_dataframe_name="data",           # the dataframe for which to create the features
    agg_primitives=[],                      # empty list, so the default aggregation primitives are not used
    trans_primitives=["multiply_numeric"],  # the operation that creates the new feature
    primitive_options={                     # restrict the primitive to the columns we want to multiply
        "multiply_numeric": {
            "include_columns": {"data": ["quantity", "price"]},
        },
    },
    ignore_dataframes=["invoices"],         # do not build features from the invoices dataframe
)

# display the definitions of the returned features
feature_defs

[<Feature: customer_id>,
 <Feature: invoice>,
 <Feature: stock_code>,
 <Feature: description>,
 <Feature: quantity>,
 <Feature: price>,
 <Feature: price * quantity>]

In [7]:
# preview the first rows of the feature matrix
feature_matrix.head()

Unnamed: 0_level_0,customer_id,invoice,stock_code,description,quantity,price,price * quantity
rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,13085.0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,6.95,83.4
1,13085.0,489434,79323P,PINK CHERRY LIGHTS,12,6.75,81.0
2,13085.0,489434,79323W,WHITE CHERRY LIGHTS,12,6.75,81.0
3,13085.0,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2.1,100.8
4,13085.0,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,1.25,30.0


## In relation to pandas

In [8]:
# Load the data from the CSV file, parsing invoice_date as datetimes

df = pd.read_csv(
    "retail.csv",
    parse_dates=["invoice_date"],
)

df.head()

Unnamed: 0,customer_id,invoice,invoice_date,stock_code,description,quantity,price
0,13085.0,489434,2009-12-01 07:45:00,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,6.95
1,13085.0,489434,2009-12-01 07:45:00,79323P,PINK CHERRY LIGHTS,12,6.75
2,13085.0,489434,2009-12-01 07:45:00,79323W,WHITE CHERRY LIGHTS,12,6.75
3,13085.0,489434,2009-12-01 07:45:00,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2.1
4,13085.0,489434,2009-12-01 07:45:00,21232,STRAWBERRY CERAMIC TRINKET BOX,24,1.25


In [9]:
# Add the amount of each row: quantity times price

df["amount"] = df["quantity"] * df["price"]

df.head()

Unnamed: 0,customer_id,invoice,invoice_date,stock_code,description,quantity,price,amount
0,13085.0,489434,2009-12-01 07:45:00,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,6.95,83.4
1,13085.0,489434,2009-12-01 07:45:00,79323P,PINK CHERRY LIGHTS,12,6.75,81.0
2,13085.0,489434,2009-12-01 07:45:00,79323W,WHITE CHERRY LIGHTS,12,6.75,81.0
3,13085.0,489434,2009-12-01 07:45:00,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2.1,100.8
4,13085.0,489434,2009-12-01 07:45:00,21232,STRAWBERRY CERAMIC TRINKET BOX,24,1.25,30.0
