In [1]:
import pandas as pd
import featuretools as ft
from woodwork.logical_types import Categorical

In [2]:
# load data

df = pd.read_csv("retail.csv", parse_dates=["invoice_date"])

df.head()

Unnamed: 0,customer_id,invoice,invoice_date,stock_code,description,quantity,price
0,13085.0,489434,2009-12-01 07:45:00,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,6.95
1,13085.0,489434,2009-12-01 07:45:00,79323P,PINK CHERRY LIGHTS,12,6.75
2,13085.0,489434,2009-12-01 07:45:00,79323W,WHITE CHERRY LIGHTS,12,6.75
3,13085.0,489434,2009-12-01 07:45:00,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2.1
4,13085.0,489434,2009-12-01 07:45:00,21232,STRAWBERRY CERAMIC TRINKET BOX,24,1.25


In [3]:
# create and entity set

es = ft.EntitySet(id="data")

In [4]:
# Add the data to the entity

es = es.add_dataframe(
    dataframe=df,              # the dataframe with the data
    dataframe_name="data",     # unique name to associate with this dataframe
    index="rows",              # column name to index the items
    make_index=True,           # if true, create a new column with unique values
    time_index="invoice_date", # column containing time data
    logical_types={
        "customer_id": Categorical, # the id is numerical, but should be handled as categorical
    },
)

In [5]:
# Create a new dataframe with invoices
# indicating its relationship to the main data

es.normalize_dataframe(
    base_dataframe_name="data",     # Datarame name from which to split.
    new_dataframe_name="invoices",  # Name of the new dataframe.
    index="invoice",                # relationship will be created across this column.
    copy_columns=["customer_id"],   # columns to remove from base_dataframe and move to new dataframe.
)

Entityset: data
  DataFrames:
    data [Rows: 741301, Columns: 8]
    invoices [Rows: 40505, Columns: 3]
  Relationships:
    data.invoice -> invoices.invoice

In [6]:
es["invoices"].head()

Unnamed: 0,invoice,customer_id,first_data_time
489434,489434,13085.0,2009-12-01 07:45:00
489435,489435,13085.0,2009-12-01 07:46:00
489436,489436,13078.0,2009-12-01 09:06:00
489437,489437,15362.0,2009-12-01 09:08:00
489438,489438,18102.0,2009-12-01 09:24:00


In [7]:
# the date related features we want to extract

date_primitives = ["day", "year", "month", "weekday",
                   "days_in_month", "part_of_day",
                   "is_federal_holiday",
                   "hour", "minute"]

In [8]:
# Create datetime features

feature_matrix, feature_defs = ft.dfs(
    entityset=es,                       # the entity set
    target_dataframe_name="invoices",   # the dataframe for wich to create the feature
    agg_primitives=[],                  # we need an empty list to avoid returning the defo parameters
    trans_primitives=date_primitives,   # the date features to extract
)

# display name of created features
feature_defs

[<Feature: customer_id>,
 <Feature: DAY(first_data_time)>,
 <Feature: DAYS_IN_MONTH(first_data_time)>,
 <Feature: HOUR(first_data_time)>,
 <Feature: IS_FEDERAL_HOLIDAY(first_data_time)>,
 <Feature: MINUTE(first_data_time)>,
 <Feature: MONTH(first_data_time)>,
 <Feature: PART_OF_DAY(first_data_time)>,
 <Feature: WEEKDAY(first_data_time)>,
 <Feature: YEAR(first_data_time)>]

In [9]:
# dataframe with the new features

feature_matrix.head()

Unnamed: 0_level_0,customer_id,DAY(first_data_time),DAYS_IN_MONTH(first_data_time),HOUR(first_data_time),IS_FEDERAL_HOLIDAY(first_data_time),MINUTE(first_data_time),MONTH(first_data_time),PART_OF_DAY(first_data_time),WEEKDAY(first_data_time),YEAR(first_data_time)
invoice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
489434,13085.0,1,31,7,False,45,12,early morning,1,2009
489435,13085.0,1,31,7,False,46,12,early morning,1,2009
489436,13078.0,1,31,9,False,6,12,late morning,1,2009
489437,15362.0,1,31,9,False,8,12,late morning,1,2009
489438,18102.0,1,31,9,False,24,12,late morning,1,2009


In [10]:
from featuretools.primitives import DistanceToHoliday

In [11]:
distance_to_boxing_day = DistanceToHoliday(holiday="Boxing Day", country="UK")

In [12]:
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="invoices",
    agg_primitives=[],
    trans_primitives=[distance_to_boxing_day],
    verbose=True,    
)

# display name of created features
feature_defs

Built 2 features
Elapsed: 00:00 | Progress: 100%|██████████████████████████████████████████████████████████████████████████████████


[<Feature: customer_id>,
 <Feature: DISTANCE_TO_HOLIDAY(first_data_time, holiday=Boxing Day, country=UK)>]

In [13]:
feature_matrix.head()

Unnamed: 0_level_0,customer_id,"DISTANCE_TO_HOLIDAY(first_data_time, holiday=Boxing Day, country=UK)"
invoice,Unnamed: 1_level_1,Unnamed: 2_level_1
489434,13085.0,25.0
489435,13085.0,25.0
489436,13078.0,25.0
489437,15362.0,25.0
489438,18102.0,25.0


In [14]:
from featuretools.primitives.utils import HolidayUtil

In [15]:
holidayUtil = HolidayUtil("UK")

In [16]:
available_holidays = list(set(holidayUtil.federal_holidays.values()))

available_holidays

['Good Friday',
 'Boxing Day',
 "St. Patrick's Day [Northern Ireland]",
 "New Year's Day",
 'Golden Jubilee of Elizabeth II',
 'Millennium Celebrations',
 'Christmas Day',
 'Late Summer Bank Holiday [England/Wales/Northern Ireland]',
 'Platinum Jubilee of Elizabeth II',
 'New Year Holiday [Scotland]',
 'Battle of the Boyne [Northern Ireland]',
 'May Day',
 'Boxing Day (Observed)',
 'Silver Jubilee of Elizabeth II',
 'Summer Bank Holiday [Scotland]',
 'Christmas Day (Observed)',
 'Wedding of Charles and Diana',
 'Diamond Jubilee of Elizabeth II',
 "St. Patrick's Day [Northern Ireland] (Observed)",
 'Easter Monday [England/Wales/Northern Ireland]',
 'State Funeral of Queen Elizabeth II',
 'New Year Holiday [Scotland] (Observed)',
 'Wedding of William and Catherine',
 "St. Andrew's Day [Scotland]",
 "New Year Holiday [Scotland], New Year's Day (Observed)",
 'Spring Bank Holiday',
 "New Year's Day (Observed)"]