In [1]:
import pandas as pd
import featuretools as ft
from woodwork.logical_types import Categorical, NaturalLanguage

In [2]:
# Load the retail transactions; invoice_date is parsed as datetime on read

df = pd.read_csv("retail.csv", parse_dates=["invoice_date"])

# Preview the first rows
df.head()

Unnamed: 0,customer_id,invoice,invoice_date,stock_code,description,quantity,price
0,13085.0,489434,2009-12-01 07:45:00,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,6.95
1,13085.0,489434,2009-12-01 07:45:00,79323P,PINK CHERRY LIGHTS,12,6.75
2,13085.0,489434,2009-12-01 07:45:00,79323W,WHITE CHERRY LIGHTS,12,6.75
3,13085.0,489434,2009-12-01 07:45:00,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2.1
4,13085.0,489434,2009-12-01 07:45:00,21232,STRAWBERRY CERAMIC TRINKET BOX,24,1.25


In [3]:
# Create an entity set (no dataframes added yet)

es = ft.EntitySet(id="data")

In [4]:
# Add the data to the entity set

es = es.add_dataframe(
    dataframe=df,              # the dataframe with the data
    dataframe_name="data",     # unique name to associate with this dataframe
    index="rows",              # name of the column that uniquely identifies each row
    make_index=True,           # if True, create the index column ("rows") with unique values
    time_index="invoice_date", # column containing time data
    logical_types={
        "customer_id": Categorical, # the id is numerical, but should be handled as categorical
        "description": NaturalLanguage, # typed as text so that text primitives can be applied to it
    },
)

In [5]:
# Create a new dataframe of invoices,
# related to the main data through the invoice column

es.normalize_dataframe(
    base_dataframe_name="data",     # Dataframe name from which to split.
    new_dataframe_name="invoices",  # Name of the new dataframe.
    index="invoice",                # relationship will be created across this column.
    copy_columns=["customer_id"],   # columns to copy to the new dataframe; they also stay in the base dataframe.
)

Entityset: data
  DataFrames:
    data [Rows: 741301, Columns: 8]
    invoices [Rows: 40505, Columns: 3]
  Relationships:
    data.invoice -> invoices.invoice

In [6]:
# the text related features we want to extract

text_primitives = ["num_words", "num_characters"]

In [7]:
# Create text features

feature_matrix, feature_defs = ft.dfs(
    entityset=es,                       # the entity set
    target_dataframe_name="data",       # the dataframe for which to create the features
    agg_primitives=[],                  # empty list so dfs does not fall back to its default aggregation primitives
    trans_primitives=text_primitives,   # the text features to extract
    ignore_dataframes=["invoices"],     # do not use the invoices dataframe when creating features
)

# display name of created features
feature_defs

[<Feature: customer_id>,
 <Feature: invoice>,
 <Feature: stock_code>,
 <Feature: quantity>,
 <Feature: price>,
 <Feature: NUM_CHARACTERS(description)>,
 <Feature: NUM_WORDS(description)>]

In [8]:
# Preview the feature matrix: original columns plus the new text features

feature_matrix.head()

Unnamed: 0_level_0,customer_id,invoice,stock_code,quantity,price,NUM_CHARACTERS(description),NUM_WORDS(description)
rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,13085.0,489434,85048,12,6.95,35.0,6.0
1,13085.0,489434,79323P,12,6.75,18.0,3.0
2,13085.0,489434,79323W,12,6.75,20.0,4.0
3,13085.0,489434,22041,48,2.1,28.0,6.0
4,13085.0,489434,21232,24,1.25,30.0,4.0


In [9]:
# Additional text primitives from the nlp_primitives package.
#
# for a demo:
# https://github.com/FeatureLabs/predict-restaurant-rating/blob/master/predict-restaurant-rating.ipynb

# blog:
# https://innovation.alteryx.com/natural-language-processing-featuretools/

from nlp_primitives import (
    DiversityScore,
    MeanCharactersPerWord,
    PunctuationCount,
)

In [10]:
# Full list of text primitives: the two primitives referenced by name plus
# the three primitive classes imported from nlp_primitives.
# The list is written out in full, rather than appended to the previous value
# of `text_primitives`, so re-running this cell always yields the same five
# primitives instead of growing the list with duplicates.
text_primitives = [
    "num_words",
    "num_characters",
    DiversityScore,
    MeanCharactersPerWord,
    PunctuationCount,
]

In [11]:
# Create text features, now including the nlp_primitives ones

feature_matrix, feature_defs = ft.dfs(
    entityset=es,                       # the entity set
    target_dataframe_name="data",       # the dataframe for which to create the features
    agg_primitives=[],                  # empty list so dfs does not fall back to its default aggregation primitives
    trans_primitives=text_primitives,   # the text features to extract
    ignore_dataframes=["invoices"],     # do not use the invoices dataframe when creating features
)

# display name of created features
feature_defs

[<Feature: customer_id>,
 <Feature: invoice>,
 <Feature: stock_code>,
 <Feature: quantity>,
 <Feature: price>,
 <Feature: DIVERSITY_SCORE(description)>,
 <Feature: MEAN_CHARACTERS_PER_WORD(description)>,
 <Feature: NUM_CHARACTERS(description)>,
 <Feature: NUM_WORDS(description)>,
 <Feature: PUNCTUATION_COUNT(description)>]

In [12]:
# Preview the feature matrix with the extended set of text features
feature_matrix.head()

Unnamed: 0_level_0,customer_id,invoice,stock_code,quantity,price,DIVERSITY_SCORE(description),MEAN_CHARACTERS_PER_WORD(description),NUM_CHARACTERS(description),NUM_WORDS(description),PUNCTUATION_COUNT(description)
rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,13085.0,489434,85048,12,6.95,0.833333,5.0,35.0,6.0,0
1,13085.0,489434,79323P,12,6.75,1.0,5.333333,18.0,3.0,0
2,13085.0,489434,79323W,12,6.75,1.0,5.666667,20.0,4.0,0
3,13085.0,489434,22041,48,2.1,1.0,4.6,28.0,6.0,1
4,13085.0,489434,21232,24,1.25,1.0,6.75,30.0,4.0,0


In [13]:
# Select the features derived from the text column by name rather than by
# position, so the selection stays correct if the number or order of the
# other columns changes. The names of these features end with the input
# column in parentheses, e.g. NUM_WORDS(description).
new_vars = feature_matrix.columns[
    feature_matrix.columns.str.endswith("(description)")
]

In [14]:
# Preview only the columns selected in new_vars
feature_matrix[new_vars].head()

Unnamed: 0_level_0,DIVERSITY_SCORE(description),MEAN_CHARACTERS_PER_WORD(description),NUM_CHARACTERS(description),NUM_WORDS(description),PUNCTUATION_COUNT(description)
rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.833333,5.0,35.0,6.0,0
1,1.0,5.333333,18.0,3.0,0
2,1.0,5.666667,20.0,4.0,0
3,1.0,4.6,28.0,6.0,1
4,1.0,6.75,30.0,4.0,0
