In [1]:
pip install featuretools

Collecting featuretools
  Downloading featuretools-1.28.0-py3-none-any.whl (619 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/619.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.1/619.2 kB[0m [31m5.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.2/619.2 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting holidays<0.33,>=0.13 (from featuretools)
  Downloading holidays-0.32-py3-none-any.whl (754 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m754.4/754.4 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Collecting woodwork>=0.23.0 (from featuretools)
  Downloading woodwork-0.26.0-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.0/236.0 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: holidays, woodwork, featuretools
  Attempting uninst

In [2]:
import numpy as np
import featuretools as ft
import pandas as pd

In [3]:
from featuretools.primitives.standard.transform.datetime.season import date

customers = pd.read_csv('/content/customers.csv',
                        dtype ={'CustomerId': int, 'Name' : str, 'Email' :str , 'SignupDate':str},
                        parse_dates=['SignupDate'])
products = pd.read_csv('/content/products.csv',
                       dtype = {'ProductID': int, 'Name' : str, 'Category' :str , 'Price':float})
orders = pd.read_csv('/content/orders.csv',
                     dtype = {'OrderID': int, 'CustomerID' : int, 'OrderDate' :str , 'ShipDate':str},
                     parse_dates=['OrderDate','ShipDate'] )
orderDetails = pd.read_csv('/content/orderDetails.csv',
                           dtype = {'OrderID': int, 'ProductID' : int, 'Quantity' :int , 'Discount':float})

#remove any nan column
customers = customers.dropna()
products = products.dropna()
orders = orders.dropna()
orderDetails = orderDetails.dropna()

In [4]:
#create entity and entityset
es = ft.EntitySet(id = 'orderDetails')

#use existing index
es = es.add_dataframe(dataframe_name= 'products', dataframe = products, index = 'ProductID')
es = es.add_dataframe(dataframe_name= 'customers', dataframe = customers, index = 'CustomerID')
es = es.add_dataframe(dataframe_name= 'orders', dataframe = orders, index = 'OrderID')
#create new unique index
es = es.add_dataframe(dataframe_name= 'orderDetails', dataframe = orderDetails,
    make_index = True, index = 'orderDetails_index')

es

Entityset: orderDetails
  DataFrames:
    products [Rows: 10, Columns: 4]
    customers [Rows: 15, Columns: 4]
    orders [Rows: 20, Columns: 4]
    orderDetails [Rows: 20, Columns: 5]
  Relationships:
    No relationships

In [5]:
#create relationship
es = es.add_relationship('customers', 'CustomerID', 'orders', 'CustomerID')
es = es.add_relationship('orders', 'OrderID', 'orderDetails', 'OrderID')
es = es.add_relationship('products','ProductID', 'orderDetails', 'ProductID')

es

Entityset: orderDetails
  DataFrames:
    products [Rows: 10, Columns: 4]
    customers [Rows: 15, Columns: 4]
    orders [Rows: 20, Columns: 4]
    orderDetails [Rows: 20, Columns: 5]
  Relationships:
    orders.CustomerID -> customers.CustomerID
    orderDetails.OrderID -> orders.OrderID
    orderDetails.ProductID -> products.ProductID

In [6]:
#Determine primitive
agg_primitives =  ["sum","count", "percent_true", "mode"]
trans_primitives =  ["day", "year", "month", "weekday", "haversine","num_words", "num_characters",
                     "subtract_numeric","add_numeric","multiply_numeric"]

#DFS with specified primitives
feature_matrix, feature_defs = ft.dfs(entityset = es,
    target_dataframe_name = 'orderDetails',
    trans_primitives = trans_primitives,
    agg_primitives=agg_primitives,
    max_depth = 4, n_jobs = -1, verbose = 1)



  trans_primitives: ['haversine', 'num_characters', 'num_words']
  agg_primitives: ['percent_true']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. If the DFS call contained multiple instances of a primitive in the list above, none of them were used.


Built 543 features
Elapsed: 00:00 | Progress:   0%|          

INFO:distributed.http.proxy:To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy
INFO:distributed.scheduler:State start
INFO:distributed.scheduler:  Scheduler at:     tcp://127.0.0.1:46707
INFO:distributed.scheduler:  dashboard at:  http://127.0.0.1:8787/status
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:42987'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:46367'
INFO:distributed.scheduler:Register worker <WorkerState 'tcp://127.0.0.1:38743', name: 0, status: init, memory: 0, processing: 0>
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:38743
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:48366
INFO:distributed.scheduler:Register worker <WorkerState 'tcp://127.0.0.1:43745', name: 1, status: init, memory: 0, processing: 0>
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:43745
INFO:distributed.core:Sta

EntitySet scattered to 2 workers in 3 seconds
Elapsed: 00:02 | Progress:  95%|█████████▌

INFO:distributed.scheduler:Remove client Client-90d145a3-81da-11ee-80f1-0242ac1c000c
INFO:distributed.core:Received 'close-stream' from tcp://127.0.0.1:59184; closing.
INFO:distributed.scheduler:Remove client Client-90d145a3-81da-11ee-80f1-0242ac1c000c
INFO:distributed.scheduler:Close client connection: Client-90d145a3-81da-11ee-80f1-0242ac1c000c
INFO:distributed.nanny:Closing Nanny at 'tcp://127.0.0.1:42987'. Reason: nanny-close
INFO:distributed.nanny:Nanny asking worker to close. Reason: nanny-close
INFO:distributed.nanny:Closing Nanny at 'tcp://127.0.0.1:46367'. Reason: nanny-close
INFO:distributed.nanny:Nanny asking worker to close. Reason: nanny-close
INFO:distributed.core:Received 'close-stream' from tcp://127.0.0.1:48366; closing.
INFO:distributed.scheduler:Remove worker <WorkerState 'tcp://127.0.0.1:38743', name: 0, status: closing, memory: 0, processing: 0> (stimulus_id='handle-worker-cleanup-1699848628.638726')
INFO:distributed.batched:Batched Comm Closed <TCP (closed) Schedu

Elapsed: 00:05 | Progress: 100%|██████████


In [7]:
feature_matrix

Unnamed: 0_level_0,OrderID,ProductID,Quantity,Discount,Discount + Quantity,Discount * Quantity,Discount - Quantity,orders.CustomerID,products.Price,orders.COUNT(orderDetails),...,orders.customers.SUM(orderDetails.Quantity) - products.COUNT(orderDetails),orders.customers.SUM(orderDetails.Quantity) - products.Price,orders.customers.SUM(orderDetails.Quantity) - products.SUM(orderDetails.Discount * Quantity),orders.customers.SUM(orderDetails.Quantity) - products.SUM(orderDetails.Discount + Quantity),orders.customers.SUM(orderDetails.Quantity) - products.SUM(orderDetails.Discount - Quantity),orders.customers.SUM(orderDetails.Quantity) - products.SUM(orderDetails.Discount),orders.customers.SUM(orderDetails.Quantity) - products.SUM(orderDetails.Quantity),products.SUM(orderDetails.Discount * Quantity) - Quantity,products.SUM(orderDetails.Discount + Quantity) - Quantity,products.SUM(orderDetails.Discount - Quantity) - Quantity
orderDetails_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,301,201,1,0.0,1.0,0.0,-1.0,101,500.0,1,...,1.0,-497.0,3.0,1.0,5.0,3.0,1.0,-1.0,1.0,-3.0
1,302,202,2,0.1,2.1,0.2,-1.9,102,300.0,1,...,1.0,-297.0,2.6,-1.2,6.8,2.8,-1.0,-1.6,2.2,-5.8
2,303,203,1,0.0,1.0,0.0,-1.0,103,700.0,1,...,0.0,-698.0,2.0,0.0,4.0,2.0,0.0,-1.0,1.0,-3.0
3,304,204,3,0.2,3.2,0.6,-2.8,104,20.0,1,...,3.0,-15.0,3.8,-1.4,10.6,4.6,-1.0,-1.8,3.4,-8.6
4,305,205,1,0.0,1.0,0.0,-1.0,105,50.0,1,...,0.0,-48.0,2.0,0.0,4.0,2.0,0.0,-1.0,1.0,-3.0
5,306,206,2,0.0,2.0,0.0,-2.0,106,80.0,1,...,0.0,-78.0,2.0,-2.0,6.0,2.0,-2.0,-2.0,2.0,-6.0
6,307,207,1,0.15,1.15,0.15,-0.85,107,1000.0,1,...,-1.0,-999.0,0.7,-1.3,2.7,0.7,-1.0,-0.7,1.3,-2.7
7,308,208,1,0.0,1.0,0.0,-1.0,108,800.0,1,...,-1.0,-799.0,1.0,-1.0,3.0,1.0,-1.0,-1.0,1.0,-3.0
8,309,209,2,0.1,2.1,0.2,-1.9,109,150.0,1,...,0.0,-148.0,1.6,-2.2,5.8,1.8,-2.0,-1.6,2.2,-5.8
9,310,210,1,0.0,1.0,0.0,-1.0,110,200.0,1,...,-1.0,-199.0,1.0,-1.0,3.0,1.0,-1.0,-1.0,1.0,-3.0


In [8]:
feature_defs

[<Feature: OrderID>,
 <Feature: ProductID>,
 <Feature: Quantity>,
 <Feature: Discount>,
 <Feature: Discount + Quantity>,
 <Feature: Discount * Quantity>,
 <Feature: Discount - Quantity>,
 <Feature: orders.CustomerID>,
 <Feature: products.Price>,
 <Feature: orders.COUNT(orderDetails)>,
 <Feature: orders.SUM(orderDetails.Discount)>,
 <Feature: orders.SUM(orderDetails.Quantity)>,
 <Feature: orders.DAY(OrderDate)>,
 <Feature: orders.DAY(ShipDate)>,
 <Feature: orders.MONTH(OrderDate)>,
 <Feature: orders.MONTH(ShipDate)>,
 <Feature: orders.WEEKDAY(OrderDate)>,
 <Feature: orders.WEEKDAY(ShipDate)>,
 <Feature: orders.YEAR(OrderDate)>,
 <Feature: orders.YEAR(ShipDate)>,
 <Feature: products.COUNT(orderDetails)>,
 <Feature: products.SUM(orderDetails.Discount)>,
 <Feature: products.SUM(orderDetails.Quantity)>,
 <Feature: Discount + products.Price>,
 <Feature: products.Price + Quantity>,
 <Feature: Discount * products.Price>,
 <Feature: products.Price * Quantity>,
 <Feature: Discount - products.Pri

In [9]:
feature_matrix.to_csv('/content/result.csv', index=False)