## Zautomatyzowana eksploracja danych



In [5]:
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
import warnings
warnings.simplefilter("ignore")

# Seed the generator so the demo frame (and the profiling report built from it)
# is identical on every Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

# 100 rows x 5 columns of uniform samples from [0, 1).
df = pd.DataFrame(rng.random((100, 5)), columns=['a', 'b', 'c', 'd', 'e'])
df

Unnamed: 0,a,b,c,d,e
0,0.161173,0.328087,0.397611,0.753076,0.666922
1,0.013127,0.723590,0.762098,0.060419,0.646746
2,0.140227,0.437692,0.868179,0.122205,0.701317
3,0.068732,0.834403,0.012661,0.494414,0.595600
4,0.898939,0.412232,0.912804,0.209317,0.575482
...,...,...,...,...,...
95,0.173063,0.607937,0.424769,0.081226,0.223279
96,0.235608,0.550652,0.398154,0.420294,0.709562
97,0.335784,0.862064,0.279954,0.145985,0.907339
98,0.159712,0.201389,0.667647,0.807603,0.733181


In [6]:
# Build a profiling report for `df`; the title and the dark theme are passed
# straight to the constructor.
profile = ProfileReport(
    df,
    title="Pandas Profile Report",
    dark_mode=True,
)

In [7]:
# Render the report inside the notebook as an embedded iframe.
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
# Render the same report as a notebook widget tree instead of an iframe.
profile.to_widgets()

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

___
## Zautomatyzowana inżynieria cech

In [10]:
import featuretools as ft
# Load the featuretools mock-customer demo data as an EntitySet
# (return_entityset=True) and display its summary.
es = ft.demo.load_mock_customer(return_entityset=True)
es

Entityset: transactions
  DataFrames:
    transactions [Rows: 500, Columns: 6]
    products [Rows: 5, Columns: 3]
    sessions [Rows: 35, Columns: 5]
    customers [Rows: 5, Columns: 5]
  Relationships:
    transactions.product_id -> products.product_id
    transactions.session_id -> sessions.session_id
    sessions.customer_id -> customers.customer_id

In [11]:
# List the names of the dataframes held by the EntitySet.
es.dataframe_dict.keys()

dict_keys(['transactions', 'products', 'sessions', 'customers'])

In [12]:
# Show the 'customers' dataframe stored in the EntitySet.
es.dataframe_dict['customers']

Unnamed: 0,customer_id,zip_code,join_date,birthday,_ft_last_time
5,5,60091,2010-07-17 05:27:50,1984-07-28,2014-01-01 08:09:40
4,4,60091,2011-04-08 20:08:14,2006-08-15,2014-01-01 05:31:30
1,1,60091,2011-04-17 10:48:33,1994-07-18,2014-01-01 07:26:20
3,3,13244,2011-08-13 15:42:34,2003-11-21,2014-01-01 09:00:35
2,2,13244,2012-04-15 23:31:04,1986-08-18,2014-01-01 08:23:45


In [13]:
# Pull the 'transactions' dataframe out of the EntitySet and display it.
# NOTE(review): this rebinds `df`, so any earlier frame stored under that name
# is no longer reachable from here on.
df = es.dataframe_dict['transactions']
df

Unnamed: 0,transaction_id,session_id,transaction_time,product_id,amount,_ft_last_time
298,298,1,2014-01-01 00:00:00,5,127.64,2014-01-01 00:00:00
2,2,1,2014-01-01 00:01:05,2,109.48,2014-01-01 00:01:05
308,308,1,2014-01-01 00:02:10,3,95.06,2014-01-01 00:02:10
116,116,1,2014-01-01 00:03:15,4,78.92,2014-01-01 00:03:15
371,371,1,2014-01-01 00:04:20,3,31.54,2014-01-01 00:04:20
...,...,...,...,...,...,...
112,112,35,2014-01-01 08:56:15,5,55.42,2014-01-01 08:56:15
111,111,35,2014-01-01 08:57:20,3,34.87,2014-01-01 08:57:20
276,276,35,2014-01-01 08:58:25,1,10.94,2014-01-01 08:58:25
266,266,35,2014-01-01 08:59:30,5,19.86,2014-01-01 08:59:30


In [14]:
# Rebind `df` to the 'sessions' dataframe and preview its first rows.
df = es.dataframe_dict['sessions']
df.head()

Unnamed: 0,session_id,customer_id,device,session_start,_ft_last_time
1,1,2,desktop,2014-01-01 00:00:00,2014-01-01 00:16:15
2,2,5,mobile,2014-01-01 00:17:20,2014-01-01 00:27:05
3,3,4,mobile,2014-01-01 00:28:10,2014-01-01 00:43:20
4,4,1,mobile,2014-01-01 00:44:25,2014-01-01 01:10:25
5,5,4,mobile,2014-01-01 01:11:30,2014-01-01 01:22:20


In [15]:
from ydata_profiling import ProfileReport

# Profile whatever frame is currently bound to `df`; the trailing bare name
# asks the notebook to display the report.
profile = ProfileReport(
    df,
    title="Pandas Profile Report",
    dark_mode=True,
)
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [18]:
# Run ft.dfs against the 'customers' dataframe with one aggregation primitive
# (count), one transform primitive (month) and max_depth=1, then show the
# resulting feature matrix.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name='customers',
    agg_primitives=['count'],
    trans_primitives=['month'],
    max_depth=1,
)
feature_matrix

Unnamed: 0_level_0,zip_code,COUNT(sessions),MONTH(birthday),MONTH(join_date)
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5,60091,6,7,7
4,60091,8,8,4
1,60091,8,7,4
3,13244,6,11,8
2,13244,7,8,4


In [19]:
# (rows, columns) of the feature matrix produced by the max_depth=1 run.
feature_matrix.shape

(5, 4)

In [20]:
# Same target dataframe, now with three aggregation primitives (mean, sum,
# mode), two transform primitives (month, hour) and max_depth=2.
# Overwrites the previous feature_matrix / feature_defs.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name='customers',
    agg_primitives=['mean', 'sum', 'mode'],
    trans_primitives=['month', 'hour'],
    max_depth=2,
)
feature_matrix

Unnamed: 0_level_0,zip_code,MODE(sessions.device),MEAN(transactions.amount),MODE(transactions.product_id),SUM(transactions.amount),HOUR(birthday),HOUR(join_date),MONTH(birthday),MONTH(join_date),MEAN(sessions.MEAN(transactions.amount)),MEAN(sessions.SUM(transactions.amount)),MODE(sessions.HOUR(session_start)),MODE(sessions.MODE(transactions.product_id)),MODE(sessions.MONTH(session_start)),SUM(sessions.MEAN(transactions.amount)),MODE(transactions.sessions.device)
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
5,60091,mobile,80.375443,5,6349.66,0,5,7,7,78.705187,1058.276667,0,3,1,472.231119,mobile
4,60091,mobile,80.070459,2,8727.68,0,20,8,4,81.207189,1090.96,1,1,1,649.657515,mobile
1,60091,mobile,71.631905,4,9025.62,0,10,7,4,72.77414,1128.2025,6,4,1,582.193117,mobile
3,13244,desktop,67.06043,1,6236.62,0,15,11,8,67.539577,1039.436667,5,1,1,405.237462,desktop
2,13244,desktop,77.422366,4,7200.28,0,23,8,4,78.415122,1028.611429,3,3,1,548.905851,desktop


In [21]:
# (rows, columns) of the feature matrix produced by the max_depth=2 run.
feature_matrix.shape

(5, 16)

In [22]:
# Identical primitives to the max_depth=2 run; only the depth is raised to 3
# so the effect of deeper stacking on the feature set can be compared.
# Overwrites the previous feature_matrix / feature_defs.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name='customers',
    agg_primitives=['mean', 'sum', 'mode'],
    trans_primitives=['month', 'hour'],
    max_depth=3,
)
feature_matrix

Unnamed: 0_level_0,zip_code,MODE(sessions.device),MEAN(transactions.amount),MODE(transactions.product_id),SUM(transactions.amount),HOUR(birthday),HOUR(join_date),MONTH(birthday),MONTH(join_date),MEAN(sessions.MEAN(transactions.amount)),MEAN(sessions.SUM(transactions.amount)),MODE(sessions.HOUR(session_start)),MODE(sessions.MODE(transactions.product_id)),MODE(sessions.MONTH(session_start)),SUM(sessions.MEAN(transactions.amount)),MODE(transactions.HOUR(transaction_time)),MODE(transactions.MONTH(transaction_time)),MODE(transactions.sessions.device),MODE(sessions.MODE(transactions.HOUR(transaction_time))),MODE(sessions.MODE(transactions.MONTH(transaction_time)))
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
5,60091,mobile,80.375443,5,6349.66,0,5,7,7,78.705187,1058.276667,0,3,1,472.231119,7,1,mobile,0,1
4,60091,mobile,80.070459,2,8727.68,0,20,8,4,81.207189,1090.96,1,1,1,649.657515,5,1,mobile,2,1
1,60091,mobile,71.631905,4,9025.62,0,10,7,4,72.77414,1128.2025,6,4,1,582.193117,6,1,mobile,6,1
3,13244,desktop,67.06043,1,6236.62,0,15,11,8,67.539577,1039.436667,5,1,1,405.237462,8,1,desktop,8,1
2,13244,desktop,77.422366,4,7200.28,0,23,8,4,78.415122,1028.611429,3,3,1,548.905851,3,1,desktop,3,1


In [23]:
# (rows, columns) of the feature matrix produced by the max_depth=3 run.
feature_matrix.shape

(5, 20)

In [24]:
# Keep a single stacked feature as a one-column DataFrame (list selector, so
# the result stays two-dimensional).
feature_matrix.loc[:, ['MODE(sessions.HOUR(session_start))']]

Unnamed: 0_level_0,MODE(sessions.HOUR(session_start))
customer_id,Unnamed: 1_level_1
5,0
4,1
1,6
3,5
2,3


In [26]:
# Look the feature up by name rather than by position: the index of a feature
# inside `feature_defs` depends on the primitives and max_depth passed to
# ft.dfs, so a hard-coded index silently starts describing a different
# feature as soon as those change.
features_by_name = {feature.get_name(): feature for feature in feature_defs}

# A missing name raises KeyError naming the feature, instead of describing the
# wrong one.
ft.describe_feature(features_by_name['MODE(sessions.HOUR(session_start))'])

'The most frequently occurring value of the hour value of the "session_start" of all instances of "sessions" for each "customer_id" in "customers".'

In [35]:
import pandas as pd
import featuretools as ft

from featuretools.selection import (
    remove_highly_correlated_features,
    remove_highly_null_features,
    remove_single_value_features,
)

from featuretools.demo.flight import load_flight

# Load the featuretools flight demo data, limited via nrows=50, and display
# the EntitySet summary.
# NOTE(review): this rebinds `es`; any EntitySet previously stored under that
# name is replaced.
es = load_flight(nrows=50)
es

Downloading data ...


Entityset: Flight Data
  DataFrames:
    trip_logs [Rows: 50, Columns: 21]
    flights [Rows: 6, Columns: 9]
    airlines [Rows: 1, Columns: 1]
    airports [Rows: 4, Columns: 3]
  Relationships:
    trip_logs.flight_id -> flights.flight_id
    flights.carrier -> airlines.carrier
    flights.dest -> airports.dest

In [39]:
# One shared cutoff timestamp for five selected trip_log ids.
cutoff_times = pd.DataFrame({
    'trip_log_id': [30, 1, 2, 3, 4],
    'time': pd.to_datetime(['2016-09-22 00:00:00'] * 5),
})

# Both primitive lists are empty, so no aggregation or transform primitives
# are applied when building the matrix for 'trip_logs'.
fm, features = ft.dfs(
    entityset=es,
    target_dataframe_name="trip_logs",
    cutoff_time=cutoff_times,
    trans_primitives=[],
    agg_primitives=[],
    max_depth=2,
)
fm

Unnamed: 0_level_0,flight_id,dep_delay,taxi_out,taxi_in,arr_delay,diverted,air_time,distance,carrier_delay,weather_delay,national_airspace_delay,security_delay,late_aircraft_delay,canceled,flights.origin,flights.origin_city,flights.origin_state,flights.dest,flights.distance_group,flights.carrier,flights.flight_num,flights.airports.dest_city,flights.airports.dest_state
trip_log_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
30,AA-494:RSW->CLT,,,,,,,600.0,,,,,,,RSW,"Fort Myers, FL",FL,CLT,3.0,AA,494.0,"Charlotte, NC",NC
1,AA-494:CLT->PHX,,,,,,,1773.0,,,,,,,CLT,"Charlotte, NC",NC,PHX,8.0,AA,494.0,"Phoenix, AZ",AZ
2,AA-494:CLT->PHX,,,,,,,1773.0,,,,,,,CLT,"Charlotte, NC",NC,PHX,8.0,AA,494.0,"Phoenix, AZ",AZ
3,AA-494:CLT->PHX,,,,,,,1773.0,,,,,,,CLT,"Charlotte, NC",NC,PHX,8.0,AA,494.0,"Phoenix, AZ",AZ
4,,,,,,,,,,,,,,,,,,,,,,,


In [49]:
# Drop columns that are (almost) entirely null, using the library's default
# threshold. The result is only displayed; `fm` itself is not reassigned.
remove_highly_null_features(fm)

Unnamed: 0_level_0,flight_id,distance,flights.origin,flights.origin_city,flights.origin_state,flights.dest,flights.distance_group,flights.carrier,flights.flight_num,flights.airports.dest_city,flights.airports.dest_state
trip_log_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
30,AA-494:RSW->CLT,600.0,RSW,"Fort Myers, FL",FL,CLT,3.0,AA,494.0,"Charlotte, NC",NC
1,AA-494:CLT->PHX,1773.0,CLT,"Charlotte, NC",NC,PHX,8.0,AA,494.0,"Phoenix, AZ",AZ
2,AA-494:CLT->PHX,1773.0,CLT,"Charlotte, NC",NC,PHX,8.0,AA,494.0,"Phoenix, AZ",AZ
3,AA-494:CLT->PHX,1773.0,CLT,"Charlotte, NC",NC,PHX,8.0,AA,494.0,"Phoenix, AZ",AZ
4,,,,,,,,,,,


In [53]:
# Same filter with a much stricter null threshold of 20%. The result is only
# displayed; `fm` itself is not reassigned.
remove_highly_null_features(fm, pct_null_threshold=0.2)

30
1
2
3
4


In [43]:
# Apply the highly-correlated-feature filter with its default settings.
# The result is only displayed; `fm` itself is not reassigned.
remove_highly_correlated_features(fm)

Unnamed: 0_level_0,flight_id,dep_delay,taxi_out,taxi_in,arr_delay,diverted,air_time,distance,carrier_delay,weather_delay,national_airspace_delay,security_delay,late_aircraft_delay,canceled,flights.origin,flights.origin_city,flights.origin_state,flights.dest,flights.distance_group,flights.carrier,flights.flight_num,flights.airports.dest_city,flights.airports.dest_state
trip_log_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
30,AA-494:RSW->CLT,,,,,,,600.0,,,,,,,RSW,"Fort Myers, FL",FL,CLT,3.0,AA,494.0,"Charlotte, NC",NC
1,AA-494:CLT->PHX,,,,,,,1773.0,,,,,,,CLT,"Charlotte, NC",NC,PHX,8.0,AA,494.0,"Phoenix, AZ",AZ
2,AA-494:CLT->PHX,,,,,,,1773.0,,,,,,,CLT,"Charlotte, NC",NC,PHX,8.0,AA,494.0,"Phoenix, AZ",AZ
3,AA-494:CLT->PHX,,,,,,,1773.0,,,,,,,CLT,"Charlotte, NC",NC,PHX,8.0,AA,494.0,"Phoenix, AZ",AZ
4,,,,,,,,,,,,,,,,,,,,,,,


In [42]:
# Apply the single-value-feature filter with its default settings.
# The result is only displayed; `fm` itself is not reassigned.
remove_single_value_features(fm)

Unnamed: 0_level_0,flight_id,distance,flights.origin,flights.origin_city,flights.origin_state,flights.dest,flights.distance_group,flights.airports.dest_city,flights.airports.dest_state
trip_log_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
30,AA-494:RSW->CLT,600.0,RSW,"Fort Myers, FL",FL,CLT,3.0,"Charlotte, NC",NC
1,AA-494:CLT->PHX,1773.0,CLT,"Charlotte, NC",NC,PHX,8.0,"Phoenix, AZ",AZ
2,AA-494:CLT->PHX,1773.0,CLT,"Charlotte, NC",NC,PHX,8.0,"Phoenix, AZ",AZ
3,AA-494:CLT->PHX,1773.0,CLT,"Charlotte, NC",NC,PHX,8.0,"Phoenix, AZ",AZ
4,,,,,,,,,
