# 4.10 Part 2 - Deriving customer insights

### This script contains the following points:

#### 1. Import libraries
#### 2. Import data
#### 3. Create intuitive variable groups
##### 3.1 Create age ranges
##### 3.2 Create income brackets
##### 3.3 Create parental status
##### 3.4 Check data type of new variables
##### 3.5 Confirm data type of existing variables
#### 4. Data insights
#### 5. Export dataframe

# Import libraries

In [1]:
import pandas as pd
import numpy as np
import os

# Import data

In [2]:
# project root folder; the data paths in this notebook are built from it with os.path.join
# NOTE(review): absolute, machine-specific Windows path — adjust before running on another machine

path = r'C:\Users\Odette\Desktop\CareerFoundry\Immersion Courses\Course 4\Instacart Basket Analysis'

In [3]:
# load the pickled dataframe 'Instacart_final.pkl' from the project's Prepared Data folder

final_df = pd.read_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'Instacart_final.pkl')
)

In [4]:
final_df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,product_name,...,gender,state,age,date_joined,n_dependants,fam_status,income,_merge,region,exclusion_flag
0,2539329,1,1,2,8,0,196,1,False,Soda,...,Female,Alabama,31,2019-02-17,3,married,40423,both,South,Frequent Customer
1,2398795,1,2,3,7,15,196,1,True,Soda,...,Female,Alabama,31,2019-02-17,3,married,40423,both,South,Frequent Customer
2,473747,1,3,3,12,21,196,1,True,Soda,...,Female,Alabama,31,2019-02-17,3,married,40423,both,South,Frequent Customer
3,2254736,1,4,4,7,29,196,1,True,Soda,...,Female,Alabama,31,2019-02-17,3,married,40423,both,South,Frequent Customer
4,431534,1,5,4,15,28,196,1,True,Soda,...,Female,Alabama,31,2019-02-17,3,married,40423,both,South,Frequent Customer


In [5]:
final_df.shape

(30964564, 32)

# Create intuitive variable groups

### Create age ranges

In [6]:
# find minimum age

final_df['age'].min()

18

In [7]:
# find maximum age

final_df['age'].max()

81

In [8]:
# label customers aged 19 or younger
# (the stored label text is '<19', but the condition also includes age 19)

final_df.loc[final_df['age'].le(19), 'age_range'] = '<19'

In [9]:
# label customers older than 19 and no older than 29 as '20-29'

final_df.loc[final_df['age'].gt(19) & final_df['age'].le(29), 'age_range'] = '20-29'

In [10]:
# label customers older than 29 and no older than 39 as '30-39'

final_df.loc[final_df['age'].gt(29) & final_df['age'].le(39), 'age_range'] = '30-39'

In [11]:
# label customers older than 39 and no older than 49 as '40-49'

final_df.loc[final_df['age'].gt(39) & final_df['age'].le(49), 'age_range'] = '40-49'

In [12]:
# label customers older than 49 and no older than 59 as '50-59'

final_df.loc[final_df['age'].gt(49) & final_df['age'].le(59), 'age_range'] = '50-59'

In [13]:
# label customers older than 59 and no older than 69 as '60-69'

final_df.loc[final_df['age'].gt(59) & final_df['age'].le(69), 'age_range'] = '60-69'

In [14]:
# label customers older than 69 and no older than 79 as '70-79'

final_df.loc[final_df['age'].gt(69) & final_df['age'].le(79), 'age_range'] = '70-79'

In [15]:
# label customers aged 80 or older as '80+'

final_df.loc[final_df['age'].ge(80), 'age_range'] = '80+'

In [16]:
# check age_range flag 

final_df['age_range'].value_counts().sort_index()

20-29    4843069
30-39    4847507
40-49    4883179
50-59    4841771
60-69    4761621
70-79    4834286
80+       978597
<19       974534
Name: age_range, dtype: int64

In [17]:
# head results just for age & age_range

final_df[['age', 'age_range']].head()

Unnamed: 0,age,age_range
0,31,30-39
1,31,30-39
2,31,30-39
3,31,30-39
4,31,30-39


### Create income brackets

In [18]:
# find the minimum income amount

final_df['income'].min()

25903

In [19]:
# find the maximum income amount

final_df['income'].max()

593901

In [20]:
# label incomes of 99999 or less as '<100000'

final_df.loc[final_df['income'].le(99999), 'income_bracket'] = '<100000'

In [21]:
# label incomes above 99999 and up to 199999 as '100000-199999'

final_df.loc[final_df['income'].gt(99999) & final_df['income'].le(199999), 'income_bracket'] = '100000-199999'

In [22]:
# label incomes above 199999 and up to 299999 as '200000-299999'

final_df.loc[final_df['income'].gt(199999) & final_df['income'].le(299999), 'income_bracket'] = '200000-299999'

In [23]:
# label incomes above 299999 and up to 399999 as '300000-399999'

final_df.loc[final_df['income'].gt(299999) & final_df['income'].le(399999), 'income_bracket'] = '300000-399999'

In [24]:
# label incomes above 399999 and up to 499999 as '400000-499999'

final_df.loc[final_df['income'].gt(399999) & final_df['income'].le(499999), 'income_bracket'] = '400000-499999'

In [25]:
# label incomes of 500000 or more
# (the stored label text is '>500000', but the condition also includes exactly 500000)

final_df.loc[final_df['income'].ge(500000), 'income_bracket'] = '>500000'

In [26]:
# Check income_bracket flag 

final_df['income_bracket'].value_counts().sort_index()

100000-199999    13953260
200000-299999      154113
300000-399999       53033
400000-499999       27927
<100000          16757162
>500000             19069
Name: income_bracket, dtype: int64

In [27]:
# head results just for income & income_bracket

final_df[['income', 'income_bracket']].head()

Unnamed: 0,income,income_bracket
0,40423,<100000
1,40423,<100000
2,40423,<100000
3,40423,<100000
4,40423,<100000


### Create parental status

In [28]:
# check dependant numbers

final_df['n_dependants'].value_counts().sort_index()

0    7739681
1    7719106
2    7733261
3    7772516
Name: n_dependants, dtype: int64

In [29]:
# label customers with at least one dependant as 'Parent'
# NOTE(review): this treats every dependant as a child — confirm that is the intended meaning of n_dependants

final_df.loc[final_df['n_dependants'].ge(1), 'parental_status'] = 'Parent'

In [30]:
# label customers with zero dependants as 'Non-parent'

final_df.loc[final_df['n_dependants'].eq(0), 'parental_status'] = 'Non-parent'

In [31]:
# check parental_status flag

final_df['parental_status'].value_counts().sort_index()

Non-parent     7739681
Parent        23224883
Name: parental_status, dtype: int64

In [32]:
# head results just for n_dependants & parental_status

final_df[['n_dependants', 'parental_status']].head()

Unnamed: 0,n_dependants,parental_status
0,3,Parent
1,3,Parent
2,3,Parent
3,3,Parent
4,3,Parent


### Check data type of new variables

In [33]:
final_df[['age_range', 'income_bracket', 'parental_status']].dtypes

age_range          object
income_bracket     object
parental_status    object
dtype: object

In [34]:
# cast the three derived label columns (age_range, income_bracket, parental_status)
# from object to category, with the aim of reducing memory use
# NOTE(review): the resulting categories are unordered and sort lexically, which is why
# '<19' and '<100000' appear out of numeric order in sorted outputs — consider ordered categories

final_df[['age_range', 'income_bracket', 'parental_status']] = (
    final_df[['age_range', 'income_bracket', 'parental_status']].astype(dtype='category')
)

In [35]:
# confirm dtype change

final_df[['age_range', 'income_bracket', 'parental_status']].dtypes

age_range          category
income_bracket     category
parental_status    category
dtype: object

### Confirm data type of existing variables

In [36]:
final_df.dtypes

order_id                       category
user_id                        category
order_number                       int8
orders_day_of_week                 int8
order_hour_of_day                  int8
days_since_last_order              int8
product_id                     category
add_to_cart_order                 int16
reordered                          bool
product_name                     object
aisle_id                       category
department_id                  category
prices                          float16
price_range_loc                category
busiest_days                   category
busiest_period_of_day          category
max_order                          int8
loyalty_flag                   category
average_spend                   float16
spending_type                    object
median_order_days               float16
order_frequency                category
gender                           object
state                            object
age                               int64


In [37]:
# cast the gender, state, fam_status, spending_type and product_name columns
# to category as well, with the aim of further reducing memory use

final_df[['gender', 'state', 'fam_status', 'spending_type', 'product_name']] = (
    final_df[['gender', 'state', 'fam_status', 'spending_type', 'product_name']].astype(dtype='category')
)

In [38]:
# confirm dtype changes

final_df.dtypes

order_id                       category
user_id                        category
order_number                       int8
orders_day_of_week                 int8
order_hour_of_day                  int8
days_since_last_order              int8
product_id                     category
add_to_cart_order                 int16
reordered                          bool
product_name                   category
aisle_id                       category
department_id                  category
prices                          float16
price_range_loc                category
busiest_days                   category
busiest_period_of_day          category
max_order                          int8
loyalty_flag                   category
average_spend                   float16
spending_type                  category
median_order_days               float16
order_frequency                category
gender                         category
state                          category
age                               int64


# Data insights

In [39]:
# count rows for every age_range (rows) x income_bracket (columns) combination;
# dropna=False keeps rows/columns whose entries are all NaN

xtab_age_income = pd.crosstab(
    index=final_df['age_range'],
    columns=final_df['income_bracket'],
    dropna=False,
)

In [40]:
xtab_age_income

income_bracket,100000-199999,200000-299999,300000-399999,400000-499999,<100000,>500000
age_range,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20-29,136078,6727,10447,0,4689817,0
30-39,141512,9179,9325,0,4687491,0
40-49,3041066,40300,6454,9179,1780300,5880
50-59,3320244,25040,10541,4112,1478202,3632
60-69,3281440,31669,8762,7821,1429298,2631
70-79,3349359,31563,4904,5628,1437862,4970
80+,655191,8056,581,1187,311626,1956
<19,28370,1579,2019,0,942566,0


In [41]:
# count rows for every age_range (rows) x parental_status (columns) combination

xtab_age_parent = pd.crosstab(
    index=final_df['age_range'],
    columns=final_df['parental_status'],
    dropna=False,
)

In [42]:
xtab_age_parent

parental_status,Non-parent,Parent
age_range,Unnamed: 1_level_1,Unnamed: 2_level_1
20-29,1222782,3620287
30-39,1212957,3634550
40-49,1213734,3669445
50-59,1220101,3621670
60-69,1204466,3557155
70-79,1184761,3649525
80+,256044,722553
<19,224836,749698


In [43]:
# count rows for every age_range (rows) x loyalty_flag (columns) combination

xtab_age_loyalty = pd.crosstab(
    index=final_df['age_range'],
    columns=final_df['loyalty_flag'],
    dropna=False,
)

In [44]:
xtab_age_loyalty

loyalty_flag,Loyal customer,New customer,Regular customer
age_range,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20-29,1596190,750118,2496761
30-39,1606129,739301,2502077
40-49,1632365,752701,2498113
50-59,1617569,761285,2462917
60-69,1586126,748124,2427371
70-79,1593284,754119,2486883
80+,335363,147732,495502
<19,317067,150315,507152


In [45]:
# count rows for every parental_status (rows) x order_frequency (columns) combination

xtab_parent_freq = pd.crosstab(
    index=final_df['parental_status'],
    columns=final_df['order_frequency'],
    dropna=False,
)

In [46]:
xtab_parent_freq

order_frequency,Frequent customer,Non-frequent customer,Regular customer
parental_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Non-parent,5544082,540110,1655489
Parent,16664812,1647307,4912764


In [47]:
# count rows for every age_range (rows) x busiest_period_of_day (columns) combination

xtab_age_dayperiod = pd.crosstab(
    index=final_df['age_range'],
    columns=final_df['busiest_period_of_day'],
    dropna=False,
)

In [48]:
xtab_age_dayperiod

busiest_period_of_day,Average orders,Fewest orders,Most orders
age_range,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20-29,1231079,133672,3478318
30-39,1241078,133677,3472752
40-49,1252743,131909,3498527
50-59,1252219,130444,3459108
60-69,1218905,128579,3414137
70-79,1246474,131203,3456609
80+,261489,28198,688910
<19,249032,30662,694840


In [49]:
# count rows for every age_range (rows) x spending_type (columns) combination

xtab_age_spender = pd.crosstab(
    index=final_df['age_range'],
    columns=final_df['spending_type'],
    dropna=False,
)

In [50]:
xtab_age_spender

spending_type,High spender,Low spender
age_range,Unnamed: 1_level_1,Unnamed: 2_level_1
20-29,93053,4750016
30-39,91963,4755544
40-49,108340,4774839
50-59,87053,4754718
60-69,97375,4664246
70-79,89012,4745274
80+,20727,957870
<19,15848,958686


In [51]:
# count rows for every department_id (rows) x income_bracket (columns) combination

xtab_deptid_income = pd.crosstab(
    index=final_df['department_id'],
    columns=final_df['income_bracket'],
    dropna=False,
)

In [52]:
xtab_deptid_income

income_bracket,100000-199999,200000-299999,300000-399999,400000-499999,<100000,>500000
department_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,968648,12114,4490,2038,1133231,1210
2,15279,295,112,48,18661,16
3,515107,7727,2812,1266,593117,799
4,4187609,29052,10791,5307,4842271,4243
5,63015,3312,1361,846,75485,608
6,117954,1159,442,249,136000,187
7,1101801,15370,4842,2726,1445439,1723
8,43206,1136,316,279,47971,152
9,385848,4059,1460,749,429405,615
10,14191,22,1,1,19233,3


# Export dataframe

In [53]:
# write final_df, including the new flag columns, to 'Instacart_final2.pkl' in the Prepared Data folder

final_df.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'Instacart_final2.pkl')
)