# Deriving Customer Insights

# Script contains:
## Importing Libraries and Data
## Creating intuitive variable groups (age, income, parental status)
## Checking and confirming new variable types
## changing variable types 
## Deriving Insights:
### crosstab of age_range & income_bracket variables
### crosstab of age_range & parental_status variables
### crosstab of age_range & loyalty_flag variables
### crosstab parental status & order frequency
### crosstab of age_range & spending_habit
### crosstab department_id & income_bracket

## Import Libraries

In [2]:
import pandas as pd
import numpy as np
import os

## Import Data

In [3]:
# Root folder of the Instacart project; the data file paths in this script are joined onto it.
# NOTE(review): absolute, machine-specific Windows path - adjust before running on another machine.
path = r'C:\Users\Bradley Allen\Desktop\Instacart Basket Analysis'

In [4]:
# Load the prepared Instacart dataframe from '02 Data/Prepared Data' under the project folder.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
source_pickle = os.path.join(path, '02 Data', 'Prepared Data', 'Instacart_final.pkl')
final_df = pd.read_pickle(source_pickle)

In [5]:
final_df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,product_name,...,gender,state,age,date_joined,n_dependants,fam_status,income,_merge,region,exclusion_flag
0,2539329,1,1,2,8,,True,196,1,Soda,...,Female,Alabama,31,2/17/2019,3,married,40423,both,South,Frequent Customer
1,2539329,1,1,2,8,,True,12427,3,Original Beef Jerky,...,Female,Alabama,31,2/17/2019,3,married,40423,both,South,Frequent Customer
2,2398795,1,2,3,7,15.0,False,10258,2,Pistachios,...,Female,Alabama,31,2/17/2019,3,married,40423,both,South,Frequent Customer
3,473747,1,3,3,12,21.0,False,12427,2,Original Beef Jerky,...,Female,Alabama,31,2/17/2019,3,married,40423,both,South,Frequent Customer
4,473747,1,3,3,12,21.0,False,10258,3,Pistachios,...,Female,Alabama,31,2/17/2019,3,married,40423,both,South,Frequent Customer


In [6]:
final_df.shape

(6164429, 28)

## Creating intuitive Variable groups

### age ranges

In [9]:
final_df['age'].min()

18

In [10]:
final_df['age'].max()

81

In [11]:
# age group for customers aged 19 or younger
# NOTE: the '<19' label also covers age 19 itself (the condition is age <= 19)
final_df.loc[final_df['age'].le(19), 'age_range'] = '<19'

In [12]:
# age group 20-29 (age is an integer column, so 20..29 inclusive equals > 19 and <= 29)
final_df.loc[final_df['age'].between(20, 29), 'age_range'] = '20-29'

In [13]:
# age group 30-39 (both bounds inclusive)
final_df.loc[final_df['age'].between(30, 39), 'age_range'] = '30-39'

In [14]:
# age group 40-49 (both bounds inclusive)
final_df.loc[final_df['age'].between(40, 49), 'age_range'] = '40-49'

In [15]:
# age group 50-59 (both bounds inclusive)
final_df.loc[final_df['age'].between(50, 59), 'age_range'] = '50-59'

In [16]:
# age group 60-69 (both bounds inclusive)
final_df.loc[final_df['age'].between(60, 69), 'age_range'] = '60-69'

In [17]:
# age group 70-79 (both bounds inclusive)
final_df.loc[final_df['age'].between(70, 79), 'age_range'] = '70-79'

In [18]:
# age group for customers aged 80 or older
final_df.loc[final_df['age'].ge(80), 'age_range'] = '80+'

In [19]:
# checking age_range flag establishment
final_df['age_range'].value_counts().sort_index()

20-29    963285
30-39    965210
40-49    972762
50-59    964081
60-69    948218
70-79    962148
80+      194802
<19      193923
Name: age_range, dtype: int64

In [20]:
final_df[['age', 'age_range']].head()

Unnamed: 0,age,age_range
0,31,30-39
1,31,30-39
2,31,30-39
3,31,30-39
4,31,30-39


In [21]:
final_df[['age', 'age_range']].tail()

Unnamed: 0,age,age_range
6480161,74,70-79
6480162,74,70-79
6480163,74,70-79
6480164,74,70-79
6480165,74,70-79


### Creating Income groups

In [22]:
final_df['income'].min()

25903

In [23]:
final_df['income'].max()

593901

In [24]:
# income bracket for incomes up to and including 99,999
final_df.loc[final_df['income'].le(99999), 'income_bracket'] = '<100000'

In [25]:
# income bracket 100,000-199,999 (income is an integer column; both bounds inclusive)
final_df.loc[final_df['income'].between(100000, 199999), 'income_bracket'] = '100000-199999'

In [26]:
# income bracket 200,000-299,999 (both bounds inclusive)
final_df.loc[final_df['income'].between(200000, 299999), 'income_bracket'] = '200000-299999'

In [27]:
# income bracket 300,000-399,999 (both bounds inclusive)
final_df.loc[final_df['income'].between(300000, 399999), 'income_bracket'] = '300000-399999'

In [28]:
# income bracket 400,000-499,999 (both bounds inclusive)
final_df.loc[final_df['income'].between(400000, 499999), 'income_bracket'] = '400000-499999'

In [31]:
# income bracket for incomes of 500,000 and above
# NOTE: the '>500000' label also covers an income of exactly 500,000 (the condition is >= 500000)
final_df.loc[final_df['income'].ge(500000), 'income_bracket'] = '>500000'

In [32]:
# Check income bracket flag
final_df['income_bracket'].value_counts().sort_index()

100000-199999    2779273
200000-299999      30633
300000-399999      10470
400000-499999       5627
<100000          3334720
>500000             3706
Name: income_bracket, dtype: int64

In [33]:
final_df[['income', 'income_bracket']].head()

Unnamed: 0,income,income_bracket
0,40423,<100000
1,40423,<100000
2,40423,<100000
3,40423,<100000
4,40423,<100000


## Parental Status Groups

In [34]:
# checking dependant numbers
final_df['n_dependants'].value_counts().sort_index()

0    1540064
1    1537718
2    1537957
3    1548690
Name: n_dependants, dtype: int64

In [35]:
# customers with at least one dependant are labelled as parents
final_df.loc[final_df['n_dependants'].ge(1), 'parental_status'] = 'Parent'

In [37]:
# customers with zero dependants are labelled as non-parents
final_df.loc[final_df['n_dependants'].eq(0), 'parental_status'] = 'Non-parent'

In [38]:
# check parental_status 
final_df['parental_status'].value_counts().sort_index()

Non-parent    1540064
Parent        4624365
Name: parental_status, dtype: int64

In [39]:
final_df[['n_dependants', 'parental_status']].head()

Unnamed: 0,n_dependants,parental_status
0,3,Parent
1,3,Parent
2,3,Parent
3,3,Parent
4,3,Parent


## Check data type of new variables

In [40]:
final_df[['age_range', 'income_bracket', 'parental_status']].dtypes

age_range          object
income_bracket     object
parental_status    object
dtype: object

In [41]:
# convert the three newly derived grouping columns from object to category to reduce memory use
derived_group_cols = ['age_range', 'income_bracket', 'parental_status']
final_df[derived_group_cols] = final_df[derived_group_cols].astype('category')

In [42]:
final_df[['age_range', 'income_bracket', 'parental_status']].dtypes

age_range          category
income_bracket     category
parental_status    category
dtype: object

## confirm existing variable types

In [43]:
final_df.dtypes

order_id                     int64
user_id                      int64
order_number                 int64
orders_day_of_week           int64
order_hour_of_day            int64
days_since_prior_order     float64
first_order                   bool
product_id                   int64
add_to_cart_order            int64
product_name                object
aisle_id                   float64
department_id              float64
prices                     float64
max_order                    int64
loyalty_flag                object
user_avg_price             float64
spending_habit              object
order_frequency             object
gender                      object
state                       object
age                          int64
date_joined                 object
n_dependants                 int64
fam_status                  object
income                       int64
_merge                    category
region                    category
exclusion_flag            category
age_range                 category
income_bracket            category
parental_status           category
dtype: object

### changing variable types to category to optimise memory

In [44]:
# convert existing text columns to category to reduce memory use
text_cols_to_categorise = ['gender', 'state', 'fam_status', 'spending_habit', 'product_name']
final_df[text_cols_to_categorise] = final_df[text_cols_to_categorise].astype('category')

In [45]:
final_df.dtypes

order_id                     int64
user_id                      int64
order_number                 int64
orders_day_of_week           int64
order_hour_of_day            int64
days_since_prior_order     float64
first_order                   bool
product_id                   int64
add_to_cart_order            int64
product_name              category
aisle_id                   float64
department_id              float64
prices                     float64
max_order                    int64
loyalty_flag                object
user_avg_price             float64
spending_habit            category
order_frequency             object
gender                    category
state                     category
age                          int64
date_joined                 object
n_dependants                 int64
fam_status                category
income                       int64
_merge                    category
region                    category
exclusion_flag            category
age_range                 category
income_bracket            category
parental_status           category
dtype: object

## Data Insights

In [46]:
# row counts per age group (rows) and income bracket (columns)
xtab_age_income = pd.crosstab(
    index=final_df['age_range'],
    columns=final_df['income_bracket'],
    dropna=False,
)

In [47]:
xtab_age_income

income_bracket,100000-199999,200000-299999,300000-399999,400000-499999,<100000,>500000
age_range,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20-29,26961,1323,2058,0,932943,0
30-39,28097,1847,1832,0,933434,0
40-49,607087,7994,1278,1843,353445,1115
50-59,661248,4973,2034,836,294299,691
60-69,653163,6367,1782,1584,284789,533
70-79,666785,6192,958,1103,286128,982
80+,130357,1628,120,261,62051,385
<19,5575,309,408,0,187631,0


In [48]:
# crosstab of age_range & parental_status variables

In [49]:
# row counts per age group (rows) and parental status (columns)
xtab_age_parent = pd.crosstab(
    index=final_df['age_range'],
    columns=final_df['parental_status'],
    dropna=False,
)

In [50]:
xtab_age_parent

parental_status,Non-parent,Parent
age_range,Unnamed: 1_level_1,Unnamed: 2_level_1
20-29,242693,720592
30-39,241279,723931
40-49,241531,731231
50-59,243290,720791
60-69,239844,708374
70-79,236229,725919
80+,50696,144106
<19,44502,149421


In [51]:
# crosstab of age_range & loyalty_flag variables
# row counts per age group (rows) and loyalty flag (columns)
xtab_age_loyalty = pd.crosstab(
    index=final_df['age_range'],
    columns=final_df['loyalty_flag'],
    dropna=False,
)

In [52]:
xtab_age_loyalty

loyalty_flag,Loyal customer,New customer,Regular customer
age_range,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20-29,316384,149837,497064
30-39,319343,147845,498022
40-49,324398,150160,498204
50-59,321099,152263,490719
60-69,314420,150235,483563
70-79,316223,150904,495021
80+,66544,29885,98373
<19,63109,29803,101011


In [53]:
# row counts per parental status (rows) and order frequency flag (columns)
xtab_parent_freq = pd.crosstab(
    index=final_df['parental_status'],
    columns=final_df['order_frequency'],
    dropna=False,
)

In [54]:
xtab_parent_freq

order_frequency,Frequent customer,Non-frequent customer
parental_status,Unnamed: 1_level_1,Unnamed: 2_level_1
Non-parent,937535,231979
Parent,2807614,700198


In [56]:
# row counts per age group (rows) and spending habit (columns)
xtab_age_spender = pd.crosstab(
    index=final_df['age_range'],
    columns=final_df['spending_habit'],
    dropna=False,
)

In [57]:
xtab_age_spender

spending_habit,Higher spender,Lower spender
age_range,Unnamed: 1_level_1,Unnamed: 2_level_1
20-29,12090,951195
30-39,12159,953051
40-49,13953,958809
50-59,12859,951222
60-69,12707,935511
70-79,12677,949471
80+,2499,192303
<19,2525,191398


In [59]:
# row counts per department (rows) and income bracket (columns)
# department_id is stored as float64, so the row labels display as 1.0, 2.0, ...
xtab_deptid_income = pd.crosstab(
    index=final_df['department_id'],
    columns=final_df['income_bracket'],
    dropna=False,
)

In [60]:
xtab_deptid_income

income_bracket,100000-199999,200000-299999,300000-399999,400000-499999,<100000,>500000
department_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.0,192449,2413,903,421,224482,243
2.0,3061,69,16,11,3659,2
3.0,102389,1538,534,255,117765,148
4.0,834318,5835,2124,1011,963955,857
5.0,12423,681,275,191,14881,109
6.0,23476,247,74,55,27142,38
7.0,218497,3085,967,544,287684,322
8.0,8710,250,77,59,9868,26
9.0,77077,813,309,133,85581,128
10.0,2845,6,0,0,3834,1


# Export DF

In [62]:
# save the dataframe, including the new grouping columns, next to the source file in 'Prepared Data'
export_pickle = os.path.join(path, '02 Data', 'Prepared Data', 'Instacart_final2.pkl')
final_df.to_pickle(export_pickle)