# 4.10 Coding Etiquette & Excel Reporting

## This script contains the following points:

### 1. Import Libraries

### 2. Import Data

### 3. Create Customer Profiles

#### 3.1 Create Demographic Profile - age, family status and number of dependents

#### 3.2 Create Diet/Purchasing Profile - based on 'department_id'

#### 3.3 Create Income Profile - based on 'income'

### 4. Optimise Data Types

### 5. Export Data

# 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy


# 2. Import Data

In [2]:
# Creating path string to the project folder; the data paths used for import and export are built from it
# NOTE(review): absolute, machine-specific Windows path - adjust it before running on another machine
path = r'C:\Users\Admin\Desktop\Instacart Basket Analysis'

In [3]:
# Load the prepared active-customer sample from the '02 Data/Prepared Data' folder
sample_csv = os.path.join(path, '02 Data', 'Prepared Data', 'active_customer_sample.csv')
active_customer = pd.read_csv(sample_csv)

In [4]:
# Check shape of df as (rows, columns)
active_customer.shape

(15329101, 31)

In [5]:
# Check head of df (head() shows the first five rows by default)
active_customer.head()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,hour_order_placed,days_since_prior_order,product_id,add_to_cart_order,reordered,...,frequency_flag,Gender,State,Age,date_joined,no_of_dependents,fam_status,income,Region,low_activity_flag
0,0,2539329,1,1,2,8,,196,1.0,0,...,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer
1,1,2398795,1,2,3,7,15.0,196,1.0,1,...,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer
2,2,473747,1,3,3,12,21.0,196,1.0,1,...,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer
3,3,2254736,1,4,4,7,29.0,196,1.0,1,...,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer
4,4,431534,1,5,4,15,28.0,196,1.0,1,...,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer


In [6]:
# Check column labels, e.g. to spot a leftover unnamed index column from an earlier export
active_customer.columns

Index(['Unnamed: 0', 'order_id', 'user_id', 'order_number',
       'orders_day_of_week', 'hour_order_placed', 'days_since_prior_order',
       'product_id', 'add_to_cart_order', 'reordered', 'product_name',
       'aisle_id', 'department_id', 'prices', 'busiest_days',
       'busiest_period_of_day', 'max_order', 'loyalty_flag',
       'mean_product_price', 'spending_flag', 'median_days_since_prior_order',
       'frequency_flag', 'Gender', 'State', 'Age', 'date_joined',
       'no_of_dependents', 'fam_status', 'income', 'Region',
       'low_activity_flag'],
      dtype='object')

In [7]:
# Remove the leftover 'Unnamed: 0' column from the df
active_customer = active_customer.drop('Unnamed: 0', axis = 1)

# 3. Create Customer Profiles

In [8]:
## Explore column values to inform customer profiles
# Frequency check of 'fam_status' (dropna = False so missing values are counted too)
active_customer['fam_status'].value_counts(dropna = False)

married                             10762710
single                               2526935
divorced/widowed                     1309891
living with parents and siblings      729565
Name: fam_status, dtype: int64

In [9]:
# Frequency check of 'no_of_dependents' (missing values counted too)
active_customer['no_of_dependents'].value_counts(dropna = False)

3    3851783
0    3836826
2    3825575
1    3814917
Name: no_of_dependents, dtype: int64

In [10]:
# Print descriptive statistics of the 'Age' column
active_customer['Age'].describe()

count    1.532910e+07
mean     4.946328e+01
std      1.847878e+01
min      1.800000e+01
25%      3.300000e+01
50%      4.900000e+01
75%      6.500000e+01
max      8.100000e+01
Name: Age, dtype: float64

#### 3.1 Create Demographic Profile - age, family status and number of dependents

In [12]:
# Create the demographic 'customer_profile' flag from age, family status and number of dependents.
# Every label has the form '<age band> <household type> <kids flag>', e.g. 'Young Individual without Kids'.
# The component masks are computed once and combined in a loop, replacing 24 near-identical
# loc statements; rows that match no combination keep NaN in 'customer_profile'.

# Age bands: under 35, 35 to 59, 60 and over
age_masks = {
    'Young': active_customer['Age'] < 35,
    'Middle Aged': (active_customer['Age'] >= 35) & (active_customer['Age'] < 60),
    'Older': active_customer['Age'] >= 60,
}

# Household type: the three non-married statuses are grouped as 'Individual'
household_masks = {
    'Individual': active_customer['fam_status'].isin(
        ['single', 'divorced/widowed', 'living with parents and siblings']
    ),
    'Married': active_customer['fam_status'] == 'married',
}

# Kids flag: no dependents versus at least one dependent
kids_masks = {
    'without Kids': active_customer['no_of_dependents'] == 0,
    'with Kids': active_customer['no_of_dependents'] > 0,
}

# The masks within each group are mutually exclusive, so assignment order does not matter
for age_label, age_mask in age_masks.items():
    for household_label, household_mask in household_masks.items():
        for kids_label, kids_mask in kids_masks.items():
            profile_mask = age_mask & household_mask & kids_mask
            active_customer.loc[profile_mask, 'customer_profile'] = f'{age_label} {household_label} {kids_label}'

In [36]:
# Frequency check of 'customer_profile' (dropna = False so unflagged rows would show up as NaN)
active_customer['customer_profile'].value_counts(dropna = False)

Middle Aged Married with Kids          4510509
Older Married with Kids                3922453
Young Married with Kids                2329748
Middle Aged Individual without Kids    1504730
Older Individual without Kids          1309891
Young Individual without Kids          1022205
Young Individual with Kids              729565
Name: customer_profile, dtype: int64

#### 3.2 Create Diet/Purchasing Profile - based on 'department_id'

In [38]:
# Create new object 'result' holding one purchasing-profile label per row, based on department id's.
# A vectorised Series.map replaces the row-by-row Python loop over the whole df.
department_profiles = {
    12: 'Meat Eater',
    8: 'Pet Owner',
    5: 'Alcohol Drinker',
    18: 'Young Child/Children',
    10: 'Purchases in Volume',
}

# Department ids that are not in the mapping come back as NaN and are labelled 'No Profile';
# tolist() keeps 'result' a plain list, as the original loop produced
result = active_customer['department_id'].map(department_profiles).fillna('No Profile').tolist()

In [39]:
# Assign result to new column 'Purchasing Profile' (result holds one label per row, in row order)
active_customer['Purchasing Profile'] = result

In [40]:
# Frequency check of the new 'Purchasing Profile' column (missing values counted too)
active_customer['Purchasing Profile'].value_counts(dropna = False)

No Profile              14956516
Meat Eater                287460
Alcohol Drinker            41620
Young Child/Children       37621
Pet Owner                   5884
Name: Purchasing Profile, dtype: int64

#### 3.3 Create Income Profile - based on 'income'

In [41]:
# Check descriptive statistics of 'income' to inform the income band thresholds
active_customer['income'].describe()

count    1.532910e+07
mean     9.988454e+04
std      4.166126e+04
min      2.590300e+04
25%      6.821000e+04
50%      9.714100e+04
75%      1.281770e+05
max      5.939010e+05
Name: income, dtype: float64

In [42]:
# Create 'income_profile' with three bands (lower, regular and higher income);
# this cell flags incomes of 68,000 or less as 'Lower income'
is_lower_income = active_customer['income'].le(68000)
active_customer.loc[is_lower_income, 'income_profile'] = 'Lower income'

In [43]:
# Flag incomes above 68,000 and up to 128,000 as 'Regular income'
# NOTE: the column label below ends with a space ('income_profile '), so the flag is written to a
# separate column rather than to 'income_profile'
active_customer.loc[(active_customer['income'] > 68000) & (active_customer['income'] <= 128000), 'income_profile '] = 'Regular income'

In [44]:
# Flag incomes above 128,000 as 'Higher income'
is_higher_income = active_customer['income'].gt(128000)
active_customer.loc[is_higher_income, 'income_profile'] = 'Higher income'

In [45]:
# Frequency check of 'income_profile' (dropna = False so rows without a flag show up as NaN)
active_customer['income_profile'].value_counts(dropna = False)

NaN              7676860
Higher income    3847580
Lower income     3804661
Name: income_profile, dtype: int64

In [46]:
# Re-apply the 'Regular income' flag, this time to the correctly named 'income_profile' column
above_lower_band = active_customer['income'].gt(68000)
within_regular_band = active_customer['income'].le(128000)
active_customer.loc[above_lower_band & within_regular_band, 'income_profile'] = 'Regular income'

In [47]:
# Frequency check of 'income_profile' again to confirm every row now has an income band
active_customer['income_profile'].value_counts(dropna = False)

Regular income    7676860
Higher income     3847580
Lower income      3804661
Name: income_profile, dtype: int64

In [48]:
# Check column labels after adding the profile columns
active_customer.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'hour_order_placed', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'aisle_id',
       'department_id', 'prices', 'busiest_days', 'busiest_period_of_day',
       'max_order', 'loyalty_flag', 'mean_product_price', 'spending_flag',
       'median_days_since_prior_order', 'frequency_flag', 'Gender', 'State',
       'Age', 'date_joined', 'no_of_dependents', 'fam_status', 'income',
       'Region', 'low_activity_flag', 'customer_profile', 'Purchasing Profile',
       'income_profile', 'income_profile '],
      dtype='object')

In [49]:
# Remove the 'income_profile ' column (label with trailing space) that was created in error
active_customer = active_customer.drop('income_profile ', axis = 1)

In [51]:
# Check data types of df to identify columns that can be stored in smaller dtypes
active_customer.dtypes

order_id                           int64
user_id                            int64
order_number                       int64
orders_day_of_week                 int64
hour_order_placed                  int64
days_since_prior_order           float64
product_id                         int64
add_to_cart_order                float64
reordered                          int64
product_name                      object
aisle_id                           int64
department_id                      int64
prices                           float64
busiest_days                      object
busiest_period_of_day             object
max_order                          int64
loyalty_flag                      object
mean_product_price               float64
spending_flag                     object
median_days_since_prior_order    float64
frequency_flag                    object
Gender                            object
State                             object
Age                                int64
date_joined     

# 4. Optimise Data Types

In [52]:
# Downcast numeric columns to narrower dtypes to reduce the memory footprint of the df.

# Integer columns: pd.to_numeric picks the narrowest integer dtype that still holds every value,
# so a column whose range does not fit the smallest type is given a wider one instead of
# silently wrapping around, as a fixed astype('int8') / astype('uint8') cast would.
signed_int_columns = ['orders_day_of_week', 'hour_order_placed', 'reordered', 'aisle_id', 'department_id']
unsigned_int_columns = ['product_id', 'max_order']

for column in signed_int_columns:
    active_customer[column] = pd.to_numeric(active_customer[column], downcast = 'integer')

for column in unsigned_int_columns:
    active_customer[column] = pd.to_numeric(active_customer[column], downcast = 'unsigned')

# Float columns: the day-count and cart-position columns keep float16 as before; the price
# columns use float32 because float16 retains only about three significant decimal digits,
# which rounds prices and their means.
# NOTE(review): float16 is exact only for whole/half values up to 2048 - confirm the ranges
float_dtypes = {
    'days_since_prior_order': 'float16',
    'add_to_cart_order': 'float16',
    'median_days_since_prior_order': 'float16',
    'prices': 'float32',
    'mean_product_price': 'float32',
}
active_customer = active_customer.astype(float_dtypes)

# 5. Export Data

In [65]:
# Export the df with the new profile columns to the Prepared Data folder as csv
# NOTE(review): index=False is not passed, so the row index is written as an extra unnamed first
# column; csv also stores no dtype information, so the reduced dtypes are not kept in the file
active_customer.to_csv(os.path.join(path,'02 Data','Prepared Data','active_customer_profiles.csv'))