# Contents 

### 01. Importing libraries and data
### 02. Customer profiles
### 03. Export data


## Importing libraries and data

In [1]:
# Import libraries 
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
# Load the prepared active-customers dataframe from its pickle file.
# NOTE(review): this is an absolute, machine-specific path; it repeats the project
# folder that is also stored in the `path` variable in the next cell.
active_customers_file = r'C:\Users\David\Desktop\Instacart Basket Analysis\02 Data\Prepared Data\active_customers.pkl'
df_final2 = pd.read_pickle(active_customers_file)

In [3]:
# Project root folder; later cells join sub-folder and file names onto it.
path = r'C:\Users\David\Desktop\Instacart Basket Analysis'

### Customer profiles 

#### Profiles based on age

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the 'Age' column
df_final2.loc[:, 'Age'].describe()

count    3.096456e+07
mean     4.946803e+01
std      1.848528e+01
min      1.800000e+01
25%      3.300000e+01
50%      4.900000e+01
75%      6.500000e+01
max      8.100000e+01
Name: Age, dtype: float64

The minimum age is 18, and the max age is 81. We can group customers by age group as follows:

18 - 35 (Young Adults)

36 - 60 (Middle-aged Adults)

61 and older (Senior Citizens)

In [5]:
 # Age category column
# Label every row with one of three age bands, from youngest to oldest.
# The three conditions are mutually exclusive; a row whose Age satisfies none of
# them (e.g. a missing value) is not assigned a label by these statements.
age = df_final2['Age']
df_final2.loc[age.le(35), 'age_category'] = 'Young Adults'
df_final2.loc[age.gt(35) & age.le(60), 'age_category'] = 'Middle-aged Adults'
df_final2.loc[age.gt(60), 'age_category'] = 'Senior Citizens'

In [6]:
# Count rows per age band; dropna=False also shows rows that received no label
df_final2.loc[:, 'age_category'].value_counts(dropna=False)

Middle-aged Adults    12113152
Senior Citizens       10112607
Young Adults           8738805
Name: age_category, dtype: int64

In [7]:
# Show (rows, columns) of df_final2 after the 'age_category' column was added
df_final2.shape

(30964564, 37)

#### Profiles based on income 

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the 'income' column
df_final2.loc[:, 'income'].describe()

count    3.096456e+07
mean     9.967587e+04
std      4.314187e+04
min      2.590300e+04
25%      6.729200e+04
50%      9.676500e+04
75%      1.281020e+05
max      5.939010e+05
Name: income, dtype: float64

In [9]:
# Create the 'income_class' flag from the 'income' column.
# Bands (both edges of the middle band are inclusive):
#   income <  100000             -> 'Lower-income'
#   100000 <= income <= 200000   -> 'Middle-income'
#   income >  200000             -> 'Upper_income'
# The lowest band deliberately has no floor, so an income smaller than the current
# minimum is still classified instead of being left unlabelled (NaN).
# Rows with a missing income satisfy none of the conditions and receive no label.
# NOTE(review): 'Upper_income' uses an underscore while the other two labels use a
# hyphen. The label is kept as-is because this column is part of the exported
# dataframe -- confirm nothing downstream depends on it before renaming.
df_final2.loc[df_final2['income'] < 100000, 'income_class'] = 'Lower-income'
df_final2.loc[(df_final2['income'] >= 100000) & (df_final2['income'] <= 200000), 'income_class'] = 'Middle-income'
df_final2.loc[df_final2['income'] > 200000, 'income_class'] = 'Upper_income'

#### Profiles based on department_id

In [10]:
# Load the wrangled departments lookup table from the Prepared Data folder
departments_file = os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv')
df_dept = pd.read_csv(departments_file)

In [11]:
# Display the departments table as loaded (the id column is still named 'Unnamed: 0')
df_dept

Unnamed: 0.1,Unnamed: 0,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol
5,6,international
6,7,beverages
7,8,pets
8,9,dry goods pasta
9,10,bulk


In [12]:
# Give the unnamed id column a proper name so it can serve as the merge key.
# Reassigning (rather than renaming in place) keeps the cell safe to re-run.
df_dept = df_dept.rename(columns={'Unnamed: 0': 'department_id'})

In [13]:
# Display the departments table again to confirm the column is now 'department_id'
df_dept

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol
5,6,international
6,7,beverages
7,8,pets
8,9,dry goods pasta
9,10,bulk


In [14]:
# Merge the departments lookup onto the orders/customers data via 'department_id'.
# validate='many_to_one' makes pandas raise a MergeError if 'department_id' is not
# unique in df_dept, which would otherwise silently duplicate order rows.
df_merged = df_final2.merge(df_dept, on = 'department_id', validate = 'many_to_one')

# The merge is an inner join (the pandas default), so any row whose department_id is
# absent from df_dept would be dropped without warning. Fail loudly if that happens.
assert len(df_merged) == len(df_final2), (
    'Merge dropped rows: some department_id values in df_final2 have no match in df_dept'
)

In [15]:
# Peek at the first five rows of the key columns after the merge
df_merged.loc[:, ['user_id', 'department_id']].head(n=5)

Unnamed: 0,user_id,department_id
0,1,7
1,1,7
2,1,7
3,1,7
4,1,7


In [16]:
# List all column names of the merged dataframe (the new 'department' column is last)
df_merged.columns

Index(['order_id', 'user_id', 'eval_set', 'order_number', 'orders_day_of_week',
       'order_hour', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', '_merge', 'product_name', 'aisle_id',
       'department_id', 'prices', ' price_range_loc', 'price_range_loc',
       'busiest_day', 'busiest_days', 'busiest_period_of_day', 'max_order',
       'loyalty_flag', 'mean_spend', 'spender_flag', 'median_order_frequency',
       'order_frequency_flag', 'Unnamed: 0', 'Gender', 'STATE', 'Age',
       'date_joined', 'dependents', 'fam_status', 'income', 'region',
       'customer_activity', 'age_category', 'income_class', 'department'],
      dtype='object')

#### Profiles based on dependents 

In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) for the 'dependents' column
df_merged.loc[:, 'dependents'].describe()

count    3.096456e+07
mean     1.501819e+00
std      1.118896e+00
min      0.000000e+00
25%      1.000000e+00
50%      2.000000e+00
75%      3.000000e+00
max      3.000000e+00
Name: dependents, dtype: float64

In [20]:
# Count rows per marital/family status; dropna=False also shows missing values
df_merged.loc[:, 'fam_status'].value_counts(dropna=False)

married                             21743711
single                               5094410
divorced/widowed                     2645271
living with parents and siblings     1481172
Name: fam_status, dtype: int64

In [24]:
# Build the 'customer_profile' flag from family status and number of dependents.
#   not married, 0 dependents   -> 'Single adult'
#   not married, >0 dependents  -> 'Young parent'
#   married, <=2 dependents     -> 'Family'
#   married, >2 dependents      -> 'Big Family'
# "Not married" means one of the three statuses listed below. The four conditions are
# mutually exclusive; rows matching none of them receive no label.
unmarried_statuses = ['divorced/widowed', 'single', 'living with parents and siblings']
is_unmarried = df_merged['fam_status'].isin(unmarried_statuses)
is_married = df_merged['fam_status'] == 'married'
n_dependents = df_merged['dependents']

df_merged.loc[is_unmarried & (n_dependents == 0), 'customer_profile'] = 'Single adult'
df_merged.loc[is_unmarried & (n_dependents > 0), 'customer_profile'] = 'Young parent'
df_merged.loc[is_married & (n_dependents <= 2), 'customer_profile'] = 'Family'
df_merged.loc[is_married & (n_dependents > 2), 'customer_profile'] = 'Big Family'

In [26]:
# Inspect the first 30 rows of the inputs next to the derived profile label
df_merged.loc[:, ['fam_status', 'dependents', 'customer_profile']].head(n=30)

Unnamed: 0,fam_status,dependents,customer_profile
0,married,3,Big Family
1,married,3,Big Family
2,married,3,Big Family
3,married,3,Big Family
4,married,3,Big Family
5,married,3,Big Family
6,married,3,Big Family
7,married,3,Big Family
8,married,3,Big Family
9,married,3,Big Family


In [27]:
# Count rows per customer profile; dropna=False also shows rows left without a label
df_merged['customer_profile'].value_counts(dropna = False)

Family          14459664
Single adult     7739681
Big Family       7284047
Young parent     1481172
Name: customer_profile, dtype: int64

## Export data 

In [29]:
# Write the merged, profiled dataframe to the Prepared Data folder as a pickle file
export_file = os.path.join(path, '02 Data', 'Prepared Data', 'active_customers_orders_products_depts.pkl')
df_merged.to_pickle(export_file)