# Script that splits and flags customers based on loyalty, spending and frequency

# The script contains the following sections:
## 1. Importing Libraries
## 2. Importing Data
## 3. Deriving Columns
### 3a. Flagging number of orders
### 3b. Flagging spending (average item price)
### 3c. Flagging Frequency (median days between orders)
## 4. Exporting Data

# 1. Importing Libraries

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import os

# 2. Importing Data

In [2]:
# Folder Shortcut
path = r'C:\Users\seank\OneDrive\Dokumente\Career Foundry Data Analytics Course\Data Immersion\4 Python\03-2020_Instacart_Basket _Analysis'

In [3]:
# Importing Merged Data
ords_prods = pd.read_pickle(os.path.join(path, '02_Data', 'Prepared_Data', 'ords_prods_vars.pkl'))

# 3. Aggregating and Deriving Variables

In [4]:
# Average orders per user by department
ords_prods.groupby('department_id')['order_number'].mean().sort_values(ascending=False)

department_id
21    22.902379
10    20.197148
18    19.310397
4     17.811403
16    17.665606
2     17.277920
7     17.225802
19    17.177343
3     17.170395
14    16.773669
13    16.583536
20    16.473447
6     16.439806
11    16.170638
15    16.165037
9     15.895474
12    15.887671
17    15.694469
1     15.457838
8     15.340650
5     15.215751
Name: order_number, dtype: float64

# Flagging customers based on number of orders

In [5]:
# column with customer's max orders
ords_prods['max_order'] = ords_prods.groupby(['user_id'])['order_number'].transform(np.max)

  ords_prods['max_order'] = ords_prods.groupby(['user_id'])['order_number'].transform(np.max)


In [6]:
# Deriving 3-level 'loyalty_flag' var from max order score 
ords_prods.loc[ords_prods['max_order'] > 40, 'loyalty_flag'] = 'Loyal customer'
ords_prods.loc[(ords_prods['max_order'] <= 40) & (ords_prods['max_order'] > 10), 'loyalty_flag'] = 'Regular customer'
ords_prods.loc[ords_prods['max_order'] <= 10, 'loyalty_flag'] = 'New customer'

In [7]:
# Checking output
ords_prods[['user_id', 'order_id', 'max_order', 'loyalty_flag']].head(100)

Unnamed: 0,user_id,order_id,max_order,loyalty_flag
0,1,2539329,10,New customer
1,1,2539329,10,New customer
2,1,2539329,10,New customer
3,1,2539329,10,New customer
4,1,2539329,10,New customer
...,...,...,...,...
95,2,738281,14,Regular customer
96,2,1673511,14,Regular customer
97,2,1673511,14,Regular customer
98,2,1673511,14,Regular customer


In [40]:
ords_prods[['user_id', 'loyalty_flag']].drop_duplicates()['loyalty_flag'].value_counts()

loyalty_flag
New customer        112328
Regular customer     76864
Loyal customer       17017
Name: count, dtype: int64

Checking if loyal customers are buying products that are different prices than regular/new customers

In [10]:
# Retrieving average, min, max, 1,3 quartile prices by customer group
    # Defining 25th Percentile
def q25(x):
    return x.quantile(0.25)

    # 75th Percentile
def q75(x):
    return x.quantile(0.75)

# Group by customer group and return some main statisitics about product prices
ords_prods.groupby('loyalty_flag').agg({'prices': ['mean', 'min', 'max', q25, q75]})

Unnamed: 0_level_0,prices,prices,prices,prices,prices
Unnamed: 0_level_1,mean,min,max,q25,q75
loyalty_flag,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Loyal customer,7.773414,1.0,25.0,4.2,11.2
New customer,7.800896,1.0,25.0,4.2,11.3
Regular customer,7.797993,1.0,25.0,4.2,11.3


The products purchased by all customer groups have very similar average prices, as well as min, max and 1st and 3rd quartiles. The average price of the loyal customer group is actually slightly lower than the other groups

## Flagging customers as high/low spenders based on avg product price (<10<)

In [11]:
# Calculating average product price per customer
ords_prods['avg_price'] = ords_prods.groupby('user_id')['prices'].transform(np.mean)
ords_prods.head(100)

  ords_prods['avg_price'] = ords_prods.groupby('user_id')['prices'].transform(np.mean)


Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,department_id,prices,_merge,price_range_loc,busiest_day,busiest_days,Busiest_hours,max_order,loyalty_flag,avg_price
0,2539329,1,1,2,8,,196,1,0,Soda,...,7,9.0,both,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797
1,2539329,1,1,2,8,,14084,2,0,Organic Unsweetened Vanilla Almond Milk,...,16,12.5,both,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797
2,2539329,1,1,2,8,,12427,3,0,Original Beef Jerky,...,19,4.4,both,Low-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797
3,2539329,1,1,2,8,,26088,4,0,Aged White Cheddar Popcorn,...,19,4.7,both,Low-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797
4,2539329,1,1,2,8,,26405,5,0,XL Pick-A-Size Paper Towel Rolls,...,17,1.0,both,Low-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,738281,2,4,2,10,8.0,21150,13,0,Fire Grilled Steak Bowl,...,1,5.9,both,Mid-range product,Regularly busy,Regularly busy,Most orders,14,Regular customer,7.515897
96,1673511,2,5,3,11,8.0,47144,1,0,Unsweetened Original Almond Breeze Almond Milk,...,16,14.0,both,Mid-range product,Regularly busy,Least busy days,Most orders,14,Regular customer,7.515897
97,1673511,2,5,3,11,8.0,5322,2,0,Gluten Free Dark Chocolate Chunk Chewy with a ...,...,19,2.9,both,Low-range product,Regularly busy,Least busy days,Most orders,14,Regular customer,7.515897
98,1673511,2,5,3,11,8.0,17224,3,0,Oats & Honey Gluten Free Granola,...,19,1.6,both,Low-range product,Regularly busy,Least busy days,Most orders,14,Regular customer,7.515897


In [12]:
# Assigning customers as high or low spenders 
ords_prods.loc[ords_prods['avg_price'] < 10, 'spending_flag'] = 'Low spender'
ords_prods.loc[ords_prods['avg_price'] >= 10, 'spending_flag'] = 'High spender'

In [13]:
ords_prods[['user_id', 'order_id', 'avg_price', 'spending_flag']].head(100)    # checking output

Unnamed: 0,user_id,order_id,avg_price,spending_flag
0,1,2539329,6.367797,Low spender
1,1,2539329,6.367797,Low spender
2,1,2539329,6.367797,Low spender
3,1,2539329,6.367797,Low spender
4,1,2539329,6.367797,Low spender
...,...,...,...,...
95,2,738281,7.515897,Low spender
96,2,1673511,7.515897,Low spender
97,2,1673511,7.515897,Low spender
98,2,1673511,7.515897,Low spender


In [None]:
# Counting values, making sure to first group by users so that's what's being counted
ords_prods[['user_id', 'spending_flag']].drop_duplicates()['spending_flag'].value_counts()

spending_flag
Low spender     202821
High spender      3388
Name: count, dtype: int64

There are a lot more low spenders than high spenders

# Flagging customers' order frequency based on median days between orders

In [16]:
# Median days between orders for each customer
ords_prods['median_days_tween_orders'] = ords_prods.groupby('user_id')['days_since_prior_order'].transform('median')

In [17]:
# Check output
ords_prods[['user_id', 'order_id', 'median_days_tween_orders']]

Unnamed: 0,user_id,order_id,median_days_tween_orders
0,1,2539329,20.5
1,1,2539329,20.5
2,1,2539329,20.5
3,1,2539329,20.5
4,1,2539329,20.5
...,...,...,...
32404854,206209,2977660,22.0
32404855,206209,2977660,22.0
32404856,206209,2977660,22.0
32404857,206209,2977660,22.0


In [18]:
# Checking if all users only have 1 value for new variable
ords_prods.groupby('user_id')['median_days_tween_orders'].nunique().value_counts()

median_days_tween_orders
1    206208
0         1
Name: count, dtype: int64

In [19]:
# Finding this user with the 0
problem_user = ords_prods.groupby('user_id')['median_days_tween_orders'].nunique()
problem_user = problem_user[problem_user == 0]
print(problem_user.index)

Index([159838], dtype='int64', name='user_id')


In [20]:
ords_prods[ords_prods['user_id'].isin(problem_user.index)]
# They only have one order, hence NaN for days_since_prior_order

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,_merge,price_range_loc,busiest_day,busiest_days,Busiest_hours,max_order,loyalty_flag,avg_price,spending_flag,median_days_tween_orders
25156870,895835,159838,1,0,17,,22198,1,0,4X Ultra Concentrated Natural Laundry Detergen...,...,both,Low-range product,Busiest day,Busiest days,Average orders,1,New customer,7.42,Low spender,
25156871,895835,159838,1,0,17,,23695,2,0,California Veggie Burger,...,both,Low-range product,Busiest day,Busiest days,Average orders,1,New customer,7.42,Low spender,
25156872,895835,159838,1,0,17,,10749,3,0,Organic Red Bell Pepper,...,both,Mid-range product,Busiest day,Busiest days,Average orders,1,New customer,7.42,Low spender,
25156873,895835,159838,1,0,17,,21334,5,0,Organic Peeled Garlic,...,both,Mid-range product,Busiest day,Busiest days,Average orders,1,New customer,7.42,Low spender,
25156874,895835,159838,1,0,17,,33401,6,0,Goat Cheese Crumbles,...,both,Mid-range product,Busiest day,Busiest days,Average orders,1,New customer,7.42,Low spender,


In [21]:
# Assigning flag based on median rules
ords_prods.loc[ords_prods['median_days_tween_orders'] > 20, 'frequency_flag'] = 'Non-frequent customer'
ords_prods.loc[(ords_prods['median_days_tween_orders'] > 10) & (ords_prods['median_days_tween_orders'] <= 20), 'frequency_flag'] = 'Regular customer'
ords_prods.loc[ords_prods['median_days_tween_orders'] <= 10, 'frequency_flag'] = 'Frequent customer'



In [22]:
# Checking frequencies
ords_prods['frequency_flag'].value_counts()

frequency_flag
Frequent customer        21559853
Regular customer          7208564
Non-frequent customer     3636437
Name: count, dtype: int64

These are counts of the rows in the df (products within an order), whereas I want to count users

In [35]:
# Grouping by users
ords_prods[['user_id', 'frequency_flag']].drop_duplicates()['frequency_flag'].value_counts()

frequency_flag
Frequent customer        86596
Regular customer         59993
Non-frequent customer    59619
Name: count, dtype: int64

In [37]:
# Since so many frequent customers, checking if these are also customers with low amount of orders
    # Counting unique users by creating df with one user per row and their loyalty and frequency flag
user_freq_loyalty = ords_prods.groupby(['user_id', 'frequency_flag', 'loyalty_flag']).size().reset_index().drop(columns=0)

In [38]:
user_freq_loyalty[['frequency_flag', 'loyalty_flag']].value_counts().reset_index()

Unnamed: 0,frequency_flag,loyalty_flag,count
0,Non-frequent customer,New customer,52579
1,Frequent customer,Regular customer,40735
2,Regular customer,New customer,30895
3,Regular customer,Regular customer,29089
4,Frequent customer,New customer,28853
5,Frequent customer,Loyal customer,17008
6,Non-frequent customer,Regular customer,7040
7,Regular customer,Loyal customer,9


Even though frequent customers is the largest group, the loyal frequent customers is the smallest segment of this 

# 4. Exporting Data

In [22]:
# Exporting as Pickle
ords_prods.to_pickle(os.path.join(path, '02_Data', 'Prepared_Data', 'ords_prods_vars_flags.pkl'))