### This script contains:

#### 1. Import libraries & datasets
#### 2. Determine the max orders and create a max_order flag
#### 3. Create a loyalty flag for existing customers
#### 4. Create a spend_group flag
#### 5. Create a order_frequency_flag
#### 6. Exporting final dataframe

### Importing libraries and datasets

In [1]:
#Import libraries 
import pandas as pd
import numpy as np
import os

In [2]:
#Folder path into usable string
path = r'C:\Users\willm\Instacart Basket Analysis'

In [3]:
#Import Orders_wrangled Dataset
ords_prods_merged = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_hourly_daily.pkl'))

### Determining the max orders and creating a max_order flag

In [4]:
# Creating a max_order flag
ords_prods_merged['max_order'] = ords_prods_merged.groupby(['user_id'])['order_number'].transform(np.max)

In [5]:
ords_prods_merged.head(15)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day,busiest_period_of_day,max_order
0,2539329,1,1,2,8,,First Order,196,1,0,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Average orders,10
1,2398795,1,2,3,7,15.0,Repeat Customer,196,1,1,Soda,77,7,9.0,both,Mid-range product,Slowest days,Average orders,10
2,473747,1,3,3,12,21.0,Repeat Customer,196,1,1,Soda,77,7,9.0,both,Mid-range product,Slowest days,Most orders,10
3,2254736,1,4,4,7,29.0,Repeat Customer,196,1,1,Soda,77,7,9.0,both,Mid-range product,Slowest days,Average orders,10
4,431534,1,5,4,15,28.0,Repeat Customer,196,1,1,Soda,77,7,9.0,both,Mid-range product,Slowest days,Most orders,10
5,3367565,1,6,2,7,19.0,Repeat Customer,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Average orders,10
6,550135,1,7,1,9,20.0,Repeat Customer,196,1,1,Soda,77,7,9.0,both,Mid-range product,Busiest days,Average orders,10
7,3108588,1,8,1,14,14.0,Repeat Customer,196,2,1,Soda,77,7,9.0,both,Mid-range product,Busiest days,Most orders,10
8,2295261,1,9,1,16,0.0,Repeat Customer,196,4,1,Soda,77,7,9.0,both,Mid-range product,Busiest days,Most orders,10
9,2550362,1,10,4,8,30.0,Repeat Customer,196,1,1,Soda,77,7,9.0,both,Mid-range product,Slowest days,Average orders,10


### Create a loyalty flag for existing customers

In [6]:
#Creating a new column to display one of three flags, depending on ammount of orders a customer has made
ords_prods_merged.loc[ords_prods_merged['max_order'] > 40, 'loyalty_flag'] = 'Loyal customer'
ords_prods_merged.loc[(ords_prods_merged['max_order'] <= 40) & (ords_prods_merged['max_order'] > 10), 'loyalty_flag'] = 'Regular customer'
ords_prods_merged.loc[ords_prods_merged['max_order'] <= 10, 'loyalty_flag'] = 'New customer'

In [7]:
#Checking the frequency of the flags
ords_prods_merged['loyalty_flag'].value_counts(dropna = False)

Regular customer    15874128
Loyal customer      10282763
New customer         6242841
Name: loyalty_flag, dtype: int64

In [8]:
#Requesting to just view 3 columns to see the new flag working
ords_prods_merged[['user_id', 'loyalty_flag', 'order_number']].head(20)

Unnamed: 0,user_id,loyalty_flag,order_number
0,1,New customer,1
1,1,New customer,2
2,1,New customer,3
3,1,New customer,4
4,1,New customer,5
5,1,New customer,6
6,1,New customer,7
7,1,New customer,8
8,1,New customer,9
9,1,New customer,10


#### The marketing team at Instacart wants to know whether there’s a difference between the spending habits of the three types of customers you identified. Use the loyalty flag you created and check the basic statistics of the product prices for each loyalty category (Loyal Customer, Regular Customer, and New Customer). What you’re trying to determine is whether the prices of products purchased by loyal customers differ from those purchased by regular or new customers.

In [9]:
#Running a GroupBy on the loyalty flag column, and checking the Mean of the Prices column 
# - I am also requesting the Min and Max to spot any anomalies
ords_prods_merged.groupby('loyalty_flag').agg({'prices': ['mean', 'min', 'max']})

Unnamed: 0_level_0,prices,prices,prices
Unnamed: 0_level_1,mean,min,max
loyalty_flag,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Loyal customer,7.773575,1.0,25.0
New customer,7.801206,1.0,25.0
Regular customer,7.798262,1.0,25.0


### Creating a spend_group flag

In [10]:
#Creating a new column to classify spend groupings
ords_prods_merged['spend_group'] = ords_prods_merged.groupby(['user_id'])['prices'].transform(np.mean)

In [11]:
#Creating a new column to display one of three flags, depending on the average price of a customers item spend
ords_prods_merged.loc[ords_prods_merged['spend_group'] >= 10, 'spend_flag'] = 'High spender'
ords_prods_merged.loc[ords_prods_merged['spend_group'] < 10, 'spend_flag'] = 'Low spender'

In [12]:
#Checking the frequency of the flags
ords_prods_merged['spend_flag'].value_counts(dropna = False)

Low spender     32280013
High spender      119719
Name: spend_flag, dtype: int64

In [13]:
#Checking the new columns
ords_prods_merged[['user_id', 'spend_group', 'spend_flag']].head(15)

Unnamed: 0,user_id,spend_group,spend_flag
0,1,6.367797,Low spender
1,1,6.367797,Low spender
2,1,6.367797,Low spender
3,1,6.367797,Low spender
4,1,6.367797,Low spender
5,1,6.367797,Low spender
6,1,6.367797,Low spender
7,1,6.367797,Low spender
8,1,6.367797,Low spender
9,1,6.367797,Low spender


### Creating a order_frequency_flag

In [14]:
#Creating a new column to classify days since order groupings
ords_prods_merged['days_since_last_order_median'] = ords_prods_merged.groupby(['user_id'])['days_since_prior_order'].transform(np.mean).round(2)

In [15]:
#Creating a new column to display one of three flags, depending on the average price of a customers item spend
ords_prods_merged.loc[ords_prods_merged['days_since_last_order_median'] > 20, 'order_frequency_flag'] = 'Non-frequent customer'
ords_prods_merged.loc[(ords_prods_merged['days_since_last_order_median'] <= 20) & (ords_prods_merged['days_since_last_order_median'] > 10), 'order_frequency_flag'] = 'Regular customer'
ords_prods_merged.loc[ords_prods_merged['days_since_last_order_median'] <= 10, 'order_frequency_flag'] = 'Frequent customer'

In [16]:
#Checking the frequency of the flags
ords_prods_merged['days_since_last_order_median'].value_counts(dropna = False)

30.00    279549
7.00      50715
7.33      43352
7.06      40141
6.86      40110
          ...  
0.89         10
0.17          8
0.57          8
0.60          7
NaN           5
Name: days_since_last_order_median, Length: 2939, dtype: int64

### Exporting the dataframe

In [17]:
#Exporting data to pkl
ords_prods_merged.to_pickle(os.path.join(path, '02 Data','Prepared Data', '4-8.pkl'))