# 4.10 Part 1 - Finalise database for customer segmentation

### This script contains the following points:

#### 1. Import libraries
#### 2. Import data
#### 3. Data security
#### 4. Create regional segmentation
#### 5. Create exclusion flag for low-activity customers
#### 6. Export dataframes

# Import Libraries

In [2]:
import pandas as pd
import numpy as np
import os

# Import data

In [3]:
# project folder path: root directory that the data import and export
# paths in this notebook are joined onto via os.path.join

path = r'C:\Users\Odette\Desktop\CareerFoundry\Immersion Courses\Course 4\Instacart Basket Analysis'

In [4]:
# load the prepared orders_products dataframe from its pickle file

orders_products_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_final.pkl')
final_df = pd.read_pickle(orders_products_file)

In [4]:
# preview the first five rows of the imported dataframe
final_df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,product_name,...,median_order_days,order_frequency,gender,state,age,date_joined,n_dependants,fam_status,income,_merge
0,2539329,1,1,2,8,0,196,1,False,Soda,...,20.0,Regular customer,Female,Alabama,31,2019-02-17,3,married,40423,both
1,2398795,1,2,3,7,15,196,1,True,Soda,...,20.0,Regular customer,Female,Alabama,31,2019-02-17,3,married,40423,both
2,473747,1,3,3,12,21,196,1,True,Soda,...,20.0,Regular customer,Female,Alabama,31,2019-02-17,3,married,40423,both
3,2254736,1,4,4,7,29,196,1,True,Soda,...,20.0,Regular customer,Female,Alabama,31,2019-02-17,3,married,40423,both
4,431534,1,5,4,15,28,196,1,True,Soda,...,20.0,Regular customer,Female,Alabama,31,2019-02-17,3,married,40423,both


In [5]:
# preview the last five rows of the imported dataframe
final_df.tail()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,product_name,...,median_order_days,order_frequency,gender,state,age,date_joined,n_dependants,fam_status,income,_merge
32404854,156685,106143,26,4,23,5,19675,1,True,Organic Raspberry Black Tea,...,6.5,Frequent customer,Male,Hawaii,25,2017-05-26,0,single,53755,both
32404855,484769,66343,1,6,11,0,47210,1,False,Fresh Farmed Tilapia Fillet,...,24.5,Non-frequent customer,Female,Tennessee,22,2017-09-12,3,married,46151,both
32404856,1561557,66343,2,1,11,30,47210,1,True,Fresh Farmed Tilapia Fillet,...,24.5,Non-frequent customer,Female,Tennessee,22,2017-09-12,3,married,46151,both
32404857,276317,66343,3,6,15,19,47210,1,True,Fresh Farmed Tilapia Fillet,...,24.5,Non-frequent customer,Female,Tennessee,22,2017-09-12,3,married,46151,both
32404858,2922475,66343,4,1,12,30,47210,1,True,Fresh Farmed Tilapia Fillet,...,24.5,Non-frequent customer,Female,Tennessee,22,2017-09-12,3,married,46151,both


In [6]:
# number of rows and columns in the imported dataframe
final_df.shape

(32404859, 30)

# Data security

The customers dataset, which was merged into the final_df dataframe, originally contained personally identifiable information (PII) in the form of first and last names. These columns were removed during the data cleansing of the customers dataset, and there is no other PII in the final_df dataframe. In addition to removing all PII, all data has been stored and accessed safely, and no sensitive information has been distributed outside of the network.

# Create regional segmentation

In [5]:
# label every row whose state is in the Northeast list with 'Northeast'
# in the region column; rows with other states are left untouched

northeast_states = [
    'Maine', 'New Hampshire', 'Vermont', 'Massachusetts', 'Rhode Island',
    'Connecticut', 'New York', 'Pennsylvania', 'New Jersey',
]
final_df.loc[final_df['state'].isin(northeast_states), 'region'] = 'Northeast'

In [6]:
# label every row whose state is in the Midwest list with 'Midwest'
# in the region column; rows with other states are left untouched

midwest_states = [
    'Wisconsin', 'Michigan', 'Illinois', 'Indiana', 'Ohio', 'North Dakota',
    'South Dakota', 'Nebraska', 'Kansas', 'Minnesota', 'Iowa', 'Missouri',
]
final_df.loc[final_df['state'].isin(midwest_states), 'region'] = 'Midwest'

In [7]:
# label every row whose state is in the South list with 'South'
# in the region column; rows with other states are left untouched

south_states = [
    'Delaware', 'Maryland', 'District of Columbia', 'Virginia', 'West Virginia',
    'North Carolina', 'South Carolina', 'Georgia', 'Florida', 'Kentucky',
    'Tennessee', 'Mississippi', 'Alabama', 'Oklahoma', 'Texas', 'Arkansas',
    'Louisiana',
]
final_df.loc[final_df['state'].isin(south_states), 'region'] = 'South'

In [8]:
# label every row whose state is in the West list with 'West'
# in the region column; rows with other states are left untouched

west_states = [
    'Idaho', 'Montana', 'Wyoming', 'Nevada', 'Utah', 'Colorado', 'Arizona',
    'New Mexico', 'Alaska', 'Washington', 'Oregon', 'California', 'Hawaii',
]
final_df.loc[final_df['state'].isin(west_states), 'region'] = 'West'

In [11]:
# row count per region; dropna=False so rows without a region (NaN) would be listed too
final_df['region'].value_counts(dropna=False)

South        10791885
West          8292913
Midwest       7597325
Northeast     5722736
Name: region, dtype: int64

In [12]:
# lift the display row limit so outputs are not truncated while checking the data

pd.set_option('display.max_rows', None)

In [13]:
# first 10 rows, restricted to the columns needed to check state against region

region_check_columns = ['user_id', 'state', 'region']
final_df[region_check_columns].head(10)

Unnamed: 0,user_id,state,region
0,1,Alabama,South
1,1,Alabama,South
2,1,Alabama,South
3,1,Alabama,South
4,1,Alabama,South
5,1,Alabama,South
6,1,Alabama,South
7,1,Alabama,South
8,1,Alabama,South
9,1,Alabama,South


In [14]:
# last 10 rows, restricted to the columns needed to check state against region

region_check_columns = ['user_id', 'state', 'region']
final_df[region_check_columns].tail(10)

Unnamed: 0,user_id,state,region
32404849,106143,Hawaii,West
32404850,106143,Hawaii,West
32404851,106143,Hawaii,West
32404852,106143,Hawaii,West
32404853,106143,Hawaii,West
32404854,106143,Hawaii,West
32404855,66343,Tennessee,South
32404856,66343,Tennessee,South
32404857,66343,Tennessee,South
32404858,66343,Tennessee,South


In [9]:
# cross-tabulate region (rows) against loyalty_flag (columns)

crosstab = pd.crosstab(
    index=final_df['region'],
    columns=final_df['loyalty_flag'],
    dropna=False,
)

In [10]:
# display the crosstab to check the analysis results

crosstab

loyalty_flag,Loyal customer,New customer,Regular customer
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Midwest,2373774,1472573,3750978
Northeast,1841785,1100207,2780744
South,3405844,2074410,5311631
West,2662690,1596800,4033423


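There is a difference in spending habits between the regions. The 'South' region is the most populous and also accounts for the most high and low spenders, while the 'Northeast' region is the least populous and accounts for the fewest high and low spenders. However, the 'Midwest' has the highest proportion of high spenders at 2.05%, followed by the 'South' with 1.94%, the 'West' with 1.93%, and the 'Northeast' with 1.89%. (Note: these spending figures cannot be derived from the region and loyalty_flag crosstab shown above, which counts rows by loyalty status; they presumably come from a separate crosstab of region against a spending flag, so the source should be confirmed.)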

In [11]:
# copy the crosstab to the system clipboard so it can be pasted elsewhere
crosstab.to_clipboard()

# Create exclusion flag for low-activity customers

In [17]:
# rows where max_order is below 5 are flagged as low-activity customers

final_df.loc[final_df['max_order'].lt(5), 'exclusion_flag'] = 'Low-activity Customer'

In [18]:
# rows where max_order is 5 or more are flagged as frequent customers

final_df.loc[final_df['max_order'].ge(5), 'exclusion_flag'] = 'Frequent Customer'

In [19]:
# last 10 rows of max_order next to exclusion_flag, to check the flag against the order count

flag_check_columns = ['max_order', 'exclusion_flag']
final_df[flag_check_columns].tail(10)

Unnamed: 0,max_order,exclusion_flag
32404849,26,Frequent Customer
32404850,26,Frequent Customer
32404851,26,Frequent Customer
32404852,26,Frequent Customer
32404853,26,Frequent Customer
32404854,26,Frequent Customer
32404855,4,Low-activity Customer
32404856,4,Low-activity Customer
32404857,4,Low-activity Customer
32404858,4,Low-activity Customer


In [20]:
# row count per exclusion_flag value; dropna=False so unflagged (NaN) rows would be listed too
final_df['exclusion_flag'].value_counts(dropna=False)

Frequent Customer        30964564
Low-activity Customer     1440295
Name: exclusion_flag, dtype: int64

In [21]:
# check the dtype of the two new columns (region and exclusion_flag)

final_df[['region', 'exclusion_flag']].dtypes

region            object
exclusion_flag    object
dtype: object

In [22]:
# store the region and exclusion_flag columns as category dtype instead of object

category_columns = ['region', 'exclusion_flag']
final_df[category_columns] = final_df[category_columns].astype('category')

In [23]:
# new dataframe holding only the rows flagged as 'Low-activity Customer'

low_act_df = final_df.loc[final_df['exclusion_flag'].eq('Low-activity Customer')]

In [24]:
# inspect the first 40 rows of the low-activity subset
low_act_df.head(40)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,product_name,...,gender,state,age,date_joined,n_dependants,fam_status,income,_merge,region,exclusion_flag
1510,520620,120,1,3,11,0,196,2,False,Soda,...,Female,Kentucky,54,2017-03-02,2,married,99219,both,South,Low-activity Customer
1511,3273029,120,3,2,8,19,196,2,True,Soda,...,Female,Kentucky,54,2017-03-02,2,married,99219,both,South,Low-activity Customer
1512,520620,120,1,3,11,0,46149,1,False,Zero Calorie Cola,...,Female,Kentucky,54,2017-03-02,2,married,99219,both,South,Low-activity Customer
1513,3273029,120,3,2,8,19,46149,1,True,Zero Calorie Cola,...,Female,Kentucky,54,2017-03-02,2,married,99219,both,South,Low-activity Customer
1514,520620,120,1,3,11,0,26348,3,False,Mixed Fruit Fruit Snacks,...,Female,Kentucky,54,2017-03-02,2,married,99219,both,South,Low-activity Customer
1515,906054,120,2,4,7,15,31102,1,False,Sugar Free Energy Drink,...,Female,Kentucky,54,2017-03-02,2,married,99219,both,South,Low-activity Customer
1516,3273029,120,3,2,8,19,31102,3,True,Sugar Free Energy Drink,...,Female,Kentucky,54,2017-03-02,2,married,99219,both,South,Low-activity Customer
3855,3226575,360,1,5,12,0,196,1,False,Soda,...,Male,Arizona,46,2017-09-01,1,married,104257,both,West,Low-activity Customer
3856,3046940,360,2,2,8,4,5322,3,False,Gluten Free Dark Chocolate Chunk Chewy with a ...,...,Male,Arizona,46,2017-09-01,1,married,104257,both,West,Low-activity Customer
3857,2300993,360,3,1,13,6,5322,2,True,Gluten Free Dark Chocolate Chunk Chewy with a ...,...,Male,Arizona,46,2017-09-01,1,married,104257,both,West,Low-activity Customer


In [25]:
# number of rows and columns in the low-activity subset
low_act_df.shape

(1440295, 32)

In [26]:
# new dataframe holding only the rows flagged as 'Frequent Customer'

Insta_final_df = final_df.loc[final_df['exclusion_flag'].eq('Frequent Customer')]

In [27]:
# inspect the first 10 rows of the frequent-customer dataframe
Insta_final_df.head(10)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,product_name,...,gender,state,age,date_joined,n_dependants,fam_status,income,_merge,region,exclusion_flag
0,2539329,1,1,2,8,0,196,1,False,Soda,...,Female,Alabama,31,2019-02-17,3,married,40423,both,South,Frequent Customer
1,2398795,1,2,3,7,15,196,1,True,Soda,...,Female,Alabama,31,2019-02-17,3,married,40423,both,South,Frequent Customer
2,473747,1,3,3,12,21,196,1,True,Soda,...,Female,Alabama,31,2019-02-17,3,married,40423,both,South,Frequent Customer
3,2254736,1,4,4,7,29,196,1,True,Soda,...,Female,Alabama,31,2019-02-17,3,married,40423,both,South,Frequent Customer
4,431534,1,5,4,15,28,196,1,True,Soda,...,Female,Alabama,31,2019-02-17,3,married,40423,both,South,Frequent Customer
5,3367565,1,6,2,7,19,196,1,True,Soda,...,Female,Alabama,31,2019-02-17,3,married,40423,both,South,Frequent Customer
6,550135,1,7,1,9,20,196,1,True,Soda,...,Female,Alabama,31,2019-02-17,3,married,40423,both,South,Frequent Customer
7,3108588,1,8,1,14,14,196,2,True,Soda,...,Female,Alabama,31,2019-02-17,3,married,40423,both,South,Frequent Customer
8,2295261,1,9,1,16,0,196,4,True,Soda,...,Female,Alabama,31,2019-02-17,3,married,40423,both,South,Frequent Customer
9,2550362,1,10,4,8,30,196,1,True,Soda,...,Female,Alabama,31,2019-02-17,3,married,40423,both,South,Frequent Customer


In [28]:
# number of rows and columns in the frequent-customer dataframe
Insta_final_df.shape

(30964564, 32)

# Export dataframes

In [29]:
# save the low-activity subset as a pickle file in the prepared data folder

low_activity_file = os.path.join(path, '02 Data', 'Prepared Data', 'low_activity_customers.pkl')
low_act_df.to_pickle(low_activity_file)

In [30]:
# save the frequent-customer dataframe as a pickle file in the prepared data folder

instacart_final_file = os.path.join(path, '02 Data', 'Prepared Data', 'Instacart_final.pkl')
Insta_final_df.to_pickle(instacart_final_file)