CONTENTS LIST:
- Importing libraries and data
- Using .groupby() to group data
- Using .agg() to aggregate data with single/multiple statistics
- Using .transform() to add a per-group statistic as a new column
- Deriving columns with .loc

# 01 Importing libraries and data

In [3]:
import pandas as pd
import numpy as np
import os

In [4]:
#Defining path and importing data
#The project folder can be overridden with the INSTACART_PROJECT_PATH environment variable
#so the notebook also runs on other machines; the default is the original location.
path = os.environ.get('INSTACART_PROJECT_PATH', '/Users/gingermoore/Documents/04-2025 Instacart Basket Analysis')
#NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source.
df_ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'new_ords_prods_merge.pkl'))

In [5]:
#Checking the shape (rows, columns); the head is previewed in the next cell
df_ords_prods_merge.shape

(32399732, 18)

In [6]:
#Previewing the first five rows of the merged data
df_ords_prods_merge.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,orders_chronological,orders_day_of_week,order_hour_of_day,days_since_prior_order,is_new,add_to_cart_order,reordered,exists,price_range,busiest_days,busiest_period_of_day
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,False,5,0,both,Mid-range product,Regularly busy,Most orders
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,False,1,1,both,Mid-range product,Regularly busy,Average orders
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,False,20,0,both,Mid-range product,Busiest days,Average orders
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,True,10,0,both,Mid-range product,Least busy days,Most orders
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,False,11,1,both,Mid-range product,Least busy days,Average orders


In [7]:
#Working on a subset: the first 1,000,000 rows, selected by position
df = df_ords_prods_merge.iloc[:1000000]

In [8]:
#Confirming the size of the subset
df.shape

(1000000, 18)

# 02 Grouping Data

In [10]:
#Grouping on its own only returns a DataFrameGroupBy object; no statistic is shown until an aggregation is applied
df.groupby('product_name')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11be825d0>

# 03 Aggregating data with agg()

In [12]:
#Mean of orders_chronological per department, using agg() with a column -> statistics mapping
dept_groups = df.groupby('department_id')
dept_groups.agg({'orders_chronological': ['mean']})

Unnamed: 0_level_0,orders_chronological
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
1,15.577493
2,17.320781
3,16.084944
4,17.530458
5,14.763075
6,16.658449
7,17.03159
8,15.076662
9,15.44758
10,18.681852


In [13]:
#Same statistic via column selection on the grouped data; the result is a Series
orders_by_dept = df.groupby('department_id')['orders_chronological']
orders_by_dept.mean()

department_id
1     15.577493
2     17.320781
3     16.084944
4     17.530458
5     14.763075
6     16.658449
7     17.031590
8     15.076662
9     15.447580
10    18.681852
11    15.447411
12    14.327957
13    16.548642
14    16.960241
15    16.121948
16    17.803851
17    15.593633
18    19.674252
19    16.899756
20    16.255442
21    25.535479
Name: orders_chronological, dtype: float64

In [14]:
#Producing multiple statistics
#The mapping lists every statistic to compute for the column
order_stats = {'orders_chronological': ['mean', 'min', 'max']}
df.groupby('department_id').agg(order_stats)

Unnamed: 0_level_0,orders_chronological,orders_chronological,orders_chronological
Unnamed: 0_level_1,mean,min,max
department_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,15.577493,1,99
2,17.320781,1,96
3,16.084944,1,99
4,17.530458,1,99
5,14.763075,1,99
6,16.658449,1,99
7,17.03159,1,99
8,15.076662,1,98
9,15.44758,1,99
10,18.681852,1,99


# 04 Aggregating data with transform()

In [16]:
#transform('max') returns one value per original row (each user's highest orders_chronological),
#so the result can be assigned directly as a new column
orders_per_user = df_ords_prods_merge.groupby(['user_id'])['orders_chronological']
df_ords_prods_merge['max_order'] = orders_per_user.transform('max')

In [17]:
#Checking that the max_order column was added
df_ords_prods_merge.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,orders_chronological,orders_day_of_week,order_hour_of_day,days_since_prior_order,is_new,add_to_cart_order,reordered,exists,price_range,busiest_days,busiest_period_of_day,max_order
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,False,5,0,both,Mid-range product,Regularly busy,Most orders,32
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,False,1,1,both,Mid-range product,Regularly busy,Average orders,32
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,False,20,0,both,Mid-range product,Busiest days,Average orders,5
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,True,10,0,both,Mid-range product,Least busy days,Most orders,3
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,False,11,1,both,Mid-range product,Least busy days,Average orders,3


# 05 Deriving columns with .loc

In [19]:
#Setting conditions
#Rows whose max_order is above 40 are flagged as loyal customers
is_loyal_customer = df_ords_prods_merge['max_order'] > 40
df_ords_prods_merge.loc[is_loyal_customer, 'loyalty_flag'] = 'Loyal customer'

In [20]:
#Rows whose max_order is above 10 and at most 40 are flagged as regular customers
max_order = df_ords_prods_merge['max_order']
is_regular_customer = (max_order > 10) & (max_order <= 40)
df_ords_prods_merge.loc[is_regular_customer, 'loyalty_flag'] = 'Regular customer'

In [21]:
#Rows whose max_order is 10 or less are flagged as new customers
is_new_customer = df_ords_prods_merge['max_order'] <= 10
df_ords_prods_merge.loc[is_new_customer, 'loyalty_flag'] = 'New customer'

In [22]:
#Counting rows per flag; dropna=False also counts missing values, so any row left unflagged would show up here
df_ords_prods_merge['loyalty_flag'].value_counts(dropna = False)

loyalty_flag
Regular customer    15874128
Loyal customer      10282763
New customer         6242841
Name: count, dtype: int64

In [23]:
#Spot-checking the flag next to user_id and orders_chronological for the first 60 rows
df_ords_prods_merge[['user_id', 'loyalty_flag', 'orders_chronological']].head(60)

Unnamed: 0,user_id,loyalty_flag,orders_chronological
0,138,Regular customer,28
1,138,Regular customer,30
2,709,New customer,2
3,764,New customer,1
4,764,New customer,3
5,777,Regular customer,16
6,825,New customer,3
7,910,Regular customer,12
8,1052,Regular customer,10
9,1052,Regular customer,15
