## Instacart Market Basket Analysis 

### a. Load the Data:
- Load the CSV files into pandas DataFrames.
- Check for missing values and data types.

In [15]:
import numpy as np
import pandas as pd

df_aisles = pd.read_csv('aisles.csv')
df_departments = pd.read_csv('departments.csv')
df_order_products_prior = pd.read_csv('order_products__prior.csv')
df_order_products_train = pd.read_csv('order_products__train.csv')
df_orders = pd.read_csv('orders.csv')
df_products = pd.read_csv('products.csv')


### set the empty value in days_since_prior_order to 0 as it is the first order for each user

In [20]:
null_index = df_orders.loc[df_orders['days_since_prior_order'].isnull()].index
df_orders.loc[df_orders['days_since_prior_order'].isnull(),'days_since_prior_order'] = 0
#df_orders['days_since_prior_order'] = [row['order_dow'] if row['order_number'] == 1 else row['days_since_prior_order'] for _, row in df_orders.iterrows()]
#remove user_id = 0 as all its data are 0
#i = df_orders[(df_orders.user_id == 0)].index
#df_orders.drop(i, inplace = True)
print(df_orders.head(50))
#df_orders.dtypes


    order_id  user_id eval_set  order_number  order_dow  order_hour_of_day  \
0    2539329        1    prior             1          2                  8   
1    2398795        1    prior             2          3                  7   
2     473747        1    prior             3          3                 12   
3    2254736        1    prior             4          4                  7   
4     431534        1    prior             5          4                 15   
5    3367565        1    prior             6          2                  7   
6     550135        1    prior             7          1                  9   
7    3108588        1    prior             8          1                 14   
8    2295261        1    prior             9          1                 16   
9    2550362        1    prior            10          4                  8   
10   1187899        1    train            11          4                  8   
11   2168274        2    prior             1          2         

### merge order_products prior and training tables

In [21]:
temp = df_order_products_train.copy()
temp.insert(0,'istraining', 1)
df_order_products = df_order_products_prior.copy()
df_order_products.insert(0,'istraining', 0)
frames = [df_order_products, temp]
df_order_products = pd.concat(frames, ignore_index=True)

In [22]:
#convert datatypes
#aisle
df_aisles['aisle_id'] = df_aisles['aisle_id'].astype(int)
df_aisles['aisle'] = df_aisles['aisle'].astype(str)
#df_departments
df_departments['department_id'] = df_departments['department_id'].astype(int)
df_departments['department'] = df_departments['department'].astype(str)
#df_products
df_products['product_id'] = df_products['product_id'].astype(int)
df_products['product_name'] = df_products['product_name'].astype(str)
df_products['aisle_id'] = df_products['aisle_id'].astype(int)
df_products['department_id'] = df_products['department_id'].astype(int)
#df_order_products
df_order_products['product_id'] = df_order_products['product_id'].astype(int)
df_order_products['order_id'] = df_order_products['order_id'].astype(int)
df_order_products['add_to_cart_order'] = df_order_products['add_to_cart_order'].astype(int)
df_order_products['reordered'] = df_order_products['reordered'].astype(bool)
df_order_products['istraining'] = df_order_products['istraining'].astype(bool)
#df_orders
df_orders['order_id'] = df_orders['order_id'].astype(int)
df_orders['user_id'] = df_orders['user_id'].astype(int)
df_orders['eval_set'] = df_orders['eval_set'].astype(str)
df_orders['order_number'] = df_orders['order_number'].astype(int)
df_orders['order_dow'] = df_orders['order_dow'].astype(int)
df_orders['order_hour_of_day'] = df_orders['order_hour_of_day'].astype(int)
df_orders['days_since_prior_order'] = df_orders['days_since_prior_order'].astype(int)

In [5]:

df_array = [
    df_aisles,
    df_departments,
    df_order_products,
    df_orders,
    df_products
]


In [36]:
#sorted(pd.unique(df_orders['order_number']))
#df_orders.order_number.unique()
df_orders.iloc[null_index].days_since_prior_order
#df_orders.groupby('user_id').head()

0          0
11         0
26         0
39         0
45         0
          ..
3420930    0
3420934    0
3421002    0
3421019    0
3421069    0
Name: days_since_prior_order, Length: 206209, dtype: int64

### 2. Exploratory Data Analysis (EDA)
a. Customer Behavior:
- Average number of orders per user.
- Average time between orders for each user.
- Number of orders placed by each customer.
- Customer segments based on purchase frequency.

In [6]:
'''
function takes average days between orders and classify customer based on following rules:
# Frequent Buyers: Customers who make purchases regularly (e.g.,weekly or monthly)
# average days between orders FB < 30
# Occasional Buyers: Customers who purchase less frequently (e.g., quarterly).
# average days between orders 30 <= OB < 90
# Rare Buyers: Customers who make very few purchases (e.g., annually).
# average days between orders 90 <= RB 
'''
def customer_segment(avg_days):
    if avg_days < 25 :
        return 'FB'
    elif 25 <= avg_days < 70:
        return 'OB'
    else:
        return 'RB'
        
#Average number of orders per user = Total number of orders / total number of users
no_orders = len(pd.unique(df_orders['order_id']))
no_users = len(pd.unique(df_orders['user_id']))
avg_no = no_orders/no_users
print(f"Average number of orders per use = {avg_no}")

#Average time between orders for each user
df_customer_time = df_orders.groupby('user_id').agg({
    'order_id':'count',
    'order_hour_of_day':'sum'
}).round(2)
df_customer_time["avg_time"] = df_customer_time["order_hour_of_day"] / df_customer_time["order_id"]
df_customer_time.head()

#Number of orders placed by each customer
df_customer_groupby = df_orders.groupby('user_id').agg({
    'order_id':'count',
    'days_since_prior_order':'mean'
}).round(2)

#Customer segments based on purchase frequency
order_count = list(df_customer_groupby.order_id)

df_customer_segment= df_customer_groupby.assign(
    segment = [customer_segment(x) for x in order_count]
)
df_customer_segment.reset_index()
df_customer_segment.segment.value_counts()

Average number of orders per use = 16.590367054784224


segment
FB    165998
OB     35798
RB      4413
Name: count, dtype: int64

b. Product Analysis:
- Identify most popular products by frequency.
- Determine average order size (number of items per order).

In [7]:
#calculate frequency of ordering each product
df_order_products.groupby('product_id').agg({'order_id':'count'}).nlargest(10,columns = 'order_id')
#Determine average order size (number of items per order).
df_order_products.groupby('order_id').agg({'product_id':'mean'})
no_orders = len(pd.unique(df_orders['order_id']))
no_ordered_products = len(df_order_products)
avg_no = no_ordered_products / no_orders
print(f"Average average order size = {avg_no}")


Average average order size = 9.885497077972092


c. Temporal Patterns:
- Analyze orders by day of the week and hour of the day.
- Explore seasonal trends or patterns in purchasing behavior.
- Months with higher order volumes.

In [23]:
#Analyze orders by day of the week 
df_week = df_orders.groupby('order_dow').agg({
    'order_id':'count'
})
df_week.reset_index().nlargest(n=10, columns='order_id')
#Analyze orders by hour of the day
df_hour = df_orders.groupby('order_hour_of_day').agg({
    'order_id':'count'
})
df_hour.reset_index().nlargest(n=30, columns='order_id')
#-------------------------------------------#
#Explore seasonal trends or patterns in purchasing behavior.
#get describtion of order_dow and order_hour_of_day column
df_orders['order_dow'].describe()
df_orders['order_hour_of_day'].describe()
#get number of orders per month 
df_orders.groupby([pd.Grouper(key="date_of_order", freq="MS"), "order_id"]).count()
#-------------------------------------------#
#Months with higher order volumes
#assume start date 01012018 and all useres first order 
df_orders['date_of_order'] = pd.to_datetime('01/06/2018')
#df_month = df_orders.groupby('user_id')


KeyError: 'The grouper name date_of_order is not found'

In [None]:
from datetime import timedelta
#Months with higher order volumes
#assume start date 01012018 and all useres first order 
df_orders['date_of_order'] = pd.to_datetime('01/06/2018')
df_month = df_orders.groupby('user_id')
count1 = 0
#prev_date = pd.to_datetime('01/01/2018')
#current_user = 1
count2 =0
for user_id, group in df_orders.groupby('user_id'):
    prev_date = pd.to_datetime('01/06/2018')
    for index, row in group.iterrows():
        if row['order_number'] == 1:
            prev_date = pd.to_datetime('01/06/2018')+ timedelta(days=row['order_dow'])
        else:
            prev_date = prev_date + timedelta(days=row['days_since_prior_order'])
            
        df_orders.loc[((df_orders['order_id'] == row['order_id']) & (df_orders['user_id'] == row['user_id'])),\
        'date_of_order'] = pd.to_datetime(prev_date)



df_orders.head()
            

In [8]:
##ChatGPT code
import pandas as pd
from datetime import timedelta

#problem is each user has different start date
# Assuming df_orders is already defined
df_orders["days_since_prior_order"] = df_orders["order_dow"] 
# Convert the 'order_dow' to a timedelta relative to '01/06/2018' for first orders
df_orders['date_of_order'] = pd.to_datetime('01/06/2018') + pd.to_timedelta(df_orders['order_dow'], unit='D')

# Calculate the cumulative sum of 'days_since_prior_order' for each user
df_orders['cumulative_days'] = df_orders.groupby('user_id')['days_since_prior_order'].cumsum()

# Adjust the date for subsequent orders
df_orders['date_of_order'] = df_orders['date_of_order'] + pd.to_timedelta(df_orders['cumulative_days'].fillna(0), unit='D')

# Drop the temporary 'cumulative_days' column if it's no longer needed
df_orders.drop(columns=['cumulative_days'], inplace=True)

In [38]:
df_orders.head(20)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,date_of_order
0,2539329,1,prior,1,2,8,0,2018-01-08
1,2398795,1,prior,2,3,7,15,2018-01-24
2,473747,1,prior,3,3,12,21,2018-02-14
3,2254736,1,prior,4,4,7,29,2018-03-16
4,431534,1,prior,5,4,15,28,2018-04-13
5,3367565,1,prior,6,2,7,19,2018-04-30
6,550135,1,prior,7,1,9,20,2018-05-19
7,3108588,1,prior,8,1,14,14,2018-06-02
8,2295261,1,prior,9,1,16,0,2018-06-02
9,2550362,1,prior,10,4,8,30,2018-07-05


In [40]:
from datetime import timedelta
#Months with higher order volumes
df_month = df_orders.groupby(df_orders.date_of_order.dt.month).agg({
    'order_id':'count'
})
df_month.head(20)

Unnamed: 0_level_0,order_id
date_of_order,Unnamed: 1_level_1
1,532720
2,458147
3,446046
4,374328
5,331791
6,283035
7,250825
8,211206
9,175928
10,153897


d. Basket Analysis:
- Identify most frequently co-purchased items.
- Products often bought together on weekends vs. weekdays.

In [41]:
#identify co-purchased items
from itertools import combinations
from collections import Counter

df_order_products_dow = df_order_products.iloc[ : , [1,2] ]
df_order_dow = df_orders.iloc[ : , [0,4] ]
temp = df_order_products_dow.join(df_order_dow.set_index('order_id'), on='order_id', how = 'left')

#temp[temp.order_dow.isin([-np.inf, np.inf])].shape[0]
#temp['order_dow'] = temp['order_dow'].astype('int64')

#temp dataframe contains order-product-DayOfWeek
#get list of products
product_lists=[]
product_lists=[product_lists.append(list(group['product_id'])) for order_id, group in df_order_products.groupby('order_id')]

    


In [34]:
# Create a list of all combinations of products for each order
combinations_list = []
for products in product_lists:
    combinations_list.extend(combinations(products, 2))

# Count the frequency of each combination
combination_counts = Counter(combinations_list)

# Convert to DataFrame for better readability
combination_df = pd.DataFrame(combination_counts.items(), columns=['product_pair', 'frequency'])
combination_df = combination_df.sort_values(by='frequency', ascending=False)

print(combination_df)


Unnamed: 0,order_id,product_id,order_dow
0,2,33120,5
1,2,28985,5
2,2,9327,5
3,2,45918,5
4,2,30035,5
5,2,17794,5
6,2,40141,5
7,2,1819,5
8,2,43668,5


In [None]:
from itertools import combinations
from collections import Counter
#Products often bought together on weekends vs. weekdays.
df_order_products_dow = df_order_products.iloc[ : , [1,2] ]
df_order_dow = df_orders.iloc[ : , [0,4] ]
temp = df_order_dow.join(df_order_products_dow.set_index('order_id'), on='order_id', how = 'left')
df_orders_weekends = temp[temp['order_dow'].isin([0,6])]
#get list of products
product_lists=[]
[product_lists.append(list(group['product_id'])) for order_id, group in df_orders_weekends.groupby('order_id')]
# Create a list of all combinations of products for each order
combinations_list = []
for products in product_lists:
    combinations_list.extend(combinations(products, 2))

# Count the frequency of each combination
combination_counts = Counter(combinations_list)

# Convert to DataFrame for better readability
combination_df = pd.DataFrame(combination_counts.items(), columns=['product_pair', 'frequency'])
combination_df = combination_df.sort_values(by='frequency', ascending=False)

print(combination_df)