# Contents:

#### Importing libraries and dataframes
#### Assessing orders by department
#### Creating a `price_range` column
#### Creating a `busiest_day` column
#### Convert `busiest_day` column to `busiest_days`
#### Create `busiest_period_of_day` column
#### Exporting the dataframe as a pkl file

## Importing libraries and dataframes

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# To create a path to the Instacart project folder.
# The folder can be overridden with the INSTACART_PROJECT_PATH environment
# variable so the notebook can run on another machine without editing this
# cell; when the variable is not set, the original local folder is used.
path=os.environ.get('INSTACART_PROJECT_PATH',r'C:\Users\Shelb\OneDrive\Documents\CF Coursework\Python Fundamentals for Data Analysts\10-2022 Instacart Basket Analysis')

In [3]:
# Build the location of orders_products_merged.pkl under Data/Prepared Data
# in the project folder, then load it as the working dataframe.
merged_pkl_path=os.path.join(path,'Data','Prepared Data','orders_products_merged.pkl')
df=pd.read_pickle(merged_pkl_path)

## Assessing orders by department

In [4]:
# Count how many rows of the merged dataframe fall under each department_id,
# sorted from the most frequent department to the least frequent.
df['department_id'].value_counts()

4     9479291
16    5398747
19    2887550
7     2688123
1     2234743
13    1875369
3     1172428
15    1068058
20    1051249
9      866627
17     738666
12     708927
14     703033
11     447572
18     423802
6      269253
5      153696
8       97716
21      69145
2       36291
10      34573
Name: department_id, dtype: int64

## Creating a price_range column

In [5]:
# Start the price_range flag column: rows whose price is strictly above 15
# are labelled as high range products.
is_high_price=df['prices'].gt(15)
df.loc[is_high_price,'price_range']='High range product'

In [6]:
# Rows whose price is above 5 and at most 15 are labelled as mid range products.
is_mid_price=df['prices'].gt(5)&df['prices'].le(15)
df.loc[is_mid_price,'price_range']='Mid range product'

In [7]:
# Rows whose price is 5 or less are labelled as low range products.
is_low_price=df['prices'].le(5)
df.loc[is_low_price,'price_range']='Low range product'

In [8]:
# To check the number of rows in each price_range category.
# dropna=False also counts rows that received no flag (NaN), so any row
# missed by the three conditions would show up here.
df['price_range'].value_counts(dropna=False)

Mid range product     21860860
Low range product     10126321
High range product      417678
Name: price_range, dtype: int64

## Creating a busiest_day column

In [9]:
# Checking the number of rows for each value of order_day_of_week
# (dropna=False includes missing values in the count).
df['order_day_of_week'].value_counts(dropna=False)

0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: order_day_of_week, dtype: int64

In [10]:
# Build the labels for the busiest_day column from order_day_of_week.
# This is done with a vectorized lookup rather than a Python loop over
# every row, which is far slower on a dataframe of this size.
# Day 0 has the highest row count and day 4 the lowest in the
# order_day_of_week value counts.
day_labels={0:'Busiest Day',4:'Least Busy'}

# Every other value (including a missing one) is labelled 'Regularly busy'.
# NOTE(review): the label is spelled 'Regularly busy' here but 'Regularly Busy'
# in the busiest_days column; it is kept unchanged because later steps may
# filter on the exact text.
# .tolist() keeps result a plain list with one label per row, in row order.
result=df['order_day_of_week'].map(day_labels).fillna('Regularly busy').tolist()

In [11]:
# Create the busiest_day column from the labels held in result
# (a list is assigned by position, so it needs one label per row of df).
df['busiest_day']=result

In [12]:
# Check the number of rows in each busiest_day category
# (dropna=False includes missing values in the count).
df['busiest_day'].value_counts(dropna=False)

Regularly busy    22416875
Busiest Day        6204182
Least Busy         3783802
Name: busiest_day, dtype: int64

## Convert busiest_day column to busiest_days

In [13]:
# Build the labels for the busiest_days column from order_day_of_week.
# This is done with a vectorized lookup rather than a Python loop over
# every row, which is far slower on a dataframe of this size.
# Days 0 and 1 have the two highest row counts and days 4 and 3 the two
# lowest in the order_day_of_week value counts.
day_labels={
    0:'Busiest Day',
    1:'Busiest Day',
    4:'Least Busy',
    3:'Least Busy',
}

# Every other value (including a missing one) is labelled 'Regularly Busy'.
# .tolist() keeps result a plain list with one label per row, in row order.
result=df['order_day_of_week'].map(day_labels).fillna('Regularly Busy').tolist()

In [14]:
# Create the busiest_days column from the labels held in result
# (a list is assigned by position, so it needs one label per row of df).
df['busiest_days']=result

In [15]:
# Check the number of rows in each category of the new busiest_days column
# (dropna=False includes missing values in the count).
df['busiest_days'].value_counts(dropna=False)

Regularly Busy    12916111
Busiest Day       11864412
Least Busy         7624336
Name: busiest_days, dtype: int64

## Create busiest_period_of_day column

In [16]:
# To check the summary statistics of the hour_order_placed column:
# count, mean, std, min, max and the 25% / 50% / 75% quartiles.
df['hour_order_placed'].describe()

count    3.240486e+07
mean     1.342515e+01
std      4.246380e+00
min      0.000000e+00
25%      1.000000e+01
50%      1.300000e+01
75%      1.600000e+01
max      2.300000e+01
Name: hour_order_placed, dtype: float64

In [17]:
# To count how many rows fall in each hour of hour_order_placed, sorted from
# the most frequent hour to the least (dropna=False includes missing values).
df['hour_order_placed'].value_counts(dropna=False)

10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: hour_order_placed, dtype: int64

In [18]:
# Build the labels for the busiest_period_of_day column from hour_order_placed.
# The 24 hours are split into three groups of eight, following the order of
# the hour_order_placed value counts (highest counts first).
hour_groups={
    'Most Orders':(10,11,14,15,13,12,16,9),
    'Average Orders':(17,8,18,19,20,7,21,22),
    'Fewest Orders':(23,6,0,1,5,2,4,3),
}

# Flatten the groups into an hour -> label lookup.
hour_labels={hour:label for label,hours in hour_groups.items() for hour in hours}

# A vectorized lookup replaces the row-by-row loop. An hour that is missing or
# not listed above is labelled NaN rather than being skipped, so result always
# has exactly one entry per row and stays aligned with df; such rows are then
# visible in value_counts(dropna=False).
# .tolist() keeps result a plain list, in row order.
result=df['hour_order_placed'].map(hour_labels).tolist()

In [19]:
# To create a busiest_period_of_day column from the labels held in result
# (a list is assigned by position, so it needs one label per row of df).
df['busiest_period_of_day']=result

In [20]:
# To check the number of rows in each busiest_period_of_day category
# (dropna=False includes missing values in the count).
df['busiest_period_of_day'].value_counts(dropna=False)

Most Orders       21118071
Average Orders     9997651
Fewest Orders      1289137
Name: busiest_period_of_day, dtype: int64

## Export file as pkl

In [21]:
# Save the dataframe, including the new flag columns, as a pickle file under
# Data/Prepared Data in the project folder.
export_pkl_path=os.path.join(path,'Data','Prepared Data','11-9_orders_products_merged.pkl')
df.to_pickle(export_pkl_path)