### This script contains:

#### 1. Import libraries & dataframes
#### 2. Create flag for different price ranges - and Product pricing issue
#### 3. Frequency of orders by day of the week - with Flag
#### 4. Frequency of orders by hour of the day - with Flag
#### 5. Exporting the dataset

## Importing libraries and datasets

In [1]:
#Import libraries 
import pandas as pd
import numpy as np
import os

In [2]:
# Project root folder, kept as a raw string so the backslashes are not read as escapes.
# The data file paths in this script are built from it with os.path.join.
# NOTE(review): this is an absolute path on one machine - adjust when running elsewhere.
path = r'C:\Users\willm\Instacart Basket Analysis'

In [3]:
# Load the merged orders/products dataframe from the prepared-data folder
merged_pkl_path = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged.pkl')
ords_prods_merged = pd.read_pickle(merged_pkl_path)

### Create flag for different price ranges - and Product pricing issue

#### Flagging price ranges with conditional `.loc[]` assignments

In [4]:
# Summary statistics for the dataframe's numeric columns;
# the min and max of 'prices' are the values of interest here
ords_prods_merged.describe()

Unnamed: 0,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,aisle_id,department_id,prices
count,32404860.0,32404860.0,32404860.0,32404860.0,30328760.0,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0
mean,102937.2,17.1423,2.738867,13.42515,11.10408,25598.66,8.352547,0.5895873,71.19612,9.919792,11.98023
std,59466.1,17.53532,2.090077,4.24638,8.779064,14084.0,7.127071,0.4919087,38.21139,6.281485,495.6554
min,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
25%,51422.0,5.0,1.0,10.0,5.0,13544.0,3.0,0.0,31.0,4.0,4.2
50%,102616.0,11.0,3.0,13.0,8.0,25302.0,6.0,1.0,83.0,9.0,7.4
75%,154389.0,24.0,5.0,16.0,15.0,37947.0,11.0,1.0,107.0,16.0,11.3
max,206209.0,99.0,6.0,23.0,30.0,49688.0,145.0,1.0,134.0,21.0,99999.0


#### We have been informed that InstaCart has no products priced over 25 dollars, so the maximum price of 99,999 shown above indicates a data issue

In [5]:
#Creating a column called price_range_loc to show what cost range the product is in - only prices up to 25 dollars receive a label

In [6]:
# Label each row by price band. The three bands do not overlap, and any price
# above 25 matches none of them, so those rows are left as NaN in the new column.
prices = ords_prods_merged['prices']
price_band_masks = {
    'High-range product': (prices > 15) & (prices <= 25),
    'Mid-range product': (prices > 5) & (prices <= 15),
    'Low-range product': prices <= 5,
}
for band_label, band_mask in price_band_masks.items():
    ords_prods_merged.loc[band_mask, 'price_range_loc'] = band_label

In [7]:
# Tally of each price-range label (NaN included) to verify the flag assignment
price_range_counts = ords_prods_merged['price_range_loc'].value_counts(dropna=False)
price_range_counts

Mid-range product     21860860
Low-range product     10126321
High-range product      412551
NaN                       5127
Name: price_range_loc, dtype: int64

In [8]:
# Creating a subset of the rows that received no price-range label (NaN).
# isnull() already returns a boolean mask, so it is used directly rather than
# being compared against True.
ords_prods_merged_nan = ords_prods_merged[ords_prods_merged['price_range_loc'].isnull()]

In [9]:
# Displaying the subset of rows whose price_range_loc is missing
ords_prods_merged_nan

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc
10030345,912404,17,12,2,14,5.0,Repeat Customer,21553,5,0,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,both,
10030346,603376,17,22,6,16,4.0,Repeat Customer,21553,3,1,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,both,
10030347,3264360,135,2,2,21,13.0,Repeat Customer,21553,6,0,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,both,
10030348,892534,135,3,0,8,12.0,Repeat Customer,21553,3,1,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,both,
10030349,229704,342,8,1,19,30.0,Repeat Customer,21553,9,0,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,both,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29166209,2249946,204099,29,0,8,4.0,Repeat Customer,33664,1,0,2 % Reduced Fat Milk,84,16,99999.0,both,
29166210,2363282,204099,31,0,9,2.0,Repeat Customer,33664,1,1,2 % Reduced Fat Milk,84,16,99999.0,both,
29166211,3181945,204395,13,3,15,8.0,Repeat Customer,33664,25,0,2 % Reduced Fat Milk,84,16,99999.0,both,
29166212,2486215,205227,7,3,20,4.0,Repeat Customer,33664,8,0,2 % Reduced Fat Milk,84,16,99999.0,both,


In [10]:
# Reduce the NaN subset to one row per product_id (the first occurrence is kept)
ords_prods_merged_nan_unique = ords_prods_merged_nan.drop_duplicates(
    subset='product_id',
    keep='first',
)

In [11]:
# Previewing the first rows of the per-product subset of unlabelled rows
ords_prods_merged_nan_unique.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc
10030345,912404,17,12,2,14,5.0,Repeat Customer,21553,5,0,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,both,
29165516,183964,873,3,0,10,7.0,Repeat Customer,33664,11,0,2 % Reduced Fat Milk,84,16,99999.0,both,


#### There are only 2 products with prices over 25 dollars - priced at 14,900 and 99,999. They were purchased 5,127 times.
#### The true price of these should be clarified with the products team, and corrected.

In [12]:
# Write the rows without a price-range label to a CSV file in the prepared-data folder
price_issue_csv = os.path.join(path, '02 Data', 'Prepared Data', 'prices_issue_products.csv')
ords_prods_merged_nan.to_csv(price_issue_csv)

In [13]:
# Keeping only the rows that have a price-range label, i.e. dropping the NaN entries.
# notnull() yields the boolean mask directly, replacing the `isnull() == False` comparison.
ords_prods_merged = ords_prods_merged[ords_prods_merged['price_range_loc'].notnull()]

In [14]:
# Label tally after the NaN rows were dropped - no NaN group should remain
price_range_counts = ords_prods_merged['price_range_loc'].value_counts(dropna=False)
price_range_counts

Mid-range product     21860860
Low-range product     10126321
High-range product      412551
Name: price_range_loc, dtype: int64

In [15]:
# Reduce the dataset to one row per product_id (the first occurrence is kept)
ords_prods_merged_unique = ords_prods_merged.drop_duplicates(
    subset='product_id',
    keep='first',
)

In [16]:
# Price-range label tally across the per-product subset
unique_price_range_counts = ords_prods_merged_unique['price_range_loc'].value_counts(dropna=False)
unique_price_range_counts

Mid-range product     32822
Low-range product     16290
High-range product      545
Name: price_range_loc, dtype: int64

### Frequency of orders by day of the week

In [17]:
# Number of rows per numeric day of the week (NaN included)
day_of_week_counts = ords_prods_merged['orders_day_of_week'].value_counts(dropna=False)
day_of_week_counts

0    6203329
1    5659298
6    4495887
2    4213105
5    4205076
3    3839865
4    3783172
Name: orders_day_of_week, dtype: int64

In [18]:
# The two busiest days are 0 and 1 and the two slowest are 3 and 4 (see the
# frequency count above). Those four days get their own labels and every other
# value is labelled "Regularly busy".
# The labels are built with vectorised comparisons instead of a row-by-row
# Python loop, which is very slow on a frame of this size.
# `result` holds one label per row, in row order.
day_of_week = ords_prods_merged["orders_day_of_week"]
result = np.select(
    [day_of_week.isin([0, 1]), day_of_week.isin([3, 4])],
    ["Busiest days", "Slowest days"],
    default="Regularly busy",
)

In [19]:
# Storing the per-row labels held in `result` as a new 'busiest_day' column
ords_prods_merged['busiest_day'] = result

In [20]:
# Tally of each busiest_day label (NaN included) to verify the assignment
busiest_day_counts = ords_prods_merged['busiest_day'].value_counts(dropna=False)
busiest_day_counts

Regularly busy    12914068
Busiest days      11862627
Slowest days       7623037
Name: busiest_day, dtype: int64

### Frequency of orders by hour of the day

In [21]:
# Number of rows per hour of the day (NaN included)
hour_of_day_counts = ords_prods_merged['order_hour_of_day'].value_counts(dropna=False)
hour_of_day_counts

10    2761333
11    2735694
14    2688728
15    2661718
13    2660570
12    2618104
16    2534744
9     2453842
17    2087273
8     1717863
18    1636226
19    1258076
20     976000
7      890923
21     795528
22     634159
23     402272
6      290450
0      218742
1      115683
5       87944
2       69360
4       53232
3       51268
Name: order_hour_of_day, dtype: int64

In [22]:
# The 7 hours with the most orders (10-16) are labelled "Most orders", the
# 7 hours with the fewest (0-6) "Fewest orders", and everything else
# "Average orders".
# The labels are built with vectorised comparisons instead of a row-by-row
# Python loop, which is very slow on a frame of this size.
# `hourly_result` holds one label per row, in row order.
hour_of_day = ords_prods_merged["order_hour_of_day"]
hourly_result = np.select(
    [hour_of_day.between(10, 16), hour_of_day <= 6],  # between() includes both ends
    ["Most orders", "Fewest orders"],
    default="Average orders",
)

In [23]:
# Storing the per-row labels held in `hourly_result` as a new 'busiest_period_of_day' column
ords_prods_merged['busiest_period_of_day'] = hourly_result

In [24]:
# Tally of each busiest_period_of_day label (NaN included) to verify the assignment
busiest_period_counts = ords_prods_merged['busiest_period_of_day'].value_counts(dropna=False)
busiest_period_counts

Most orders       18660891
Average orders    12852162
Fewest orders       886679
Name: busiest_period_of_day, dtype: int64

In [25]:
# Previewing the first rows to check that the dataset, including the new flag columns, looks as expected
ords_prods_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day,busiest_period_of_day
0,2539329,1,1,2,8,,First Order,196,1,0,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Average orders
1,2398795,1,2,3,7,15.0,Repeat Customer,196,1,1,Soda,77,7,9.0,both,Mid-range product,Slowest days,Average orders
2,473747,1,3,3,12,21.0,Repeat Customer,196,1,1,Soda,77,7,9.0,both,Mid-range product,Slowest days,Most orders
3,2254736,1,4,4,7,29.0,Repeat Customer,196,1,1,Soda,77,7,9.0,both,Mid-range product,Slowest days,Average orders
4,431534,1,5,4,15,28.0,Repeat Customer,196,1,1,Soda,77,7,9.0,both,Mid-range product,Slowest days,Most orders


### Exporting the dataset

In [26]:
# Save the flagged dataframe as a pickle file in the prepared-data folder
hourly_daily_pkl = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_hourly_daily.pkl')
ords_prods_merged.to_pickle(hourly_daily_pkl)