# 01. Importing libraries

In [30]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 02. Importing data

In [31]:
# Project root folder. The location can be overridden through the
# INSTACART_PROJECT_DIR environment variable so the notebook also runs on
# machines other than the author's; when the variable is unset, the original
# hard-coded location is used.
path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'C:\Users\Quinn\OneDrive\Documents\CF - Data Analyst\Data Immersion\Instacart Basket Analysis',
)

In [32]:
# Columns to keep when loading orders.csv
vars_list = [
    'order_id',
    'user_id',
    'order_number',
    'order_dow',
    'order_hour_of_day',
    'days_since_prior_order',
]

In [33]:
# Load the orders data, reading only the columns listed in vars_list
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'orders.csv'),
    usecols=vars_list,
)

In [34]:
# Load the products data; index_col=False keeps the first column as data
# instead of using it as the index
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col=False,
)

In [35]:
# Load the departments data; index_col=False keeps the first column as data
# instead of using it as the index
df_deps = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'departments.csv'),
    index_col=False,
)

In [36]:
# Transpose the departments frame so each department becomes a row
df_deps_t = df_deps.transpose()

In [37]:
# Preview the transposed departments with the index moved into a column.
# The result is only displayed, not assigned, so df_deps_t itself is unchanged.
df_deps_t.reset_index(drop=False)

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [38]:
# Take the first row of the transposed frame to use as the column header
new_header = df_deps_t.iloc[0, :]

In [39]:
# Keep every row after the first one (the first row becomes the header)
df_deps_t_new = df_deps_t.iloc[1:]

In [40]:
# Apply the extracted header row as the column labels of the new departments frame
df_deps_t_new = df_deps_t_new.set_axis(new_header, axis=1)

In [41]:
# Build the data dictionary: one entry per index label, mapping to that row's column values
data_dict = df_deps_t_new.to_dict(orient='index')

In [42]:
# Display the department data dictionary
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

## Observation: department_id 21 has no real department name; it is labelled 'missing'.

In [43]:
# Subset of products in department 19 (listed as 'snacks' in the data dictionary)
df_snacks = df_prods.loc[df_prods['department_id'].eq(19)]

In [44]:
# Preview the first five snack products
df_snacks.head(n=5)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


# Exercise 4.4

In [45]:
# Preview the first five orders
df_ords.head(n=5)

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [46]:
# Q2. Cast identifier-like columns (user_id, order_id, order_dow, product_id, aisle_id) to strings
df_ords = df_ords.astype({'user_id': 'str', 'order_id': 'str', 'order_dow': 'str'})
df_prods = df_prods.astype({'product_id': 'str', 'aisle_id': 'str'})

In [47]:
# Q3. Rename columns to more descriptive names
df_ords = df_ords.rename(
    columns={
        'order_dow': 'orders_day_of_week',
        'days_since_prior_order': 'days_since_last_order',
    }
)

In [48]:
# Q4. Find the busiest hour for placing orders (counts sorted from most to least frequent)
df_ords['order_hour_of_day'].value_counts(sort=True, dropna=False)

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

## Observation: The busiest hour is 10 AM, which means customers are most likely to place orders at this time.

In [49]:
# Q5. Look up the meaning of the value 4 in the data dictionary
# (its keys are strings, so the lookup uses '4' rather than 4)
print(data_dict.get('4', None))

{'department': 'produce'}


In [50]:
# Q6. Create a subset containing only breakfast items (department 14)
df_breakfast = df_prods.loc[df_prods['department_id'].eq(14)]

In [51]:
# Preview the first five breakfast products
df_breakfast.head(n=5)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6


## Observation: People are likely to eat cereal for breakfast. This department has a missing product_name for product_id 34.

In [52]:
# Q7. Products that customers might use to throw dinner parties:
# departments 5 (alcohol), 7 (beverages), 12 (meat seafood) and 20 (deli)
df_dinner_parties = df_prods[df_prods['department_id'].isin([5, 7, 12, 20])]

In [53]:
# Preview the first five dinner-party products
df_dinner_parties.head(n=5)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1


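## Observation: For dinner parties, the first rows of this subset are mostly beverages (department 7); only one meat/seafood product (department 12) appears, and no alcohol or deli items show up in this preview.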

In [54]:
# Q8. How many rows does the last dataframe you created have?
# info() reports the number of entries (rows) together with the per-column
# non-null counts and dtypes.
df_dinner_parties.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7650 entries, 2 to 49688
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     7650 non-null   object 
 1   product_name   7647 non-null   object 
 2   aisle_id       7650 non-null   object 
 3   department_id  7650 non-null   int64  
 4   prices         7650 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 358.6+ KB


## Observation: The product_name column has 3 N/A values. 

## It has 7650 rows and 5 columns

In [55]:
# Q9. Information about user No. 1
# user_id was cast to string earlier, so the comparison is against '1', not 1
df_user_1 = df_ords.loc[df_ords['user_id'].eq('1')]

In [56]:
# Display the orders placed by user 1
df_user_1

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


## Observation: The days_since_last_order column contains both an N/A value and a 0 value. This user usually orders on weekdays, usually in the morning or afternoon, and uses IC's services only once or twice per month.

In [57]:
# Q10. Basic stats of user No.1
# describe() summarises only the numeric columns here; order_id, user_id and the
# day-of-week column were cast to strings earlier, so they do not appear.
df_user_1.describe()

Unnamed: 0,order_number,order_hour_of_day,days_since_last_order
count,11.0,11.0,10.0
mean,6.0,10.090909,19.0
std,3.316625,3.477198,9.030811
min,1.0,7.0,0.0
25%,3.5,7.5,14.25
50%,6.0,8.0,19.5
75%,8.5,13.0,26.25
max,11.0,16.0,30.0


In [61]:
# Export the wrangled orders and departments frames to the 'Prepared Data' folder.
# The index is written as well (to_csv default); for departments it holds the ids.
df_ords.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv')
)
df_deps_t_new.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv')
)