# 01. Importing Libraries

In [2]:
import pandas as pd
import numpy  as np
import os

# 02. Importing Data

In [3]:
# Create Path
# Project root folder that contains the '02_Data' directory.
# Set the INSTACART_PROJECT_PATH environment variable to run this notebook on
# another machine; when it is not set, the original local folder is used.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'C:\Users\esteb\OneDrive\Desktop\2025_Instacart Basket Analysis',
)

In [4]:
# Order Data
# Build the file location first, then load the raw orders CSV from it
orders_csv = os.path.join(path, '02_Data', 'Original Data', 'orders.csv')
df_ords = pd.read_csv(orders_csv)

In [5]:
# Products Data
# Build the file location first, then load the raw products CSV from it
products_csv = os.path.join(path, '02_Data', 'Original Data', 'products.csv')
df_prods = pd.read_csv(products_csv)

In [6]:
# Departments Data
# Build the file location first, then load the raw departments CSV from it
departments_csv = os.path.join(path, '02_Data', 'Original Data', 'departments.csv')
df_deps = pd.read_csv(departments_csv)

# 03. Data Wrangling

In [7]:
# Check the size of the products data as (rows, columns)
df_prods.shape

(49693, 5)

In [7]:
# Preview the first rows of the orders data as loaded, before any wrangling
df_ords.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [8]:
# Eliminate eval_set column
# errors='ignore' keeps this cell re-runnable: once eval_set has been removed,
# running the cell again is a no-op instead of raising a KeyError.
df_ords = df_ords.drop(columns = ['eval_set'], errors = 'ignore')

In [9]:
# Cast the identifier columns to strings so they are treated as labels and
# left out of numeric summaries such as describe()
id_dtypes = {'order_id': 'str', 'user_id': 'str'}
df_ords = df_ords.astype(id_dtypes)

In [10]:
# Give the day-of-week column a self-explanatory name
column_renames = {'order_dow' : 'order_day_of_week'}
df_ords = df_ords.rename(columns = column_renames)

In [11]:
# Confirm the wrangling so far: eval_set is gone and order_dow is now order_day_of_week
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [12]:
# Count how many orders were placed in each hour of the day;
# the counts are listed from most to least frequent, so the busiest hour is first
order_hours = df_ords['order_hour_of_day']
order_hours.value_counts()

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

The busiest hour of the day for placing orders is 10 a.m., with 288,418 orders.

# 04. Transposing Data

In [13]:
# Taking a look at columns
# The departments data is stored wide: the department ids are the column
# headers and a single row holds the department names
df_deps.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [14]:
# Transpose df_deps so that each department becomes a row instead of a column
df_deps_t = df_deps.transpose()

In [15]:
# The first row of the transposed frame holds the column label, so keep it as the new header
new_header = df_deps_t.iloc[0, :]

In [16]:
# Inspect the header row: it holds a single value, 'department'
new_header

0    department
Name: department_id, dtype: object

In [17]:
# New dataframe without the first row (the header row), keeping every row after it
df_deps_t_new = df_deps_t.iloc[1:]

In [18]:
# Set the New Header to the Dataframe
# new_header holds the single value 'department', so the one data column is now labelled 'department'
df_deps_t_new.columns = new_header

In [19]:
# Display the reshaped departments table: one row per department id, with its name in 'department'
df_deps_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


# 05. Data Dictionary

In [20]:
# Build a lookup dictionary keyed by the dataframe index (the department id),
# where each value is a {'department': name} mapping
data_dict = df_deps_t_new.to_dict(orient = 'index')

In [21]:
# Display the data dictionary; note that the department ids are string keys ('1', '2', ...)
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

The department_id 4 corresponds to the produce department.

# 06. Subset

In [22]:
# Keep only the products that belong to the breakfast department (department_id 14)
is_breakfast = df_prods['department_id'] == 14
df_breakfast = df_prods.loc[is_breakfast]

In [23]:
# Display the breakfast subset (rows of df_prods where department_id is 14)
df_breakfast

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


In [24]:
# Dinner-party subset: products from the alcohol (5), beverages (7),
# meat seafood (12) and deli (20) departments
dinner_party_departments = [5, 7, 12, 20]
in_dinner_party_department = df_prods['department_id'].isin(dinner_party_departments)
df_dinner_parties = df_prods[in_dinner_party_department]

In [25]:
# Display the dinner-party subset (products whose department_id is 5, 7, 12 or 20)
df_dinner_parties

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
...,...,...,...,...,...
49676,49672,Cafe Mocha K-Cup Packs,26,7,6.5
49679,49675,Cinnamon Dolce Keurig Brewed K Cups,26,7,14.0
49680,49676,Ultra Red Energy Drink,64,7,14.5
49686,49682,California Limeade,98,7,4.3


Executing the dataframe name df_dinner_parties on its own displays a preview of the data together with its total number of rows and columns. The new dataframe has 7,650 rows.

In [26]:
# Alternative: .shape gives the number of rows and columns as (rows, columns)
# without displaying a preview of the data
df_dinner_parties.shape

(7650, 5)

In [27]:
# Orders placed by the customer with user_id 1
# (user_id was cast to a string earlier, so it is compared with '1', not 1)
is_customer_1 = df_ords['user_id'] == '1'
df_customer_1 = df_ords.loc[is_customer_1]

In [28]:
# Summary statistics for the orders of user_id 1
# Only the numeric columns are summarised; order_id and user_id are strings and do not appear
df_customer_1.describe()

Unnamed: 0,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


# 07. Exporting Data

In [30]:
# Save the wrangled orders dataframe to the 'Prepared Data' folder
# NOTE(review): no index=False is passed, so the row index is written as well -
# confirm that downstream readers expect the extra leading column
orders_export_csv = os.path.join(path, '02_Data', 'Prepared Data', 'orders_wrangled_task.csv')
df_ords.to_csv(orders_export_csv)

In [56]:
# Save the transposed departments dataframe to the 'Prepared Data' folder;
# the index (the department ids) is written along with the 'department' column
departments_export_csv = os.path.join(path, '02_Data', 'Prepared Data', 'departments_wrangled_task.csv')
df_deps_t_new.to_csv(departments_export_csv)