#    Import libraries to be used for analysis

In [2]:
# import libraries
import numpy as np
import pandas as pd
import os

In [3]:
# Import the datasets from the project folder and create a dataframe for each file: orders.csv, products.csv and departments.csv

In [4]:
# Root folder of the Instacart project. Setting the INSTACART_PROJECT_DIR
# environment variable overrides the default below, so the notebook can run on
# another machine without editing this cell.
file_path = os.environ.get(
    "INSTACART_PROJECT_DIR",
    r"D:\career Foundary\Data_immersion\4_python_projects\Instacart Basket Analysis",
)


In [5]:
# Build the path to the raw orders file first, then load it into a dataframe.
orders_csv_path = os.path.join(file_path, '02 Data', 'Original Data', 'orders.csv')
df_orders = pd.read_csv(orders_csv_path, index_col=False)

In [6]:
# Build the path to the raw products file first, then load it into a dataframe.
products_csv_path = os.path.join(file_path, '02 Data', 'Original Data', 'products.csv')
df_products = pd.read_csv(products_csv_path, index_col=False)

In [7]:
# Build the path to the raw departments file first, then load it into a dataframe.
departments_csv_path = os.path.join(file_path, '02 Data', 'Original Data', 'departments.csv')
df_departments = pd.read_csv(departments_csv_path, index_col=False)

In [8]:
df_orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


#    Wrangling procedures

In [9]:
# Since we don't need the "eval_set" column for analysis, we can drop it.
# Take care here, however: overwriting dataframes can be risky, so create a new dataframe instead of overwriting the original.

In [10]:
# drop() returns a new frame, so df_orders itself keeps its "eval_set" column.
df_orders_no_eval_set = df_orders.drop("eval_set", axis="columns")

In [11]:
df_orders_no_eval_set

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


In [12]:
# if you notice a column contains multiple missing values, you can investigate further by using....df['variable'].value_counts(dropna = False)
# dropna = False. This is included to ensure the function doesn’t drop any missing values, which is important because you’re currently searching the column specifically to look for missing values. 
# Dropping them would eliminate the point!

In [13]:
# Count how often each value occurs; dropna = False keeps NaN in the tally so missing values stay visible.
df_orders_no_eval_set['days_since_prior_order'].value_counts(dropna = False)

30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: days_since_prior_order, dtype: int64

In [14]:
 # Rename a column for clarity, but always check with the client before changing column names.

In [15]:
# Rename by reassignment rather than inplace = True: rename() returns a new
# frame, which keeps the lineage of df_orders_no_eval_set explicit in this cell.
df_orders_no_eval_set = df_orders_no_eval_set.rename(columns = {'order_dow' : 'orders_day_of_week'})

In [16]:
df_orders_no_eval_set

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


# Changing a Variable’s Data Type

When using the describe() function, you don’t necessarily want to perform it on your entire dataframe. Usually, you only need it for specific columns relevant to the analysis you’re currently conducting. For instance, there would never be a need to calculate statistics for the "order_id" and "user_id" columns—they don’t represent real numeric values, rather, keys related to specific orders and users. When executing the describe() function, you can exclude these columns by changing their data type to something describe() will ignore, such as a string:

In [17]:
# order_id is an identifier, not a quantity: store it as text so describe() skips it.
order_id_as_text = df_orders_no_eval_set['order_id'].astype(str)
df_orders_no_eval_set['order_id'] = order_id_as_text

In [18]:
df_orders_no_eval_set['order_id'].dtype

dtype('O')

In [19]:
# You should get an “O,” which stands for object, pandas’ version of the string:

# Transposing Data

In [20]:
# Transposing refers to turning your dataframe’s rows into columns, and vice versa. Here it turns the wide, single-row departments table into a long table with one row per department.

In [21]:
df_departments

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [22]:
# To turn the table or dataframe we use .T which is short for transpose

In [23]:
# Swap rows and columns (transpose() is the long form of .T).
df_departments_transposed = df_departments.transpose()

In [24]:
df_departments_transposed

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [25]:
# To add a numeric index for easier access, use reset_index()

In [26]:
# reset_index() returns a new frame with the old index moved into an "index" column.
# The result is only displayed here, not assigned, so df_departments_transposed itself is unchanged.
df_departments_transposed.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [27]:
# the goal is to remove the 0 as header and give the table a proper header

In [28]:
# Take the first row (all columns) to use as the new header.
New_header = df_departments_transposed.iloc[0, :]

In [29]:
New_header

0    department
Name: department_id, dtype: object

In [30]:
df_departments_transposed

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [31]:
#  As such, let’s create a new dataframe that only copies over rows beyond the first row and leave behind the old header

In [32]:
# Keep every row after the first one (the old header row). Slicing with
# .iloc[1:] avoids hard-coding the row count, and .copy() gives an independent
# frame, so relabelling its columns later cannot affect or warn about
# df_departments_transposed.
df_departments_transposed_2 = df_departments_transposed.iloc[1:].copy()

In [33]:
df_departments_transposed_2

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [34]:
# Use the saved first row as the column labels. New_header is a Series named
# 'department_id', and that name appears as the label above the columns in the next display.
df_departments_transposed_2.columns = New_header

In [35]:
df_departments_transposed_2

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


#   Data Dictionaries

In [36]:
# The goal is to create key-value pairs so that each department ID (the key) maps to its department name (the value). Create the dictionary using the index as the key.

In [37]:
# One entry per index label: {department_id: {'department': name}}.
data_dict = df_departments_transposed_2.to_dict(orient="index")

In [38]:
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [39]:
df_products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [40]:
# The dictionary keys are strings (they come from the transposed frame's index), so look up '19' rather than 19.
data_dict.get('19')

{'department': 'snacks'}

#  Subsetting

In [41]:
# Create a subset of the df_products dataframe that only contains data from the snacks department. There are two methods for this.

In [42]:
# 1

In [43]:
# Boolean mask: True for products whose department_id is 19 (snacks).
is_snack = df_products['department_id'] == 19
df_snacks = df_products[is_snack]

In [44]:
df_snacks.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


In [45]:
# 2

In [46]:
# Same subset via isin(), which accepts a list of department IDs to keep.
snack_department_ids = [19]
df_snacks_2 = df_products.loc[df_products['department_id'].isin(snack_department_ids)]

In [47]:
df_snacks_2

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


In [48]:
# Export df_orders_no_eval_set rather than df_orders: df_orders is the untouched
# raw frame, while df_orders_no_eval_set carries the wrangling done so far
# (eval_set dropped, order_dow renamed, order_id stored as text).
df_orders_no_eval_set.to_csv(os.path.join(file_path, '02 Data','Prepared Data', 'orders_wrangled.csv'))

# TASK 4.4

In [49]:
# Find another identifier variable in the df_ords dataframe that doesn’t need to be included in your analysis as a numeric variable and change it to a suitable format.

In [50]:
# user_id is also an identifier: store it as text so it is excluded from numeric summaries.
user_id_as_text = df_orders_no_eval_set['user_id'].astype(str)
df_orders_no_eval_set['user_id'] = user_id_as_text

In [51]:
# Look for a variable in your df_ords dataframe with an unintuitive name and change its name without overwriting the data frame.

In [52]:
# Rename by reassignment rather than inplace = True: rename() returns a new
# frame, and the later cells keep using the name df_orders_no_eval_set.
df_orders_no_eval_set = df_orders_no_eval_set.rename(columns = {"order_hour_of_day": "hour_of_the_day_order_was_made"})

In [53]:
# Your client wants to know what the busiest hour is for placing orders. Find the frequency of the corresponding variable and share your findings.

In [54]:
# Number of orders per hour of day; value_counts() sorts by count in descending order, so the first row is the busiest hour.
df_orders_no_eval_set["hour_of_the_day_order_was_made"].value_counts(dropna = False)

10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: hour_of_the_day_order_was_made, dtype: int64

   The busiest hour for placing orders is 10 am (288,418 orders).

In [55]:
#  Determine the meaning behind a value of 4 in the "department_id" column within the df_prods dataframe using a data dictionary.

In [56]:
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [57]:
data_dict.get('4')

{'department': 'produce'}

In [58]:
# The sales team in your client’s organization wants to know more about breakfast item sales. Create a subset containing only the required information.

In [59]:
df_products

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


In [60]:
# Department 14 is "breakfast" in data_dict; keep only those products.
df_breakfast = df_products.loc[df_products["department_id"].eq(14)]

In [61]:
df_breakfast

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


In [62]:
# They’d also like to see details about customers who might be throwing dinner parties. Your task is to find all observations from the entire dataframe that include items from the following departments: alcohol, deli, beverages, and meat/seafood. You’ll need to present this subset to your client.

In [63]:
# Boolean mask for the dinner-party departments:
# 5 = alcohol, 7 = beverages, 12 = meat seafood, 20 = deli.
is_dinner_party_item = df_products['department_id'].isin([5,7,12,20])
df_dinner_party  =  df_products[is_dinner_party_item]

In [64]:
# Departments associated with dinner parties, named as they appear in data_dict.
dinner_party_departments = {'alcohol', 'beverages', 'meat seafood', 'deli'}
# Look the IDs up in the data dictionary instead of hard-coding them.
# data_dict keys are strings while department_id in df_products is numeric,
# so each matching key is converted with int().
dinner_party_ids = [
    int(dept_id)
    for dept_id, entry in data_dict.items()
    if entry['department'] in dinner_party_departments
]
df_dinner_party = df_products.loc[df_products['department_id'].isin(dinner_party_ids)]

In [65]:
df_dinner_party 

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
...,...,...,...,...,...
49676,49672,Cafe Mocha K-Cup Packs,26,7,6.5
49679,49675,Cinnamon Dolce Keurig Brewed K Cups,26,7,14.0
49680,49676,Ultra Red Energy Drink,64,7,14.5
49686,49682,California Limeade,98,7,4.3


In [66]:
# It has 7650 rows and 5 columns

In [67]:
# Someone from the data engineers team in Instacart thinks they’ve spotted something strange about the customer with a "user_id" of “1.” Extract all the information you can about this user.

In [68]:
df_orders_no_eval_set

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,hour_of_the_day_order_was_made,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


In [69]:
 df_orders_no_eval_set["user_id"]

0               1
1               1
2               1
3               1
4               1
            ...  
3421078    206209
3421079    206209
3421080    206209
3421081    206209
3421082    206209
Name: user_id, Length: 3421083, dtype: object

In [70]:

 # user_id was converted to text in an earlier cell, so compare against the string '1'.
 df_user_1 = df_orders_no_eval_set.loc[df_orders_no_eval_set['user_id'] == '1']

In [71]:
df_user_1

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,hour_of_the_day_order_was_made,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


In [72]:
# Summary statistics for user 1's orders. order_id and user_id were cast to strings in earlier cells, so describe() leaves them out.
df_user_1.describe()

Unnamed: 0,order_number,orders_day_of_week,hour_of_the_day_order_was_made,days_since_prior_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


In [73]:
# Export df_orders_no_eval_set rather than df_orders: df_orders is the untouched
# raw frame, while df_orders_no_eval_set holds every wrangling step in this
# notebook (eval_set dropped, columns renamed, order_id and user_id stored as text).
df_orders_no_eval_set.to_csv(os.path.join(file_path, '02 Data','Prepared Data', 'orders_wrangled.csv'))

In [74]:
# Write the cleaned departments table (index = department id) to the Prepared Data folder.
departments_out_path = os.path.join(file_path, '02 Data','Prepared Data', 'departments_wrangled.csv')
df_departments_transposed_2.to_csv(departments_out_path)