## 01. Importing Libraries

In [45]:
import pandas as pd
import numpy as np 
import os

## 02. Importing Data

In [46]:
# Creating a path to the project folder; every import/export below joins onto it.
# Raw string (r'...') so the backslashes are not read as escape sequences.
# NOTE(review): absolute, machine-specific path - anyone else running this notebook
# has to edit this line first.
path = r'C:\Users\TanaT\(CF) Achievement 4 - Instacart Basket Analysis'

In [47]:
# Echoing the stored path to confirm it reads as intended
path

'C:\\Users\\TanaT\\(CF) Achievement 4 - Instacart Basket Analysis'

In [48]:
# Importing orders data as a dataframe
# (index_col=False: do not use the first CSV column as the row index)
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'orders.csv'),
    index_col=False,
)

In [49]:
# Importing products data as a dataframe
# (index_col=False: do not use the first CSV column as the row index)
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col=False,
)

In [50]:
# Importing department data as a dataframe
# (index_col=False: do not use the first CSV column as the row index)
df_dept = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'departments.csv'),
    index_col=False,
)

## 03. Data Wrangling Procedures (from exercise)

In [51]:
# Checking orders dataframe: previewing the first rows and the column names
df_ords.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [52]:
# Previewing the orders dataframe without the 'eval_set' column
# (the result is only displayed; df_ords itself is not changed by this cell)
df_ords.drop(columns = ['eval_set'])

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


In [53]:
# Reassigning df_ords to the result of drop(), i.e. the orders data without 'eval_set'
df_ords = df_ords.drop('eval_set', axis = 1)

## 04. Renaming Columns

In [54]:
# Renaming column 'order_dow' to 'orders_day_of_week'.
# The result is reassigned to df_ords (instead of using inplace = True) so this
# cell follows the same "df_ords = df_ords.<method>(...)" pattern as the
# 'eval_set' drop above.
df_ords = df_ords.rename(columns = {'order_dow' : 'orders_day_of_week'})

In [55]:
# Checking that 'order_dow' now appears as 'orders_day_of_week'
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


## 05. Changing a Variable's Data Type

In [56]:
# Descriptive statistics before any type changes: the ID columns ('order_id',
# 'user_id') are numeric at this point, so they are summarised like measurements.
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [57]:
# ID columns are labels rather than quantities, so they should be ignored by
# statistical operations such as describe().
## Casting 'order_id' from number to string (object) takes it out of those summaries
df_ords['order_id'] = df_ords['order_id'].astype(str)

In [58]:
# Confirming the cast: the column's dtype should now be object ('O')
df_ords['order_id'].dtype

dtype('O')

In [59]:
# Checking the data types of every column in the orders dataframe
df_ords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                object 
 1   user_id                 int64  
 2   order_number            int64  
 3   orders_day_of_week      int64  
 4   order_hour_of_day       int64  
 5   days_since_prior_order  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 156.6+ MB


## 06. Transposing Department Data

In [60]:
# Checking department dataframe - it is in wide format: a single row, with one
# column per department id
df_dept.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [61]:
# Previewing the transpose of df_dept (rows become columns and vice versa);
# the result is only displayed here, not stored
df_dept.T

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [62]:
# Storing the transposed version in a new dataframe, df_dept_t
# (df_dept itself is not overwritten)
df_dept_t = df_dept.T

In [63]:
# Checking the stored transposed dataframe (df_dept_t) rather than transposing
# df_dept a second time
df_dept_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [64]:
# Fixing header and first row issues
## Previewing what reset_index() would give: the old row labels move into an 'index' column.
## The result is not assigned, so df_dept_t still has its original row labels after this cell.
df_dept_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [65]:
# Creating a new header - taking the first row (position 0) of df_dept_t.
# That row is labelled 'department_id' and holds the text 'department'.
new_header = df_dept_t.iloc[0]

In [66]:
# Checking 'new_header', which now holds the first row of df_dept_t
new_header

0    department
Name: department_id, dtype: object

In [67]:
# Creating a new department dataframe
## Keeping every row of df_dept_t from position 1 onward, i.e. everything
## EXCEPT the first row (which is used as the header instead)
df_dept_t_new = df_dept_t.iloc[1:]

In [68]:
# Checking that the first row is gone; the column is still labelled 0 at this point
df_dept_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [69]:
# Putting the 'new_header' variable as the header for df_dept_t_new
# (new_header holds a single value, so the one column becomes 'department')
df_dept_t_new.columns = new_header

In [70]:
# Checking the new header: department ids as row labels, one 'department' column
df_dept_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


## 07. Data Dictionary

In [71]:
# Turning departments data into a data dictionary:
# one entry per row label (department id), each mapping column name -> value
data_dict = df_dept_t_new.to_dict(orient='index')

In [72]:
# Checking the data dictionary (department id -> {'department': name})
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [73]:
# Previewing products data to see the 'department_id' values the dictionary decodes
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [74]:
# Using the data dictionary to find what the department numbers mean
## Finding what department 19 means in row 0 
## (the dictionary keys are strings, so the id is passed as '19', not 19)
print(data_dict.get('19'))

{'department': 'snacks'}


## Task 

In [75]:
# Step 2 - Find identifier variable and change data type. 
## Checking which variable to change by looking at the first 25 rows
df_ords.head(25)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


In [76]:
## Changing 'order_number' data type to string so describe() leaves it out
## NOTE(review): 'order_number' looks like a per-user running count (1, 2, 3, ...), while
## 'user_id' is the ID-like column that is still numeric - confirm this is the intended variable.
df_ords['order_number'] = df_ords['order_number'].astype('str')

In [77]:
## Confirming the cast: the column's dtype should now be object ('O')
df_ords['order_number'].dtype

dtype('O')

In [78]:
## Checking that 'order_number' (like 'order_id' before it) is now excluded from descriptive statistics
df_ords.describe()

Unnamed: 0,user_id,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3214874.0
mean,102978.2,2.776219,13.45202,11.11484
std,59533.72,2.046829,4.226088,9.206737
min,1.0,0.0,0.0,0.0
25%,51394.0,1.0,10.0,4.0
50%,102689.0,3.0,13.0,7.0
75%,154385.0,5.0,16.0,15.0
max,206209.0,6.0,23.0,30.0


In [111]:
# Step 3 - Change variable name without overwriting the dataframe.

## Showing 'order_number' renamed to 'lifetime_order_count'.
## rename() hands back a new dataframe; with no assignment and no 'inplace', df_ords is untouched.
df_ords.rename({'order_number': 'lifetime_order_count'}, axis = 'columns')


Unnamed: 0,order_id,user_id,lifetime_order_count,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


In [80]:
## Checking that the original dataframe was not overwritten:
## the column should still be called 'order_number'
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [84]:
# Step 4 - What is the busiest hour for placing orders?
## Finding the count of each hour 
## (value_counts() lists the most frequent value first, so the busiest hour is the top row)

frequency_hours = df_ords['order_hour_of_day'].value_counts()
print(frequency_hours)

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64


In [85]:
## Hour 10 or 10am is the busiest hour for placing orders, because it has the highest count.

In [88]:
# Step 5 - Determine the meaning behind department id of 4 using a data dictionary.
## Using the data dictionary previously made from department data (displayed in full for reference)
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [89]:
## Finding what id of 4 means (keys are strings, so the id is passed as '4')
print(data_dict.get('4'))


{'department': 'produce'}


In [90]:
## Department_id of 4 is 'produce'.

In [93]:
# Step 6 - Create a subset of only breakfast items.
## 'breakfast' is 'department_id' 14 in the data dictionary, so keep only those rows
df_breakfast_items = df_prods.loc[df_prods['department_id'] == 14]

In [94]:
## Checking subset created: every row shown should have 'department_id' 14
df_breakfast_items.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6


In [96]:
# Step 7 - Details about products that customers might use to throw dinner parties.
## Making a subset with the specified departments, using the ids from the data dictionary
df_dinner_party_items = df_prods[
    df_prods['department_id'].isin([
        5,   # alcohol
        20,  # deli
        7,   # beverages
        12,  # meat seafood
    ])
]

In [99]:
## Checking first 25 rows of new subset: only department ids 5, 7, 12 and 20 should appear
df_dinner_party_items.head(25)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
19,20,Pomegranate Cranberry & Aloe Vera Enrich Drink,98,7,6.0
22,23,Organic Turkey Burgers,49,12,8.2
34,35,Italian Herb Porcini Mushrooms Chicken Sausage,106,12,15.1
38,39,Daily Tangerine Citrus Flavored Beverage,64,7,12.5
39,40,Beef Hot Links Beef Smoked Sausage With Chile ...,106,12,22.5


In [101]:
# Step 8 - How many rows of data does the last dataframe you created have?
## .shape gives (number of rows, number of columns)
df_dinner_party_items.shape

(7650, 5)

In [102]:
## 7,650 rows and 5 columns in the last dataframe made.

In [107]:
# Step 9 - Extract all information about 'user_id' of '1'.
## 'user_id' is still stored as a number here, so it is compared with the integer 1
df_ords_user_1 = df_ords[df_ords['user_id'] == 1]

In [108]:
## Info about 'user_id' of '1': every order row belonging to that user
df_ords_user_1

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


In [109]:
# Step 10 - Provide details about user_id 1 behaviors. What basic stats can you provide based on the information you have?
## Getting descriptive statistics on user_id of 1
## ('order_id' and 'order_number' were cast to strings earlier, so they are not summarised;
##  the 'user_id' column is constant here, so its statistics carry no information)
df_ords_user_1.describe()

Unnamed: 0,user_id,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,10.0
mean,1.0,2.636364,10.090909,19.0
std,0.0,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,1.0,1.5,7.5,14.25
50%,1.0,3.0,8.0,19.5
75%,1.0,4.0,13.0,26.25
max,1.0,4.0,16.0,30.0


In [None]:
## User_id 1 has placed 11 orders, most frequently on a Wednesday, and most orders are placed between 7am and 8am.
## Orders are placed quite infrequently (on average 19 days apart). The longest gap between orders was 30 days, and there was 1 same-day reorder (0 days since the prior order).

## Exporting Data

In [112]:
# Step 12 - Export df_ords dataframe as a csv file.
# NOTE(review): to_csv writes the row index as an unnamed first column by default, and a
# CSV does not record dtypes (the string casts above) - confirm downstream notebooks expect this.
df_ords.to_csv(
    os.path.join(path, '02 Data','Prepared Data', 'orders_wrangled.csv')
)

In [113]:
# Step 13 - Export df_dept_t_new dataframe as a csv file.
# The row labels hold the department ids, so the default (index written as the
# first column) is what keeps them in the file alongside 'department'.
df_dept_t_new.to_csv(
    os.path.join(path, '02 Data','Prepared Data', 'departments_wrangled.csv')
)