# This script wrangles and reformats data from the 'orders', 'products' and 'departments' datasets, answers several ad-hoc client requests, and exports the wrangled data

## The Script contains the following sections
### 1. Importing Libraries
### 2. Importing Data
### 3. Data Wrangling
### 4. Reformatting Variables
### 5. Ad-hoc client requests
### 6. Exporting DFs


## 1. Importing Libraries

In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import os

## 2. Importing Data

In [2]:
# Project root folder; every data file below is addressed relative to it
path = r'C:\Users\seank\OneDrive\Dokumente\Career Foundry Data Analytics Course\Data Immersion\4 Python\03-2020_Instacart_Basket _Analysis'

# Columns to load from orders.csv (eval_set is deliberately left out)
orders_vars_list = [
    'order_id',
    'user_id',
    'order_number',
    'order_dow',
    'order_hour_of_day',
    'days_since_prior_order',
]

# Full locations of the two original csv files
orders_csv = os.path.join(path, '02_Data', 'Original_Data', 'orders.csv')
products_csv = os.path.join(path, '02_Data', 'Original_Data', 'products.csv')

# Orders: only the columns listed above
df_ords = pd.read_csv(orders_csv, index_col=False, usecols=orders_vars_list)

# Products: all variables
df_prods = pd.read_csv(products_csv, index_col=False)

In [3]:
    # Importing departments dataset
# Build the file location first, then read the departments csv from it
departments_csv = os.path.join(path, '02_Data', 'Original_Data', 'departments.csv')
df_dep = pd.read_csv(departments_csv, index_col=False)

# Preview the raw layout (last expression of the cell, so the notebook shows it)
df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


## 3. Data Wrangling

In [4]:
# Flip rows and columns of the departments df so that each department gets its own row
df_dep_t = df_dep.transpose()

In [5]:
# Previewing the transposed df with a reset index.
# NOTE: the result is only displayed, not assigned, so df_dep_t itself keeps
# its original index.
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [6]:
# Promote the first row of the transposed df to the column header
new_header = df_dep_t.iloc[0]        # first row carries the header label
df_dep_t_new = df_dep_t.iloc[1:]     # every row after the header row
df_dep_t_new.columns = new_header    # use the extracted row as column names

In [7]:
# Build the data dictionary: one entry per index label of the departments df,
# each holding a {column name: value} mapping
data_dict = df_dep_t_new.to_dict(orient='index')
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [10]:
# Turn the data dictionary back into a df (dictionary keys become the index)
df_data_dict = pd.DataFrame.from_dict(data_dict, orient="index")

# Store the data dictionary df as a pickle file in the prepared-data folder
dict_pickle_file = os.path.join(path, "02_Data", "Prepared_Data", "dep_data_dict.pkl")
df_data_dict.to_pickle(dict_pickle_file)

In [9]:
# Give the day-of-week column a more descriptive name (df_ords is modified in place)
df_ords.rename(
    columns={'order_dow': 'orders_day_of_week'},
    inplace=True,
)

In [10]:
# Changing order_id to string (note: this is the order_id column, not order_number)
df_ords['order_id'] = df_ords['order_id'].astype('str')
df_ords['order_id'].dtype    # checking if it worked - it did: dtype('O') is the object dtype pandas uses for strings

dtype('O')

## 4. Reformatting Variables

Data Type Suitability

In [11]:
    # Investigating the variables
# Print the column overview first; at this point user_id is still reported as int64
df_ords.info()

# user_id is an index that doesn't need to be an integer, so it is stored as text
df_ords['user_id'] = df_ords['user_id'].astype(str)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                object 
 1   user_id                 int64  
 2   order_number            int64  
 3   orders_day_of_week      int64  
 4   order_hour_of_day       int64  
 5   days_since_prior_order  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 156.6+ MB


Column Renaming Check

In [12]:
# Check current column names to confirm that order_dow now appears as
# orders_day_of_week after the rename
df_ords.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_prior_order'],
      dtype='object')

## 5. Ad-hoc Client Requests

In [None]:
# Client wants to know what the busiest hour is for placing orders
# Count the orders per hour once and reuse the result for both lookups,
# instead of scanning the whole orders df twice
hour_counts = df_ords['order_hour_of_day'].value_counts()
busiest_hour = hour_counts.idxmax()  # Hour with max orders
order_count = hour_counts.max()      # Count of orders at busiest hour
print(f"The busiest hour for placing orders is {busiest_hour}:00 with {order_count} orders.")

The busiest hour for placing orders is 10:00 with 288418 orders.


In [None]:
# Checking overall distribution of orders per hour of day, most frequent hour first.
# dropna = False makes missing hours show up as their own row if there are any.
df_ords['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

Sales Team wants a subset with breakfast item sales

In [None]:
# Locating Breakfast in the departments dictionary to find its department id
data_dict    # 'breakfast' is listed under key '14'

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [20]:
# Keep only the products whose department_id is 14 (the breakfast department)
is_breakfast = df_prods['department_id'] == 14
df_breakfast = df_prods[is_breakfast]
df_breakfast.head()    # viewing first 5 rows

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6


They also want another subset of data with all dinner party items (alcohol, deli, beverages, and meat/seafood), and want to know how many rows that dataframe contains

In [None]:
# Locating these departments in the data dictionary to find their department ids
data_dict    # alcohol = 5, deli = 20, beverages = 7, meat seafood = 12


{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [22]:
# The subsets filter department_id with integer ids, so the column needs a numeric dtype
df_prods['department_id'].dtype    # ensuring department_id is integer

dtype('int64')

In [23]:
 # subsetting
# Department ids for the dinner party items: alcohol, deli, beverages, meat seafood
dinner_party_departments = [5, 20, 7, 12]
# Boolean mask marking the products that belong to one of those departments
is_dinner_party = df_prods['department_id'].isin(dinner_party_departments)
df_dinner_party = df_prods.loc[is_dinner_party]

In [24]:
# The length of a df is its number of rows
num_of_rows = len(df_dinner_party)
print(f"The number of rows is {num_of_rows}")

The number of rows is 7650


Someone from the data engineering team thinks they’ve spotted something strange about the customer with a "user_id" of “1”. The cells below extract all the information available about this user.

In [None]:
# user_id was converted to string earlier, so the comparison uses the text '1'
is_user1 = df_ords['user_id'] == '1'
df_user1 = df_ords.loc[is_user1]
df_user1

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


In [26]:
# Basic stats on this user. Only the numeric columns are summarised;
# order_id and user_id were converted to strings and are left out.
df_user1.describe()  

Unnamed: 0,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


In [27]:
# Also calculating the most often occurring value of every column for this user.
# mode() can return several rows when values tie; .iloc[0] keeps only the first row.
# NOTE(review): for columns where every value is unique (e.g. order_id) the
# reported "mode" is therefore just the first of the tied values, not a real peak.
user1modes = df_user1.mode().iloc[0] # also calculating most often occurring values
print(user1modes)

order_id                  1187899
user_id                         1
order_number                    1
orders_day_of_week            4.0
order_hour_of_day             7.0
days_since_prior_order       14.0
Name: 0, dtype: object


In [28]:
# Summary of the findings for user 1, written as adjacent string literals
# that Python joins into one single-line message
print(
    'they have 11 total orders, '
    'most frequently ordered on Wednesday, '
    'Average time is 10am, '
    'with an average of 19 days between orders'
)

they have 11 total orders, most frequently ordered on Wednesday, Average time is 10am, with an average of 19 days between orders


## 6. Exporting DFs

In [60]:
# Write the wrangled orders df to the prepared-data folder, leaving out the row index
orders_out_file = os.path.join(path, '02_Data', 'Prepared_Data', 'orders_wrangled.csv')
df_ords.to_csv(orders_out_file, index=False)

In [62]:
# Exporting the wrangled departments df to csv.
# The department ids are stored in the index of df_dep_t_new (they are the keys
# of data_dict), and 'department' is its only column. Writing with index=False
# would drop the ids and export the department names alone, so the index is
# written out as a column labelled 'department_id'.
df_dep_t_new.to_csv(os.path.join(path, '02_Data', 'Prepared_Data', 'departments_wrangled.csv'),
                    index=True,
                    index_label='department_id')