# 4.9 Part 1.2 - Intro to Data Visualisation with Python

## Clean & wrangle ords_prods; merge with customers dataframes

### This script contains the following points:

#### 1. Import libraries
#### 2. Import dataframes
#### 3. Data wrangling
####      3.1 ords_prods dataframe
#### 4. Consistency checks
####      4.1 ords_prods dataframe
#### 5. Merging the dataframes


# Import libraries

In [1]:
# import libraries

import pandas as pd
import numpy as np
import os

# Import dataframes

In [2]:
# project folder path

path = r'C:\Users\Odette\Desktop\CareerFoundry\Immersion Courses\Course 4\Instacart Basket Analysis'

In [3]:
# import of customers data

customers = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'customers_clean.csv'), index_col = False)

In [4]:
# import of orders_products data

ords_prods = pd.read_pickle(os.path.join(path, '02 Data','Prepared Data', 'orders_products.pkl'))

In [5]:
ords_prods.head()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,...,_merge,price_range_loc,busiest_days,busiest_period_of_day,max_order,loyalty_flag,average_spend,spending_type,median_order_days,order_frequency
0,0,2539329,1,1,2,8,0.0,196,1,0,...,both,Mid-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer
1,1,2398795,1,2,3,7,15.0,196,1,1,...,both,Mid-range product,Least busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer
2,2,473747,1,3,3,12,21.0,196,1,1,...,both,Mid-range product,Least busy,Most orders,10,New customer,6.367797,Low spender,20.0,Regular customer
3,3,2254736,1,4,4,7,29.0,196,1,1,...,both,Mid-range product,Least busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer
4,4,431534,1,5,4,15,28.0,196,1,1,...,both,Mid-range product,Least busy,Most orders,10,New customer,6.367797,Low spender,20.0,Regular customer


In [6]:
ords_prods.shape

(32404859, 24)

# Data Wrangling

## ords_prods dataframe

In [7]:
# dropping Unnamed: 0 column

ords_prods = ords_prods.drop(columns = ['Unnamed: 0', '_merge'])

In [8]:
ords_prods.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,product_name,...,prices,price_range_loc,busiest_days,busiest_period_of_day,max_order,loyalty_flag,average_spend,spending_type,median_order_days,order_frequency
0,2539329,1,1,2,8,0.0,196,1,0,Soda,...,9.0,Mid-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,9.0,Mid-range product,Least busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,9.0,Mid-range product,Least busy,Most orders,10,New customer,6.367797,Low spender,20.0,Regular customer
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,9.0,Mid-range product,Least busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,9.0,Mid-range product,Least busy,Most orders,10,New customer,6.367797,Low spender,20.0,Regular customer


# Consistency checks

## ords_prods dataframe

In [9]:
ords_prods.dtypes

order_id                   int64
user_id                    int64
order_number               int64
orders_day_of_week         int64
order_hour_of_day          int64
days_since_last_order    float64
product_id                 int64
add_to_cart_order          int64
reordered                  int64
product_name              object
aisle_id                   int64
department_id              int64
prices                   float64
price_range_loc           object
busiest_days              object
busiest_period_of_day     object
max_order                  int64
loyalty_flag              object
average_spend            float64
spending_type             object
median_order_days        float64
order_frequency           object
dtype: object

In [10]:
# convert the days_since_last_order, orders_day_of_week, order_number & order_hour_of_day columns to int8

ords_prods[['days_since_last_order', 'orders_day_of_week', 'order_number', 'order_hour_of_day', 'max_order']] = ords_prods[['days_since_last_order', 'orders_day_of_week', 'order_number', 'order_hour_of_day', 'max_order']].astype('int8')

In [11]:
# convert the user_id, product_id, aisle_id, department_id & order_id column to category format

ords_prods[['user_id', 'product_id', 'aisle_id', 'department_id', 'order_id']] = ords_prods[['user_id', 'product_id', 'aisle_id', 'department_id', 'order_id']].astype('category')

In [12]:
# convert the median_order_days, prices & average_spend columns to float16

ords_prods[['median_order_days', 'prices', 'average_spend']] = ords_prods[['median_order_days', 'prices', 'average_spend']].astype('float16')

In [13]:
# convert the price_range_loc, loyalty_flag, order_frequency, busiest_days & busiest_period_of_day columns to category

ords_prods[['price_range_loc', 'loyalty_flag', 'order_frequency', 'busiest_days','busiest_period_of_day']] = ords_prods[['price_range_loc', 'loyalty_flag', 'order_frequency', 'busiest_days', 'busiest_period_of_day']].astype('category')

In [14]:
# convert the reordered column to bool format

ords_prods['reordered'] = ords_prods['reordered'].astype('bool')

In [15]:
# convert the add_to_cart_order column to int16 format

ords_prods['add_to_cart_order'] = ords_prods['add_to_cart_order'].astype('int16')

In [16]:
ords_prods.dtypes

order_id                 category
user_id                  category
order_number                 int8
orders_day_of_week           int8
order_hour_of_day            int8
days_since_last_order        int8
product_id               category
add_to_cart_order           int16
reordered                    bool
product_name               object
aisle_id                 category
department_id            category
prices                    float16
price_range_loc          category
busiest_days             category
busiest_period_of_day    category
max_order                    int8
loyalty_flag             category
average_spend             float16
spending_type              object
median_order_days         float16
order_frequency          category
dtype: object

In [17]:
# mark values over $100 in prices as missing (NaN) as they don't make sense in relation to every other price

ords_prods.loc[ords_prods['prices'] >100, 'prices'] = np.nan

In [18]:
# check it worked by running a max value check

ords_prods['prices'].max()

25.0

## customers dataframe

In [19]:
customers.dtypes

user_id          int64
gender          object
state           object
age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

In [20]:
# convert the user_id column to string/object format

customers['user_id'] = customers['user_id'].astype('category')

In [21]:
# convert the date_joined column to datetime format

customers['date_joined'] = customers['date_joined'].astype('datetime64[s]')

In [22]:
# check data types of all columns

customers.dtypes

user_id               category
gender                  object
state                   object
age                      int64
date_joined     datetime64[ns]
n_dependants             int64
fam_status              object
income                   int64
dtype: object

# Merging the dataframes

In [23]:
# merge

df_merged = ords_prods.merge(customers, on = 'user_id', indicator = True)

In [24]:
df_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,product_name,...,median_order_days,order_frequency,gender,state,age,date_joined,n_dependants,fam_status,income,_merge
0,2539329,1,1,2,8,0,196,1,False,Soda,...,20.0,Regular customer,Female,Alabama,31,2019-02-17,3,married,40423,both
1,2398795,1,2,3,7,15,196,1,True,Soda,...,20.0,Regular customer,Female,Alabama,31,2019-02-17,3,married,40423,both
2,473747,1,3,3,12,21,196,1,True,Soda,...,20.0,Regular customer,Female,Alabama,31,2019-02-17,3,married,40423,both
3,2254736,1,4,4,7,29,196,1,True,Soda,...,20.0,Regular customer,Female,Alabama,31,2019-02-17,3,married,40423,both
4,431534,1,5,4,15,28,196,1,True,Soda,...,20.0,Regular customer,Female,Alabama,31,2019-02-17,3,married,40423,both


In [25]:
# export data to pkl 
 
df_merged.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'orders_products_final.pkl'))