# 4.9.1 Data Visualization - Part 1

## This script contain following points:

### 01. Wrangle the customer dataset so that it follows consistent logic

### 02. Complete the fundamental data quality and consistency checks

### 03. Combine customer data with the rest of prepared Instacart data

### 04. Export this new dataframe as a pickle file

## 01. Import libraries

In [44]:
# Import libraries
import pandas as pd
import numpy as np
import os

## 02. Import the customer dataset

In [45]:
# Create path
path = r'C:\Users\Quinn\OneDrive\Documents\CF - Data Analyst\Data Immersion\Instacart Basket Analysis'

In [46]:
# Import customer dataset
customer = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'))

## 03. Wrangle the customer dataset 

In [47]:
# Check out the original dataset
customer.shape

(206209, 10)

In [48]:
customer.info

<bound method DataFrame.info of         user_id First Name    Surnam  Gender           STATE  Age date_joined  \
0         26711    Deborah  Esquivel  Female        Missouri   48    1/1/2017   
1         33890   Patricia      Hart  Female      New Mexico   36    1/1/2017   
2         65803    Kenneth    Farley    Male           Idaho   35    1/1/2017   
3        125935   Michelle     Hicks  Female            Iowa   40    1/1/2017   
4        130797        Ann   Gilmore  Female        Maryland   26    1/1/2017   
...         ...        ...       ...     ...             ...  ...         ...   
206204   168073       Lisa      Case  Female  North Carolina   44    4/1/2020   
206205    49635     Jeremy   Robbins    Male          Hawaii   62    4/1/2020   
206206   135902      Doris  Richmond  Female        Missouri   66    4/1/2020   
206207    81095       Rose   Rollins  Female      California   27    4/1/2020   
206208    80148    Cynthia     Noble  Female        New York   55    4/1/2020

#### Observation: The first name and surname of the customers are PII, which would be unnecessary information for our analysis. These columns should be dropped. 

#### I also renamed n_dependants to dependants and fam_status to family_status, which have more precise meanings.

In [49]:
# Drop the unnecessary columns
customer = customer.drop(columns = ['First Name','Surnam'])

In [50]:
# Rename two columns mentioned above
customer.rename(columns = {'n_dependants' : 'dependants', 'fam_status' : ' family_status'}, inplace = True)

In [51]:
# Check the result
customer.head()

Unnamed: 0,user_id,Gender,STATE,Age,date_joined,dependants,family_status,income
0,26711,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Female,Maryland,26,1/1/2017,1,married,40374


In [52]:
# Check for datatype of each column
customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   user_id         206209 non-null  int64 
 1   Gender          206209 non-null  object
 2   STATE           206209 non-null  object
 3   Age             206209 non-null  int64 
 4   date_joined     206209 non-null  object
 5   dependants      206209 non-null  int64 
 6    family_status  206209 non-null  object
 7   income          206209 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 12.6+ MB


#### The user_id column indicates the sequence of the customer; it shouldn't represent actual numeric values. I shall change the data type for this column first.

In [53]:
customer['user_id'] = customer['user_id'].astype('str')

In [54]:
customer['user_id'].dtype

dtype('O')

#### The date_joined column is set as 'str' datatype, which is not usefull for our analysis, so I change it to datetime datatype

In [55]:
customer['date_joined'] = pd.to_datetime(customer['date_joined'])

In [56]:
# Check out the result
customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   user_id         206209 non-null  object        
 1   Gender          206209 non-null  object        
 2   STATE           206209 non-null  object        
 3   Age             206209 non-null  int64         
 4   date_joined     206209 non-null  datetime64[ns]
 5   dependants      206209 non-null  int64         
 6    family_status  206209 non-null  object        
 7   income          206209 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(4)
memory usage: 12.6+ MB


## 04. Complete the fundamental data quality and consistency checks

In [57]:
# Check for mixtyped data
mixed_dtype = pd.api.types.is_object_dtype(customer)

In [58]:
print("Mixed Data Types:", mixed_dtype) 

Mixed Data Types: False


#### The result shows that there is no column with mixtyped data

In [59]:
# Check for missing values
customer.isnull().sum()

user_id           0
Gender            0
STATE             0
Age               0
date_joined       0
dependants        0
 family_status    0
income            0
dtype: int64

#### There is no missing value in the dataframe

In [60]:
# Check for duplicated values
customer_dups = customer[customer.duplicated()]

In [61]:
customer_dups

Unnamed: 0,user_id,Gender,STATE,Age,date_joined,dependants,family_status,income


#### There is no duplicates found in the customer dataset

In [66]:
# Check out the final observations in the customer dataset
customer.shape

(206209, 8)

## 05. Combine customer data with the rest of prepared Instacart data

In [63]:
#Import the grouped dataset from previous exercise
ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged_group.pkl'))

In [64]:
# Check the shape of the ords_prods_merge dataset
ords_prods_merge.shape

(32404854, 25)

In [73]:
ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,first_order,product_id,add_to_cart_order,reordered,...,price_range_loc,busiest day,busiest days,busiest_period_of_day,max_order,loyalty_flag,avg_price,spender_flag,Customer_frequency,frequency_flag
0,2539329,1,1,2,8,,First Order,196,1,0,...,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non_frequent_customer
1,2398795,1,2,3,7,15.0,Repeat Customer,196,1,1,...,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non_frequent_customer
2,473747,1,3,3,12,21.0,Repeat Customer,196,1,1,...,Mid-range product,Regularly busy,Regularly busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non_frequent_customer
3,2254736,1,4,4,7,29.0,Repeat Customer,196,1,1,...,Mid-range product,Least busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non_frequent_customer
4,431534,1,5,4,15,28.0,Repeat Customer,196,1,1,...,Mid-range product,Least busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non_frequent_customer


#### Both datasets have the shared column "user_id". This should be a fully matching column; hence, we don't need to specify a type of join and inner join will be used in default. The resulting data set will only contain observations included in both input data sets

In [69]:
# Check the datatype of 'user_id' column from the ords_prods_merge
ords_prods_merge['user_id'].dtype

dtype('int64')

In [70]:
# Change the datatype of the user_id from ords_prods_merge to 'str' to match user_id column from customer
ords_prods_merge['user_id'] = ords_prods_merge['user_id'].astype('str')

In [71]:
ords_prods_merge['user_id'].dtype

dtype('O')

In [75]:
# Check for columns name of the ords_prods_merge
ords_prods_merge.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_last_order', 'first_order',
       'product_id', 'add_to_cart_order', 'reordered', 'product_name',
       'aisle_id', 'department_id', 'prices', '_merge', 'price_range_loc',
       'busiest day', 'busiest days', 'busiest_period_of_day', 'max_order',
       'loyalty_flag', 'avg_price', 'spender_flag', 'Customer_frequency',
       'frequency_flag'],
      dtype='object')

In [76]:
# Drop the _merge columns to prevent error
ords_prods_merge = ords_prods_merge.drop(columns = ['_merge'])

In [77]:
ords_prods_merge.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_last_order', 'first_order',
       'product_id', 'add_to_cart_order', 'reordered', 'product_name',
       'aisle_id', 'department_id', 'prices', 'price_range_loc', 'busiest day',
       'busiest days', 'busiest_period_of_day', 'max_order', 'loyalty_flag',
       'avg_price', 'spender_flag', 'Customer_frequency', 'frequency_flag'],
      dtype='object')

In [78]:
# Merge both datasets
df_large = ords_prods_merge.merge(customer, on = 'user_id', indicator = True)

In [79]:
# Check out the result
df_large.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,first_order,product_id,add_to_cart_order,reordered,...,Customer_frequency,frequency_flag,Gender,STATE,Age,date_joined,dependants,family_status,income,_merge
0,2539329,1,1,2,8,,First Order,196,1,0,...,20.5,Non_frequent_customer,Female,Alabama,31,2019-02-17,3,married,40423,both
1,2398795,1,2,3,7,15.0,Repeat Customer,196,1,1,...,20.5,Non_frequent_customer,Female,Alabama,31,2019-02-17,3,married,40423,both
2,473747,1,3,3,12,21.0,Repeat Customer,196,1,1,...,20.5,Non_frequent_customer,Female,Alabama,31,2019-02-17,3,married,40423,both
3,2254736,1,4,4,7,29.0,Repeat Customer,196,1,1,...,20.5,Non_frequent_customer,Female,Alabama,31,2019-02-17,3,married,40423,both
4,431534,1,5,4,15,28.0,Repeat Customer,196,1,1,...,20.5,Non_frequent_customer,Female,Alabama,31,2019-02-17,3,married,40423,both


In [80]:
df_large.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_last_order', 'first_order',
       'product_id', 'add_to_cart_order', 'reordered', 'product_name',
       'aisle_id', 'department_id', 'prices', 'price_range_loc', 'busiest day',
       'busiest days', 'busiest_period_of_day', 'max_order', 'loyalty_flag',
       'avg_price', 'spender_flag', 'Customer_frequency', 'frequency_flag',
       'Gender', 'STATE', 'Age', 'date_joined', 'dependants', ' family_status',
       'income', '_merge'],
      dtype='object')

In [81]:
# Confirm the results of the merge using the merge flag
df_large['_merge'].value_counts()

_merge
both          32404854
left_only            0
right_only           0
Name: count, dtype: int64

#### The new dataframe has 32,404,859 observations. As mentioned, the result data set only shows observations included in both data set.

In [82]:
df_large.shape

(32404854, 32)

## 06. Export this new dataframe as a pickle file

In [83]:
df_large.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'orders_products_all.pkl'))