# 4.6 Combining & Exporting Data

## This script contains the following points:

### 1. Importing Libraries

### 2. Importing Data

### 3. Data Consistency Checks

### 4. Combining Data Frames

### 5. Export Data in Pickle Format

# 1. Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import os

# 2. Importing Data

In [4]:
# Creating path string to the project root folder; the data files in this
# script are located relative to it with os.path.join
# NOTE(review): absolute, machine-specific Windows path - adjust before running on another machine
path = r'C:\Users\Admin\Desktop\Instacart Basket Analysis'

In [5]:
# Load products_checked.csv from the 'Prepared Data' folder into df_prods.
# index_col=False tells pandas not to use the first CSV column as the index,
# so every column in the file stays a regular column.
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'),
    index_col=False,
)

In [6]:
# Load the previously combined orders/products dataframe from its pickle file.
# NOTE: unpickling executes arbitrary code - only load pickle files you trust.
df_merged_large = pd.read_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl')
)

# 3. Data Consistency Checks

In [7]:
# Check shape of imported df_merged_large, returned as (rows, columns)
df_merged_large.shape

(32434489, 11)

In [8]:
# Check header of imported df_merged_large (head() returns the first five rows by default)
df_merged_large.head()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,hour_order_placed,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,0,2539329,1,1,2,8,,196,1,0,both
1,0,2539329,1,1,2,8,,14084,2,0,both
2,0,2539329,1,1,2,8,,12427,3,0,both
3,0,2539329,1,1,2,8,,26088,4,0,both
4,0,2539329,1,1,2,8,,26405,5,0,both


In [9]:
# Check merge of imported df_merged_large using flag column and frequency check:
# count the rows per '_merge' value (both / left_only / right_only)
df_merged_large['_merge'].value_counts()

both          32434489
right_only           0
left_only            0
Name: _merge, dtype: int64

In [10]:
# Remove the '_merge' flag column from df_merged_large. The merge in section 4
# uses indicator = True, which adds its own '_merge' column, so the old flag
# has to be gone before that merge runs.
df_merged_large = df_merged_large.drop('_merge', axis=1)

In [12]:
# Check columns of df_merged_large to confirm '_merge' is no longer present
df_merged_large.columns

Index(['Unnamed: 0', 'order_id', 'user_id', 'order_number',
       'orders_day_of_week', 'hour_order_placed', 'days_since_prior_order',
       'product_id', 'add_to_cart_order', 'reordered'],
      dtype='object')

In [13]:
# Check shape of imported df_prods, returned as (rows, columns)
df_prods.shape

(49672, 6)

In [14]:
# Check header of imported df_prods (head() returns the first five rows by default)
df_prods.head()

Unnamed: 0.1,Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,0,1,Chocolate Sandwich Cookies,61,19,5.8
1,1,2,All-Seasons Salt,104,13,9.3
2,2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,4,5,Green Chile Anytime Sauce,5,13,4.3


# 4. Combining Data Frames

In [15]:
# Merge df_merged_large (left) and df_prods (right) on 'product_id' to create
# df_combined. how='inner' is the pandas default and is only spelled out here:
# rows whose product_id is missing from either dataframe are not kept.
# indicator=True adds a '_merge' flag column to the result.
df_combined = pd.merge(
    df_merged_large,
    df_prods,
    how='inner',
    on='product_id',
    indicator=True,
)

In [16]:
# Check shape of df_combined, returned as (rows, columns)
# NOTE(review): the recorded output below has 29,630 fewer rows than
# df_merged_large (32,434,489), i.e. the merge did not keep every order row
df_combined.shape

(32404859, 16)

In [17]:
# Check merge of df_combined using flag column and frequency check
# NOTE(review): the merge above uses the default inner join, so every row that
# survives is flagged 'both'; this check cannot reveal unmatched rows
df_combined['_merge'].value_counts()

both          32404859
right_only           0
left_only            0
Name: _merge, dtype: int64

In [18]:
# Check information of df_combined (index range, column names, dtypes, memory usage)
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404859 entries, 0 to 32404858
Data columns (total 16 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   Unnamed: 0_x            int64   
 1   order_id                int64   
 2   user_id                 int64   
 3   order_number            int64   
 4   orders_day_of_week      int64   
 5   hour_order_placed       int64   
 6   days_since_prior_order  float64 
 7   product_id              int64   
 8   add_to_cart_order       int64   
 9   reordered               int64   
 10  Unnamed: 0_y            int64   
 11  product_name            object  
 12  aisle_id                int64   
 13  department_id           int64   
 14  prices                  float64 
 15  _merge                  category
dtypes: category(1), float64(2), int64(12), object(1)
memory usage: 3.9+ GB


In [19]:
# Check header of df_combined (head() returns the first five rows by default)
df_combined.head()

Unnamed: 0,Unnamed: 0_x,order_id,user_id,order_number,orders_day_of_week,hour_order_placed,days_since_prior_order,product_id,add_to_cart_order,reordered,Unnamed: 0_y,product_name,aisle_id,department_id,prices,_merge
0,0,2539329,1,1,2,8,,196,1,0,195,Soda,77,7,9.0,both
1,1,2398795,1,2,3,7,15.0,196,1,1,195,Soda,77,7,9.0,both
2,2,473747,1,3,3,12,21.0,196,1,1,195,Soda,77,7,9.0,both
3,3,2254736,1,4,4,7,29.0,196,1,1,195,Soda,77,7,9.0,both
4,4,431534,1,5,4,15,28.0,196,1,1,195,Soda,77,7,9.0,both


# 5. Export Data in Pickle Format

In [20]:
# Export df_combined to the 'Prepared Data' folder as orders_products_merged.pkl
# NOTE(review): df_combined still contains the '_merge', 'Unnamed: 0_x' and
# 'Unnamed: 0_y' columns at this point - confirm they are wanted in the export
df_combined.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'orders_products_merged.pkl'))