# 4.6 Combining & Exporting Data

## This script contains the following points:

### 1. Importing Libraries

### 2. Importing Data

### 3. Data Consistency Check

### 4. Combining Data Frames

# 1. Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import os

# 2. Importing Data

In [3]:
# Creating path string to data folder
path = r'C:\Users\bexlu\Desktop\Career Foundry\Data Immersion\Achievement 4\Instacart Basket Analysis'

In [4]:
# Importing products_checked file using os library
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'), index_col = False)

In [5]:
# Importing orders_products_combined df from pickle file
df_merged_large = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl'))

# 3. Data Consistency Check

In [6]:
# Check shape of imported df
df_merged_large.shape

(32434489, 11)

In [7]:
# Check header of imported df
df_merged_large.head()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,hour_order_placed,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,0,2539329,1,1,2,8,,196,1,0,both
1,0,2539329,1,1,2,8,,14084,2,0,both
2,0,2539329,1,1,2,8,,12427,3,0,both
3,0,2539329,1,1,2,8,,26088,4,0,both
4,0,2539329,1,1,2,8,,26405,5,0,both


In [8]:
# Check shape of imported df
df_prods.shape

(49672, 6)

In [9]:
# Check header of imported df
df_prods.head()

Unnamed: 0.1,Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,0,1,Chocolate Sandwich Cookies,61,19,5.8
1,1,2,All-Seasons Salt,104,13,9.3
2,2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,4,5,Green Chile Anytime Sauce,5,13,4.3


# 4. Combining Data Frames

In [10]:
# Merge df_merged_large and df_prods dataframes to create df_final
df_final = df_merged_large.merge(df_prods, on = 'product_id', indicator = True)

ValueError: Cannot use name of an existing column for indicator column

In [11]:
# Drop _merge column from df_merged_large
df_merged_large = df_merged_large.drop(columns = ['_merge'])

In [12]:
# Merge df_merged_large and df_prods dataframes to create df_final using right join 
df_final = df_merged_large.merge(df_prods, on = ['product_id'], how = 'right')

In [13]:
# Check head of df_final
df_final.head()

Unnamed: 0,Unnamed: 0_x,order_id,user_id,order_number,orders_day_of_week,hour_order_placed,days_since_prior_order,product_id,add_to_cart_order,reordered,Unnamed: 0_y,product_name,aisle_id,department_id,prices
0,1987.0,3139998.0,138.0,28.0,6.0,11.0,3.0,1,5.0,0.0,0,Chocolate Sandwich Cookies,61,19,5.8
1,1989.0,1977647.0,138.0,30.0,6.0,17.0,20.0,1,1.0,1.0,0,Chocolate Sandwich Cookies,61,19,5.8
2,11433.0,389851.0,709.0,2.0,0.0,21.0,6.0,1,20.0,0.0,0,Chocolate Sandwich Cookies,61,19,5.8
3,12198.0,652770.0,764.0,1.0,3.0,13.0,,1,10.0,0.0,0,Chocolate Sandwich Cookies,61,19,5.8
4,12200.0,1813452.0,764.0,3.0,4.0,17.0,9.0,1,11.0,1.0,0,Chocolate Sandwich Cookies,61,19,5.8


In [14]:
# Check information on df_final, particularly data types as they look to have changed from df_merged_large
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404870 entries, 0 to 32404869
Data columns (total 15 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Unnamed: 0_x            float64
 1   order_id                float64
 2   user_id                 float64
 3   order_number            float64
 4   orders_day_of_week      float64
 5   hour_order_placed       float64
 6   days_since_prior_order  float64
 7   product_id              int64  
 8   add_to_cart_order       float64
 9   reordered               float64
 10  Unnamed: 0_y            int64  
 11  product_name            object 
 12  aisle_id                int64  
 13  department_id           int64  
 14  prices                  float64
dtypes: float64(10), int64(4), object(1)
memory usage: 3.9+ GB


In [15]:
# Check information on df_merged_large
df_merged_large.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32434489 entries, 0 to 32434488
Data columns (total 10 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Unnamed: 0              int64  
 1   order_id                int64  
 2   user_id                 int64  
 3   order_number            int64  
 4   orders_day_of_week      int64  
 5   hour_order_placed       int64  
 6   days_since_prior_order  float64
 7   product_id              int64  
 8   add_to_cart_order       int64  
 9   reordered               int64  
dtypes: float64(1), int64(9)
memory usage: 2.7 GB
