# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import os

# Importing Data

In [10]:
# Creating path to project folder
## The INSTACART_PROJECT_PATH environment variable (if set) overrides the default location below,
## so the notebook can be run on another machine without editing this cell

path = os.environ.get('INSTACART_PROJECT_PATH', r'C:\Users\TanaT\(CF) Achievement 4 - Instacart Basket Analysis')

In [12]:
#  Step 3 - Loading the merged orders/products dataframe from the prepared-data pickle

df_orders_products_merged = pd.read_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl')
)


# Checking Data

In [13]:
# Step 4 - Checking number of rows and columns in merged data
## .shape gives a (rows, columns) tuple

df_orders_products_merged.shape

(32434489, 10)

In [20]:
# Checking output of merged data - displaying the first 5 rows

df_orders_products_merged.head(5)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,1,2,8,,196,1,0,both
1,2539329,1,1,2,8,,14084,2,0,both
2,2539329,1,1,2,8,,12427,3,0,both
3,2539329,1,1,2,8,,26088,4,0,both
4,2539329,1,1,2,8,,26405,5,0,both


In [14]:
# Importing cleaned products data
## index_col = False tells pandas not to use the first column of the file as the index

df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'cleaned_products_data.csv'),
    index_col = False,
)

In [15]:
# Previewing the first 5 rows of the imported products data

df_prods.head(5)

Unnamed: 0.1,Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,0,1,Chocolate Sandwich Cookies,61,19,5.8
1,1,2,All-Seasons Salt,104,13,9.3
2,2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,4,5,Green Chile Anytime Sauce,5,13,4.3


In [16]:
# Removing the leftover 'Unnamed: 0' column and storing the result in a new dataframe (df_prods is not overwritten)
## To be fixed in the future at import or save time - e.g. by reading the file with index_col = 0

df_prods_clean = df_prods.drop('Unnamed: 0', axis = 1)

In [17]:
# Previewing the first 5 rows after dropping the extra column

df_prods_clean.head(5)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [28]:
# Checking number of rows and columns in cleaned products data
## .shape gives a (rows, columns) tuple

df_prods_clean.shape

(49672, 5)

# Merging Data 

In [27]:
# Both dataframes share a 'product_id' column - using it as the key to merge them
## Testing merge (the result is only displayed here, not stored in a variable)
## The merged dataframe already contains a '_merge' column, so a custom string is passed to the indicator parameter to avoid a naming conflict

df_orders_products_merged.merge(df_prods_clean, on = 'product_id', indicator = 'merge_check')

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices,merge_check
0,2539329,1,1,2,8,,196,1,0,both,Soda,77,7,9.0,both
1,2539329,1,1,2,8,,14084,2,0,both,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,both
2,2539329,1,1,2,8,,12427,3,0,both,Original Beef Jerky,23,19,4.4,both
3,2539329,1,1,2,8,,26088,4,0,both,Aged White Cheddar Popcorn,23,19,4.7,both
4,2539329,1,1,2,8,,26405,5,0,both,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404854,2977660,206209,13,1,12,7.0,14197,5,1,both,Tomato Paste,9,9,5.6,both
32404855,2977660,206209,13,1,12,7.0,38730,6,0,both,Brownie Crunch High Protein Bar,3,19,5.9,both
32404856,2977660,206209,13,1,12,7.0,31477,7,0,both,High Protein Bar Chunky Peanut Butter,3,19,4.2,both
32404857,2977660,206209,13,1,12,7.0,6567,8,0,both,Chocolate Peanut Butter Protein Bar,3,19,4.9,both


In [29]:
# Preparing the data for the final merge
## First - the existing '_merge' flag column is dropped and the result is saved as a new dataframe

df_orders_products_merge_prep = df_orders_products_merged.drop('_merge', axis = 1)


In [30]:
# Previewing the first 5 rows of the dataframe prepared for the merge

df_orders_products_merge_prep.head(5)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,2539329,1,1,2,8,,196,1,0
1,2539329,1,1,2,8,,14084,2,0
2,2539329,1,1,2,8,,12427,3,0
3,2539329,1,1,2,8,,26088,4,0
4,2539329,1,1,2,8,,26405,5,0


In [31]:
# Checking number of rows and columns after dropping the '_merge' column
df_orders_products_merge_prep.shape

(32434489, 9)

In [36]:
# Step 5 - Determine a suitable way to combine the dataframes.
## Final merge on 'product_id' with a merge flag column - how = 'inner' is the pandas default, written out explicitly here

df_ords_prods_merge = pd.merge(
    df_prods_clean,
    df_orders_products_merge_prep,
    how = 'inner',
    on = 'product_id',
    indicator = True,
)

In [33]:
# Checking merge - displaying the first 5 rows of the new dataframe

df_ords_prods_merge.head(5)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,5,0,both
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,1,1,both
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,20,0,both
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,10,0,both
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,11,1,both


In [34]:
# Checking number of rows and columns of new merged dataframe
## .shape gives a (rows, columns) tuple

df_ords_prods_merge.shape

(32404859, 14)

In [35]:
# Step 6 - Confirm the results of the merge using the merge flag.
## The final merge is an inner join, so every row it keeps is flagged 'both' - the flag alone cannot show rows that were lost
## Therefore also counting the order rows whose product_id does not exist in the products data (an inner join drops these rows)

unmatched_order_rows = (~df_orders_products_merge_prep['product_id'].isin(df_prods_clean['product_id'])).sum()
print('Order rows without a matching product (dropped by the inner merge):', unmatched_order_rows)

df_ords_prods_merge['_merge'].value_counts()

_merge
both          32404859
left_only            0
right_only           0
Name: count, dtype: int64

# Exporting Data

In [38]:
# Step 7 - Export newly created dataframe in a suitable format (taking into consideration the size).
## Saving as a pickle so the dataframe is stored exactly as it is

df_ords_prods_merge.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge.pkl')
)