# Script that merges cleaned customer data with orders_products data and exports it as pickle

## Script contains the following sections
### 1. Importing Libraries
### 2. Importing Data
### 3. Merging Orders-Products with Customers
### 4. Exporting as Pickle

# 1. Importing Libraries

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import os

# 2. Importing Data

In [2]:
# Folder Shortcut
# Absolute path to the project root folder; the import and export paths in this
# script are built from it with os.path.join.
# NOTE(review): this is a machine-specific path and must be changed to run the script elsewhere.
path = r'C:\Users\seank\OneDrive\Dokumente\Career Foundry Data Analytics Course\Data Immersion\4 Python\03-2020_Instacart_Basket _Analysis'

In [3]:
# Importing Merged Orders Products Data
# Build the file location first, then load the pickled DataFrame from it
ords_prods_pkl_path = os.path.join(path, '02_Data', 'Prepared_Data', 'ords_prods_vars_flags.pkl')
ords_prods = pd.read_pickle(ords_prods_pkl_path)

In [4]:
# Importing Customer Data
# index_col=False: do not use the first CSV column as the DataFrame index
cust_csv_path = os.path.join(path, '02_Data', 'Prepared_Data', 'customers_checked.csv')
cust = pd.read_csv(cust_csv_path, index_col=False)

In [5]:
# Preview the first rows of the customer data
cust.head()

Unnamed: 0,user_id,first_name,surname,gender,state,age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,2017-01-01,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,2017-01-01,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,2017-01-01,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,2017-01-01,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,2017-01-01,1,married,40374


In [6]:
# Keep only necessary columns
# (the 'Unnamed: 0', first_name, surname and date_joined columns are left out)
cust_keep_cols = [
    'user_id',
    'gender',
    'state',
    'age',
    'n_dependants',
    'fam_status',
    'income',
]
cust = cust[cust_keep_cols]

In [7]:
# List the column labels of the orders-products data
ords_prods.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'aisle_id',
       'department_id', 'prices', '_merge', 'price_range_loc', 'busiest_day',
       'busiest_days', 'Busiest_hours', 'max_order', 'loyalty_flag',
       'avg_price', 'spending_flag', 'median_days_tween_orders',
       'frequency_flag'],
      dtype='object')

In [8]:
# Dropping aisle_id, _merge, busiest_day, median_days_tween_orders
# Done by selecting every other column, in the original column order
ords_prods_keep_cols = [
    'order_id',
    'user_id',
    'order_number',
    'orders_day_of_week',
    'order_hour_of_day',
    'days_since_prior_order',
    'product_id',
    'add_to_cart_order',
    'reordered',
    'product_name',
    'department_id',
    'prices',
    'price_range_loc',
    'busiest_days',
    'Busiest_hours',
    'max_order',
    'loyalty_flag',
    'avg_price',
    'spending_flag',
    'frequency_flag',
]
ords_prods = ords_prods[ords_prods_keep_cols]

In [None]:
# Changing variable types
# user_id is cast to string in both DataFrames so the merge key has the same
# dtype on each side
for frame in (cust, ords_prods):
    frame['user_id'] = frame['user_id'].astype('str')

In [10]:
# Convert all int64 columns to int32 in both DataFrames (customers first, then orders-products)
# NOTE(review): assumes every value fits in the int32 range; confirm for any new data
for frame in (cust, ords_prods):
    int64_cols = frame.select_dtypes(include=['int64']).columns
    for name in int64_cols:
        frame[name] = frame[name].astype('int32')

In [11]:
# Reducing float64s to float32 (orders-products DataFrame only)
float64_cols = ords_prods.select_dtypes(include=['float64']).columns
for name in float64_cols:
    ords_prods[name] = ords_prods[name].astype('float32')

In [12]:
# Check the column dtypes of the orders-products data after the conversions
ords_prods.dtypes

order_id                    int32
user_id                    object
order_number                int32
orders_day_of_week          int32
order_hour_of_day           int32
days_since_prior_order    float32
product_id                  int32
add_to_cart_order           int32
reordered                   int32
product_name               object
department_id               int32
prices                    float32
price_range_loc            object
busiest_days               object
Busiest_hours              object
max_order                   int32
loyalty_flag               object
avg_price                 float32
spending_flag              object
frequency_flag             object
dtype: object

In [13]:
# Converting some columns to string in order products df
# List of columns to convert from int32 to string.
# NOTE: the day-of-week column is named 'orders_day_of_week' (with an "s") in
# this DataFrame; the misspelled 'order_day_of_week' matches no column.
columns_to_convert_op = ['order_id', 'orders_day_of_week', 'product_id', 'department_id']

# Fail loudly on a misspelled or missing column instead of silently skipping it
missing_cols = [col for col in columns_to_convert_op if col not in ords_prods.columns]
if missing_cols:
    raise KeyError(f"Columns not found in ords_prods: {missing_cols}")

# Convert the selected columns to string, one column at a time
for col in columns_to_convert_op:
    ords_prods[col] = ords_prods[col].astype('string')

In [14]:
# Check the column dtypes of the customer data after the conversions
cust.dtypes

user_id         object
gender          object
state           object
age              int32
n_dependants     int32
fam_status      object
income           int32
dtype: object

# 3. Merging Orders-Products with Customers

In [15]:
# Merging DFs on user_id
# Inner join: only user_ids present in both DataFrames are kept.
# indicator=True adds a '_merge' column recording the source of each row.
ords_prods_cust = pd.merge(cust, ords_prods, how='inner', on='user_id', indicator=True)

In [16]:
# Preview the first rows of the merged data
ords_prods_cust.head()

Unnamed: 0,user_id,gender,state,age,n_dependants,fam_status,income,order_id,order_number,orders_day_of_week,...,prices,price_range_loc,busiest_days,Busiest_hours,max_order,loyalty_flag,avg_price,spending_flag,frequency_flag,_merge
0,26711,Female,Missouri,48,3,married,165665,518967,1,2,...,4.3,Low-range product,Regularly busy,Most orders,8,New customer,7.988889,Low spender,Regular customer,both
1,26711,Female,Missouri,48,3,married,165665,423547,2,2,...,12.6,Mid-range product,Regularly busy,Most orders,8,New customer,7.988889,Low spender,Regular customer,both
2,26711,Female,Missouri,48,3,married,165665,2524893,3,3,...,12.6,Mid-range product,Least busy days,Most orders,8,New customer,7.988889,Low spender,Regular customer,both
3,26711,Female,Missouri,48,3,married,165665,2524893,3,3,...,4.3,Low-range product,Least busy days,Most orders,8,New customer,7.988889,Low spender,Regular customer,both
4,26711,Female,Missouri,48,3,married,165665,2524893,3,3,...,7.1,Mid-range product,Least busy days,Most orders,8,New customer,7.988889,Low spender,Regular customer,both


In [17]:
# Count rows per merge-indicator category.
# NOTE: the merge uses how='inner', so every row is 'both' by construction;
# this count cannot reveal user_ids that failed to match.
ords_prods_cust['_merge'].value_counts()

_merge
both          32404859
left_only            0
right_only           0
Name: count, dtype: int64

# 4. Exporting as Pickle

In [18]:
# Exporting as Pickle
# Build the output location first, then write the merged DataFrame to it
ords_prods_cust_pkl_path = os.path.join(path, '02_Data', 'Prepared_Data', 'ords_prods_cust.pkl')
ords_prods_cust.to_pickle(ords_prods_cust_pkl_path)