# Contents in this notebook
1. Importing Libraries and Data
2. Checking and cleaning customers dataset
3. Dealing with memory error
4. Merging customer and ords_prods_merge_new_new dataset

# Importing Libraries

In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Defining Path
path = r'C:\Users\seann\Downloads\Career Foundry Tasks\12-08-22 Instacart Basket Analysis'

In [3]:
# Importing Customers data set
df_customers = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'), index_col = False)

In [4]:
# Importing Orders and Products Merged Data Set
ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge_new_updated.pkl'))

## Checking and cleaning customers dataset

In [5]:
df_customers.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [6]:
# Renaming Columns
df_customers.rename(columns = {'Surnam' : 'Surname'}, inplace = True)

In [7]:
df_customers.rename(columns = {'STATE' : 'State'}, inplace = True)

In [8]:
df_customers.rename(columns = {'n_dependants' : 'dependants'}, inplace = True)

In [9]:
# Checking df_customers columns
df_customers.columns

Index(['user_id', 'First Name', 'Surname', 'Gender', 'State', 'Age',
       'date_joined', 'dependants', 'fam_status', 'income'],
      dtype='object')

In [10]:
# Checking for mixed data-types
for col in df_customers.columns.tolist():
  weird = (df_customers[[col]].applymap(type) != df_customers[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_customers[weird]) > 0:
    print (col)

First Name


In [11]:
# Changing mixed data type to string
df_customers['First Name'] = df_customers['First Name'].astype('str')

In [12]:
# Finding Missing Values
df_customers.isnull().sum()

user_id        0
First Name     0
Surname        0
Gender         0
State          0
Age            0
date_joined    0
dependants     0
fam_status     0
income         0
dtype: int64

In [13]:
# locating Duplicates
df_dups = df_customers[df_customers.duplicated()]

In [14]:
df_dups

Unnamed: 0,user_id,First Name,Surname,Gender,State,Age,date_joined,dependants,fam_status,income


In [15]:
# Checking ords prods merge columns
ords_prods_merge.columns

Index(['order_id', 'user_id', 'amount_of_orders_made', 'order_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'aisle_id',
       'department_id', 'prices', 'price_range_loc', 'busiest day',
       'busiest days', 'Busiest Order Periods', 'max_order', 'loyalty_flag',
       'avg_price', 'spending_flag', 'median_days_since_prior_order',
       'frequency_flag'],
      dtype='object')

In [16]:
# Checking df_customers columns
df_customers.columns

Index(['user_id', 'First Name', 'Surname', 'Gender', 'State', 'Age',
       'date_joined', 'dependants', 'fam_status', 'income'],
      dtype='object')

## Dealing with memory error

In [23]:
# Dealing with memory error
df_customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   user_id      206209 non-null  int64 
 1   First Name   206209 non-null  object
 2   Surname      206209 non-null  object
 3   Gender       206209 non-null  object
 4   State        206209 non-null  object
 5   Age          206209 non-null  int64 
 6   date_joined  206209 non-null  object
 7   dependants   206209 non-null  int64 
 8   fam_status   206209 non-null  object
 9   income       206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


In [18]:
ords_prods_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404859 entries, 0 to 32404858
Data columns (total 23 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   order_id                       int64  
 1   user_id                        int64  
 2   amount_of_orders_made          int64  
 3   order_day_of_week              int64  
 4   order_hour_of_day              int64  
 5   days_since_prior_order         float64
 6   product_id                     int64  
 7   add_to_cart_order              int64  
 8   reordered                      int64  
 9   product_name                   object 
 10  aisle_id                       int64  
 11  department_id                  int64  
 12  prices                         float64
 13  price_range_loc                object 
 14  busiest day                    object 
 15  busiest days                   object 
 16  Busiest Order Periods          object 
 17  max_order                      int64  
 18  

In [21]:
ords_prods_merge['order_id']=ords_prods_merge['order_id'].astype('int32')
ords_prods_merge['user_id']=ords_prods_merge['user_id'].astype('int32')
ords_prods_merge['amount_of_orders_made']=ords_prods_merge['amount_of_orders_made'].astype('int32')
ords_prods_merge['order_day_of_week']=ords_prods_merge['order_day_of_week'].astype('int8')
ords_prods_merge['order_hour_of_day']=ords_prods_merge['order_hour_of_day'].astype('int8')
ords_prods_merge['days_since_prior_order']=ords_prods_merge['days_since_prior_order'].astype('float16')
ords_prods_merge['product_id']=ords_prods_merge['product_id'].astype('int32')
ords_prods_merge['add _to_cart_order']=ords_prods_merge['add_to_cart_order'].astype('int32')
ords_prods_merge['reordered']=ords_prods_merge['reordered'].astype('int8')
ords_prods_merge['product_name']=ords_prods_merge['product_name'].astype('object')
ords_prods_merge['aisle_id']=ords_prods_merge['aisle_id'].astype('int8')
ords_prods_merge['department_id']=ords_prods_merge['department_id'].astype('int8')
ords_prods_merge['prices']=ords_prods_merge['prices'].astype('float16')
ords_prods_merge['price_range_loc']=ords_prods_merge['price_range_loc'].astype('object')
ords_prods_merge['busiest day']=ords_prods_merge['busiest day'].astype('object')
ords_prods_merge['busiest days']=ords_prods_merge['busiest days'].astype('object')
ords_prods_merge['Busiest Order Periods']=ords_prods_merge['Busiest Order Periods'].astype('object')
ords_prods_merge['max_order']=ords_prods_merge['max_order'].astype('int32')
ords_prods_merge['loyalty_flag']=ords_prods_merge['loyalty_flag'].astype('object')
ords_prods_merge['avg_price']=ords_prods_merge['avg_price'].astype('float16')
ords_prods_merge['spending_flag']=ords_prods_merge['spending_flag'].astype('object')
ords_prods_merge['median_days_since_prior_order']=ords_prods_merge['median_days_since_prior_order'].astype('float16')
ords_prods_merge['frequency_flag']=ords_prods_merge['frequency_flag'].astype('object')

In [22]:
ords_prods_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404859 entries, 0 to 32404858
Data columns (total 24 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   order_id                       int32  
 1   user_id                        int32  
 2   amount_of_orders_made          int32  
 3   order_day_of_week              int8   
 4   order_hour_of_day              int8   
 5   days_since_prior_order         float16
 6   product_id                     int32  
 7   add_to_cart_order              int64  
 8   reordered                      int8   
 9   product_name                   object 
 10  aisle_id                       int8   
 11  department_id                  int8   
 12  prices                         float16
 13  price_range_loc                object 
 14  busiest day                    object 
 15  busiest days                   object 
 16  Busiest Order Periods          object 
 17  max_order                      int32  
 18  

In [25]:
df_customers['user_id']=df_customers['user_id'].astype('int32')
df_customers['First Name']=df_customers['First Name'].astype('object')
df_customers['Surname']=df_customers['Surname'].astype('object')
df_customers['Gender']=df_customers['Gender'].astype('object')
df_customers['State']=df_customers['State'].astype('object')
df_customers['Age']=df_customers['Age'].astype('int8')
df_customers['date_joined']=df_customers['date_joined'].astype('object')
df_customers['dependants']=df_customers['dependants'].astype('int8')
df_customers['fam_status']=df_customers['fam_status'].astype('object')
df_customers['income']=df_customers['income'].astype('int8')

In [26]:
df_customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   user_id      206209 non-null  int32 
 1   First Name   206209 non-null  object
 2   Surname      206209 non-null  object
 3   Gender       206209 non-null  object
 4   State        206209 non-null  object
 5   Age          206209 non-null  int8  
 6   date_joined  206209 non-null  object
 7   dependants   206209 non-null  int8  
 8   fam_status   206209 non-null  object
 9   income       206209 non-null  int8  
dtypes: int32(1), int8(3), object(6)
memory usage: 10.8+ MB


## Merging and exporting datasets

In [29]:
# Merging the two datasets
instacart_merged = ords_prods_merge.merge(df_customers, on = 'user_id', indicator = True)

In [30]:
# Exporting the merged Data Set
instacart_merged.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'instacart_merged.pkl'))