# Contents:

#### Importing libraries and establishing dataframe objects
#### Data wrangling and cleaning
#### Addressing memory issues by using more efficient data types
#### Importing and merging with the integrated Instacart data set
#### Exporting and saving as a pkl file

# Importing libraries and establishing dataframe objects

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
path=r'C:\Users\Shelb\OneDrive\Documents\CF Coursework\Python Fundamentals for Data Analysts\10-2022 Instacart Basket Analysis'

In [3]:
df_cust=pd.read_csv(os.path.join(path,'Data','Original Data','customers.csv'),index_col=False)

# Data wrangling and cleaning

## Creating a consistent column naming system

In [4]:
df_cust.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


### Rename columns to 'first_name', 'last_name', 'state', 'age', 'gender', and 'dependents' to create consistent column names

In [5]:
# To rename columns
df_cust.rename(columns={'First Name':'first_name'},inplace=True)

In [6]:
df_cust.rename(columns={'Surnam':'last_name'},inplace=True)

In [7]:
df_cust.rename(columns={'STATE':'state'},inplace=True)

In [8]:
df_cust.rename(columns={'Age':'age'},inplace=True)

In [9]:
df_cust.rename(columns={'Gender':'gender'},inplace=True)

In [10]:
df_cust.rename(columns={'n_dependants':'dependants'},inplace=True)

In [11]:
# To check if renaming was successful
df_cust.head()

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


## Looking for dirty data

In [12]:
# To describe the df_cust dataframe
df_cust.describe()

Unnamed: 0,user_id,age,dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


In [13]:
# To check data types of df_cust
df_cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   user_id      206209 non-null  int64 
 1   first_name   194950 non-null  object
 2   last_name    206209 non-null  object
 3   gender       206209 non-null  object
 4   state        206209 non-null  object
 5   age          206209 non-null  int64 
 6   date_joined  206209 non-null  object
 7   dependants   206209 non-null  int64 
 8   fam_status   206209 non-null  object
 9   income       206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


In [14]:
# To look for mixed-data type columns
for col in df_cust.columns.tolist():
    weird=(df_cust[[col]].applymap(type)!=df_cust[[col]].iloc[0].apply(type)).any(axis=1)
    if len (df_cust[weird])>0:
        print(col)

first_name


In [15]:
# To address mixed data types in the first_name column
df_cust['first_name']=df_cust['first_name'].astype('str')

In [16]:
# To check for null values
df_cust.isnull().sum()

user_id        0
first_name     0
last_name      0
gender         0
state          0
age            0
date_joined    0
dependants     0
fam_status     0
income         0
dtype: int64

In [17]:
# To find duplicate values in df_cust
df_dups=df_cust[df_cust.duplicated()]

In [18]:
df_dups

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,dependants,fam_status,income


# Addressing memory issues by using more efficient data types

In [19]:
ords=pd.read_pickle(os.path.join(path,'Data','Prepared Data','11-13_orders_products_merged.pkl'))

In [20]:
ords.columns

Index(['order_id', 'user_id', 'order_number', 'order_day_of_week',
       'hour_order_placed', 'days_since_prior_order', 'new_customer',
       'product_id', 'add_to_cart_order', 'reordered', 'product_name',
       'aisle_id', 'department_id', 'prices', 'price_range', 'busiest_days',
       'busiest_period_of_day', 'max_order', 'loyalty_flag', 'spending_flag',
       'avg_days_since_last_order', 'order_frequency'],
      dtype='object')

In [21]:
# Change types for ords data set to help aleviate memory error when merging

ords['order_id']=ords['order_id'].astype('int32')
ords['user_id'] = ords['user_id'].astype('int32')
ords['order_number']=ords['order_number'].astype('int8')
ords['order_day_of_week']=ords['order_day_of_week'].astype('int8')
ords['hour_order_placed']=ords['hour_order_placed'].astype('int8')
ords['days_since_prior_order']=ords['days_since_prior_order'].astype('float16')

In [22]:
ords['product_id'] =ords['product_id'].astype('int32')
ords['reordered']=ords['reordered'].astype('int8')
ords['add_to_cart_order']=ords['add_to_cart_order'].astype('int32')
ords['order_id']=ords['order_id'].astype('int32')

In [23]:
ords['aisle_id'] = ords['aisle_id'].astype('int8')
ords['department_id'] = ords['department_id'].astype('int8')
ords['prices'] = ords['prices'].astype('float16')
df_cust['age'] = df_cust['age'].astype('int8')
df_cust['dependants'] = df_cust['dependants'].astype('int8')

# Importing and merging with the integrated Instacart data set

In [24]:
# To ensure that data types of merge keys are the same
df_cust['user_id'] = df_cust['user_id'].astype('int32')

In [25]:
ords.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,hour_order_placed,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered,...,department_id,prices,price_range,busiest_days,busiest_period_of_day,max_order,loyalty_flag,spending_flag,avg_days_since_last_order,order_frequency
0,2539329,1,1,2,8,,True,196,1,0,...,7,9.0,Mid range product,Regularly Busy,Average Orders,10,New customer,Low spender,20.5,Non-frequent customer
1,2398795,1,2,3,7,15.0,False,196,1,1,...,7,9.0,Mid range product,Least Busy,Average Orders,10,New customer,Low spender,20.5,Non-frequent customer
2,473747,1,3,3,12,21.0,False,196,1,1,...,7,9.0,Mid range product,Least Busy,Most Orders,10,New customer,Low spender,20.5,Non-frequent customer
3,2254736,1,4,4,7,29.0,False,196,1,1,...,7,9.0,Mid range product,Least Busy,Average Orders,10,New customer,Low spender,20.5,Non-frequent customer
4,431534,1,5,4,15,28.0,False,196,1,1,...,7,9.0,Mid range product,Least Busy,Most Orders,10,New customer,Low spender,20.5,Non-frequent customer


In [26]:
df_cust.head()

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [27]:
ords.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404859 entries, 0 to 32404858
Data columns (total 22 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   order_id                   int32  
 1   user_id                    int32  
 2   order_number               int8   
 3   order_day_of_week          int8   
 4   hour_order_placed          int8   
 5   days_since_prior_order     float16
 6   new_customer               bool   
 7   product_id                 int32  
 8   add_to_cart_order          int32  
 9   reordered                  int8   
 10  product_name               object 
 11  aisle_id                   int8   
 12  department_id              int8   
 13  prices                     float16
 14  price_range                object 
 15  busiest_days               object 
 16  busiest_period_of_day      object 
 17  max_order                  int64  
 18  loyalty_flag               object 
 19  spending_flag              object 
 20  

In [28]:
df_cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   user_id      206209 non-null  int32 
 1   first_name   206209 non-null  object
 2   last_name    206209 non-null  object
 3   gender       206209 non-null  object
 4   state        206209 non-null  object
 5   age          206209 non-null  int8  
 6   date_joined  206209 non-null  object
 7   dependants   206209 non-null  int8  
 8   fam_status   206209 non-null  object
 9   income       206209 non-null  int64 
dtypes: int32(1), int64(1), int8(2), object(6)
memory usage: 12.2+ MB


In [29]:
# To test a merge between the ords and df_cust dataframes
pd.merge(ords,df_cust, on = ['user_id'], indicator=True)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,hour_order_placed,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered,...,first_name,last_name,gender,state,age,date_joined,dependants,fam_status,income,_merge
0,2539329,1,1,2,8,,True,196,1,0,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
1,2398795,1,2,3,7,15.0,False,196,1,1,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
2,473747,1,3,3,12,21.0,False,196,1,1,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
3,2254736,1,4,4,7,29.0,False,196,1,1,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
4,431534,1,5,4,15,28.0,False,196,1,1,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404854,156685,106143,26,4,23,5.0,False,19675,1,1,...,Gerald,Yates,Male,Hawaii,25,5/26/2017,0,single,53755,both
32404855,484769,66343,1,6,11,,True,47210,1,0,...,Jacqueline,Arroyo,Female,Tennessee,22,9/12/2017,3,married,46151,both
32404856,1561557,66343,2,1,11,30.0,False,47210,1,1,...,Jacqueline,Arroyo,Female,Tennessee,22,9/12/2017,3,married,46151,both
32404857,276317,66343,3,6,15,19.0,False,47210,1,1,...,Jacqueline,Arroyo,Female,Tennessee,22,9/12/2017,3,married,46151,both


In [31]:
# To check the shape of df_cust before merging
df_cust.shape

(206209, 10)

In [32]:
# To check the shape of ords before merging
ords.shape

(32404859, 22)

In [33]:
# To merge df_cust onto ords
df_merged=ords.merge(df_cust, on =['user_id'], indicator=True)

In [34]:
# To check the shape of df_merged
df_merged.shape

(32404859, 32)

In [36]:
# To check value counts of the _merge column
df_merged['_merge'].value_counts()

both          32404859
left_only            0
right_only           0
Name: _merge, dtype: int64

In [37]:
# To drop the _merge flag column as it will no longer be needed
df_merged.drop(columns=['_merge'])

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,hour_order_placed,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered,...,order_frequency,first_name,last_name,gender,state,age,date_joined,dependants,fam_status,income
0,2539329,1,1,2,8,,True,196,1,0,...,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
1,2398795,1,2,3,7,15.0,False,196,1,1,...,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
2,473747,1,3,3,12,21.0,False,196,1,1,...,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
3,2254736,1,4,4,7,29.0,False,196,1,1,...,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
4,431534,1,5,4,15,28.0,False,196,1,1,...,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404854,156685,106143,26,4,23,5.0,False,19675,1,1,...,Frequent customer,Gerald,Yates,Male,Hawaii,25,5/26/2017,0,single,53755
32404855,484769,66343,1,6,11,,True,47210,1,0,...,Non-frequent customer,Jacqueline,Arroyo,Female,Tennessee,22,9/12/2017,3,married,46151
32404856,1561557,66343,2,1,11,30.0,False,47210,1,1,...,Non-frequent customer,Jacqueline,Arroyo,Female,Tennessee,22,9/12/2017,3,married,46151
32404857,276317,66343,3,6,15,19.0,False,47210,1,1,...,Non-frequent customer,Jacqueline,Arroyo,Female,Tennessee,22,9/12/2017,3,married,46151


# Exporting df_merged as a .pkl file

In [38]:
df_merged.to_pickle(os.path.join(path,'Data','Prepared Data','11-17_integrated_data.pkl'))