# Table of Contents

### Prepare Notebook
### 1.4 Wrangling
    Renaming Columns
    Specifying Data Types
### 1.5 Consistency Checks
    Mixed-Type Data
    Missing Values
    Duplicates
### 1.6 Combine Data
### 1.8 Export
    

### Prepare Notebook

In [1]:
#Import libraries

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
#Define 'path'

path = r'C:\Users\PC Planet\Desktop\Self-Education\Data Immersion\Achievement 4\Instacart Basket Analysis'

In [3]:
#Import dataframe 'customers'

customers = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'), index_col = False)

# 1.4 Wrangling

In [4]:
#Check columns

customers.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


### Renaming Columns

In [5]:
#Rename columns

customers.rename(columns = {'First Name':'first_name'}, inplace = True)
customers.rename(columns = {'Surnam':'last_name'}, inplace = True)
customers.rename(columns = {'Gender':'gender'}, inplace = True)
customers.rename(columns = {'STATE':'state'}, inplace = True)
customers.rename(columns = {'Age':'age'}, inplace = True)
customers.rename(columns = {'n_dependants':'dependants'}, inplace = True)
customers.rename(columns = {'fam_status':'marital_status'}, inplace = True)

In [6]:
#Check renaming of columns

customers.head()

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,dependants,marital_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


### Specifying Data Types

In [7]:
#Check data types

customers.dtypes

user_id            int64
first_name        object
last_name         object
gender            object
state             object
age                int64
date_joined       object
dependants         int64
marital_status    object
income             int64
dtype: object

In [6]:
#Change data types

customers['user_id'] = customers['user_id'].astype('str')

In [9]:
#Check result

customers.dtypes

user_id           object
first_name        object
last_name         object
gender            object
state             object
age                int64
date_joined       object
dependants         int64
marital_status    object
income             int64
dtype: object

# 1.5 Consistency Checks

### Mixed-Type Data

In [10]:
#Check for mixed-type data

for col in customers.columns.tolist():
    weird = (customers[[col]].applymap(type)
            !=customers[[col]].iloc[0].apply(type)).any(axis = 1)
    if len(customers[weird]) > 0:
        print (col)

first_name


In [7]:
#Change mixed-type data to single-type

customers['first_name'] = customers['first_name'].astype('str')

In [12]:
#Recheck for mixed-type data

for col in customers.columns.tolist():
    weird = (customers[[col]].applymap(type)
            !=customers[[col]].iloc[0].apply(type)).any(axis = 1)
    if len(customers[weird]) > 0:
        print (col)

### Missing Values

In [13]:
#Check for missing values

customers.isnull().sum()

user_id           0
first_name        0
last_name         0
gender            0
state             0
age               0
date_joined       0
dependants        0
marital_status    0
income            0
dtype: int64

### Duplicates

In [15]:
#Check for duplicate entries

customers_dup = customers[customers.duplicated()]

customers_dup

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,dependants,marital_status,income


# 1.6 Combine Data

In [8]:
#Import other dataframes

opf = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_with_flags.pkl'))

In [17]:
#Check

opf.head()

Unnamed: 0,Unnamed: 0_x,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,...,_merge,price_range_loc,busiest_days,busiest_period_of_day,max_order,loyalty_flag,average_price,spending_flag,median_frequency,user_frequency
0,0,2539329,1,1,2,8,,196,1,0,...,both,Mid-range product,Regular Day,Average Orders,10,New Customer,6.367797,Low Spender,20.5,Non-Frequent Customer
1,1,2398795,1,2,3,7,15.0,196,1,1,...,both,Mid-range product,Slow Day,Average Orders,10,New Customer,6.367797,Low Spender,20.5,Non-Frequent Customer
2,2,473747,1,3,3,12,21.0,196,1,1,...,both,Mid-range product,Slow Day,Most Orders,10,New Customer,6.367797,Low Spender,20.5,Non-Frequent Customer
3,3,2254736,1,4,4,7,29.0,196,1,1,...,both,Mid-range product,Slow Day,Average Orders,10,New Customer,6.367797,Low Spender,20.5,Non-Frequent Customer
4,4,431534,1,5,4,15,28.0,196,1,1,...,both,Mid-range product,Slow Day,Most Orders,10,New Customer,6.367797,Low Spender,20.5,Non-Frequent Customer


In [18]:
#Check shapes

opf.shape

(32404859, 25)

In [19]:
customers.shape

(206209, 10)

In [26]:
#Check data types

opf.dtypes

Unnamed: 0_x                 int64
order_id                     int64
user_id                      int64
order_number                 int64
orders_day_of_week           int64
order_hour_of_day            int64
days_since_prior_order     float64
product_id                   int64
add_to_cart_order            int64
reordered                    int64
Unnamed: 0_y                 int64
product_name                object
aisle_id                     int64
department_id                int64
prices                     float64
_merge                    category
price_range_loc             object
busiest_days                object
busiest_period_of_day       object
max_order                    int64
loyalty_flag                object
average_price              float64
spending_flag               object
median_frequency           float64
user_frequency              object
dtype: object

In [9]:
#Change data type for compatibility

opf['user_id'] = opf['user_id'].astype('str')

In [29]:
#Test Merge on 'user_id'

pd.merge(opf, customers, on = 'user_id', how = 'outer')

Unnamed: 0,Unnamed: 0_x,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,...,user_frequency,first_name,last_name,gender,state,age,date_joined,dependants,marital_status,income
0,0,2539329,1,1,2,8,,196,1,0,...,Non-Frequent Customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
1,1,2398795,1,2,3,7,15.0,196,1,1,...,Non-Frequent Customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
2,2,473747,1,3,3,12,21.0,196,1,1,...,Non-Frequent Customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
3,3,2254736,1,4,4,7,29.0,196,1,1,...,Non-Frequent Customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
4,4,431534,1,5,4,15,28.0,196,1,1,...,Non-Frequent Customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404854,1768145,156685,106143,26,4,23,5.0,19675,1,1,...,Frequent Customer,Gerald,Yates,Male,Hawaii,25,5/26/2017,0,single,53755
32404855,1101646,484769,66343,1,6,11,,47210,1,0,...,Non-Frequent Customer,Jacqueline,Arroyo,Female,Tennessee,22,9/12/2017,3,married,46151
32404856,1101647,1561557,66343,2,1,11,30.0,47210,1,1,...,Non-Frequent Customer,Jacqueline,Arroyo,Female,Tennessee,22,9/12/2017,3,married,46151
32404857,1101648,276317,66343,3,6,15,19.0,47210,1,1,...,Non-Frequent Customer,Jacqueline,Arroyo,Female,Tennessee,22,9/12/2017,3,married,46151


In [10]:
#Merge on 'user_id'

opfc = opf.merge(customers, on = 'user_id', how = 'outer')

In [12]:
#Delete old indexes

del opfc['Unnamed: 0_x']

opfc.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,Unnamed: 0_y,...,user_frequency,first_name,last_name,gender,state,age,date_joined,dependants,marital_status,income
0,2539329,1,1,2,8,,196,1,0,195,...,Non-Frequent Customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
1,2398795,1,2,3,7,15.0,196,1,1,195,...,Non-Frequent Customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
2,473747,1,3,3,12,21.0,196,1,1,195,...,Non-Frequent Customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
3,2254736,1,4,4,7,29.0,196,1,1,195,...,Non-Frequent Customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
4,431534,1,5,4,15,28.0,196,1,1,195,...,Non-Frequent Customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423


In [13]:
del opfc['Unnamed: 0_y']

opfc.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,user_frequency,first_name,last_name,gender,state,age,date_joined,dependants,marital_status,income
0,2539329,1,1,2,8,,196,1,0,Soda,...,Non-Frequent Customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Non-Frequent Customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,Non-Frequent Customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,Non-Frequent Customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,Non-Frequent Customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423


# 1.8 Export

In [14]:
#Export file

opfc.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_all.pkl'))