# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

# Importing Data

In [2]:
# Creating path to project folder.
# The folder can be overridden by setting the INSTACART_PROJECT_DIR environment
# variable; when it is not set, the original local folder is used unchanged.

path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'C:\Users\TanaT\(CF) Achievement 4 - Instacart Basket Analysis',
)

In [3]:
# Load the merged customer/orders dataframe (exported from Part 1, used in 4.9 Part 2).

merged_pickle_file = os.path.join(path, '02 Data', 'Prepared Data', 'customer_and_merged.pkl')
df_cust_merge = pd.read_pickle(merged_pickle_file)

In [4]:
# Checking the dataframe dimensions (rows, columns) after loading.

df_cust_merge.shape

(32404859, 34)

# Checking NaN Values (now set as object)

In [5]:
# Listing the column names before checking for missing values in 'first_name'.
# The missing names were thought to be handled after changing the mixed-type column to a string (object);
# the following cells check whether they are now stored as text inside that object column.

df_cust_merge.columns

Index(['product_id', 'product_name', 'aisle_id', 'department_id', 'prices',
       'order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'add_to_cart_order',
       'reordered', '_merge', 'price_range_loc', 'busiest_day', 'busiest_days',
       'busiest_period_of_day', 'max_order', 'loyalty_flag',
       'avg_product_price', 'spending_flag', 'median_days_between_orders',
       'ordering_behavior_flag', 'first_name', 'last_name', 'gender', 'state',
       'age', 'date_joined', 'n_dependants', 'fam_status', 'income',
       'merge_status'],
      dtype='object')

In [6]:
# Count of missing values per column.
df_cust_merge.isna().sum()

product_id                          0
product_name                        0
aisle_id                            0
department_id                       0
prices                           5127
order_id                            0
user_id                             0
order_number                        0
orders_day_of_week                  0
order_hour_of_day                   0
days_since_prior_order        2076096
add_to_cart_order                   0
reordered                           0
_merge                              0
price_range_loc                     0
busiest_day                         0
busiest_days                        0
busiest_period_of_day               0
max_order                           0
loyalty_flag                        0
avg_product_price                   0
spending_flag                       0
median_days_between_orders          5
ordering_behavior_flag              5
first_name                          0
last_name                           0
gender      

- The outliers in 'prices' were previously labeled as missing (to not affect statistical checks).
- Missing values in 'days_since_prior_order' due to being the first order of users.
- Missing values in derived columns ('median_days_between_orders' and 'ordering_behavior_flag') due to being one user's only order. 

In [7]:
# Boolean mask: True where 'first_name' is a real missing value in the merged dataframe.

nan_first_name = df_cust_merge['first_name'].isnull()

In [8]:
# Flag rows whose first name is exactly the string 'nan'.
# An exact comparison is used because a substring search would also flag
# any real name that merely contains those letters (e.g. 'Fernando').
# Real missing values compare as False.
nan_string_present = df_cust_merge['first_name'].eq('nan')

In [9]:
# Flag rows whose first name is exactly the string 'NaN'.
# An exact comparison avoids flagging real names that only contain that text.
# Real missing values compare as False.
NaN_string_present = df_cust_merge['first_name'].eq('NaN')

In [10]:
# Trying to find the missing value label by listing every distinct value in 'first_name'.

df_cust_merge['first_name'].unique()

array(['Charles', 'Deborah', 'Heather', 'Christina', 'nan', 'Sandra',
       'Ralph', 'Thomas', 'Randy', 'Kathryn', 'Willie', 'Billy', 'Steve',
       'Jonathan', 'James', 'Amanda', 'Todd', 'Andrea', 'Jimmy',
       'Cynthia', 'Elizabeth', 'Stephanie', 'Daniel', 'Nancy', 'Sarah',
       'Ann', 'Bruce', 'Major', 'Evelyn', 'Lori', 'Katherine', 'Joshua',
       'Mary', 'Lois', 'Alice', 'Merry', 'Johnny', 'Emily', 'Lisa',
       'Robert', 'Janet', 'Norma', 'Louise', 'Stephen', 'Rose', 'Juan',
       'Brian', 'Theresa', 'Samuel', 'Christopher', 'Harry', 'Carl',
       'Kelly', 'Matthew', 'John', 'Steven', 'Peter', 'Henry', 'Nicole',
       'Carolyn', 'Jeremy', 'Kevin', 'Terry', 'Annie', 'Dennis', 'Denise',
       'Beverly', 'Kimberly', 'Jerry', 'Carlos', 'Lawrence', 'Margaret',
       'Betty', 'Paul', 'Dorothy', 'Marie', 'Michael', 'Donald',
       'Roberto', 'Kathleen', 'Amy', 'Arthur', 'Jesse', 'Louis', 'Paula',
       'Patricia', 'Marilyn', 'Victor', 'Rebecca', 'Laura', 'Albert',
       

### The missing values are labeled as 'nan'.

In [11]:
# Flag rows whose first name is either exactly the string 'nan' or a real missing value.
# An exact comparison is used so that real names containing those letters are not flagged.
nan_string_present = df_cust_merge['first_name'].eq('nan') | df_cust_merge['first_name'].isna()

In [13]:
# Boolean mask: True where 'first_name' holds a text placeholder for a missing value.

placeholder_labels = ['nan', 'None']
first_name_placeholders = df_cust_merge['first_name'].isin(placeholder_labels)

In [15]:
# Convert every missing-value label in 'first_name' into a real NaN.
missing_labels = ['nan', 'None', None, np.nan]
df_cust_merge['first_name'] = df_cust_merge['first_name'].replace(to_replace=missing_labels, value=np.nan)

In [17]:
# Show the rows whose first name is missing.
df_cust_merge.loc[df_cust_merge['first_name'].isna()]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,...,first_name,last_name,gender,state,age,date_joined,n_dependants,fam_status,income,merge_status
6,1,Chocolate Sandwich Cookies,61,19,5.8,1871483,825,3,2,14,...,,Parks,Male,District of Columbia,20,5/8/2019,3,living with parents and siblings,34171,both
44,1,Chocolate Sandwich Cookies,61,19,5.8,2311344,2857,24,1,8,...,,Daniels,Female,Alabama,30,9/23/2017,2,married,91407,both
49,1,Chocolate Sandwich Cookies,61,19,5.8,1394659,4006,2,1,10,...,,Gould,Male,North Carolina,28,5/26/2017,0,single,96906,both
50,1,Chocolate Sandwich Cookies,61,19,5.8,2353526,4006,3,1,10,...,,Gould,Male,North Carolina,28,5/26/2017,0,single,96906,both
51,1,Chocolate Sandwich Cookies,61,19,5.8,2269226,4006,5,3,13,...,,Gould,Male,North Carolina,28,5/26/2017,0,single,96906,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404783,49688,Fresh Foaming Cleanser,73,11,13.5,1834067,43508,22,5,11,...,,Warner,Female,California,36,1/24/2017,1,married,98347,both
32404788,49688,Fresh Foaming Cleanser,73,11,13.5,538174,57383,12,1,20,...,,Rasmussen,Female,Delaware,35,6/1/2017,2,married,79354,both
32404797,49688,Fresh Foaming Cleanser,73,11,13.5,1737708,81870,7,6,13,...,,Rasmussen,Female,Indiana,18,4/8/2018,2,living with parents and siblings,80296,both
32404812,49688,Fresh Foaming Cleanser,73,11,13.5,3290206,121646,3,0,9,...,,Huerta,Female,Georgia,45,11/18/2017,1,married,102081,both


In [34]:
# Keep only the rows whose 'first_name' is missing, as a separate dataframe.

first_name_is_missing = df_cust_merge['first_name'].isna()
first_name_missing = df_cust_merge.loc[first_name_is_missing]

In [35]:
# Display the rows with a missing first name.
first_name_missing

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,...,first_name,last_name,gender,state,age,date_joined,n_dependants,fam_status,income,merge_status
6,1,Chocolate Sandwich Cookies,61,19,5.8,1871483,825,3,2,14,...,,Parks,Male,District of Columbia,20,5/8/2019,3,living with parents and siblings,34171,both
44,1,Chocolate Sandwich Cookies,61,19,5.8,2311344,2857,24,1,8,...,,Daniels,Female,Alabama,30,9/23/2017,2,married,91407,both
49,1,Chocolate Sandwich Cookies,61,19,5.8,1394659,4006,2,1,10,...,,Gould,Male,North Carolina,28,5/26/2017,0,single,96906,both
50,1,Chocolate Sandwich Cookies,61,19,5.8,2353526,4006,3,1,10,...,,Gould,Male,North Carolina,28,5/26/2017,0,single,96906,both
51,1,Chocolate Sandwich Cookies,61,19,5.8,2269226,4006,5,3,13,...,,Gould,Male,North Carolina,28,5/26/2017,0,single,96906,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404783,49688,Fresh Foaming Cleanser,73,11,13.5,1834067,43508,22,5,11,...,,Warner,Female,California,36,1/24/2017,1,married,98347,both
32404788,49688,Fresh Foaming Cleanser,73,11,13.5,538174,57383,12,1,20,...,,Rasmussen,Female,Delaware,35,6/1/2017,2,married,79354,both
32404797,49688,Fresh Foaming Cleanser,73,11,13.5,1737708,81870,7,6,13,...,,Rasmussen,Female,Indiana,18,4/8/2018,2,living with parents and siblings,80296,both
32404812,49688,Fresh Foaming Cleanser,73,11,13.5,3290206,121646,3,0,9,...,,Huerta,Female,Georgia,45,11/18/2017,1,married,102081,both


In [37]:
# Listing the distinct last names among the rows with a missing first name.

first_name_missing['last_name'].unique()

array(['Parks', 'Daniels', 'Gould', 'Mason', 'Sanchez', 'Pratt', 'Grimes',
       'Davenport', 'Koch', 'Dawson', 'Middleton', 'Bean', 'Ingram',
       'Copeland', 'Choi', 'Patterson', 'Hampton', 'Huerta', 'Nicholson',
       'Orr', 'Davila', 'Faulkner', 'Morse', 'Warner', 'Floyd', 'Page',
       'Melton', 'Morton', 'Mcintyre', 'Avery', 'Stanley', 'Owens',
       'Fischer', 'Gilbert', 'Williamson', 'Oconnor', 'Rasmussen',
       'Dejesus', 'Simpson', 'Chen', 'Dixon', 'Frost', 'Brandt', 'Truong',
       'Hutchinson', 'Miranda', 'Snow', 'Zuniga', 'Sims', 'Reese',
       'Obrien', 'Moran', 'Skinner', 'Patton', 'Delgado'], dtype=object)

In [38]:
# Counting the number of unique last names among the rows with a missing first name.
# NOTE: this counts distinct surnames, not customers. Several customers can share a surname,
# so first_name_missing['user_id'].nunique() is the way to count the affected customers.

first_name_missing['last_name'].nunique()

55

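### There are 55 unique last names among the rows with a missing first name. This counts distinct surnames, not affected customers: several customers can share a surname, and the raw customers file checked below has 11,259 customers with no first name.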

# Checking Raw Data 

In [18]:
# Load the raw customers file from the 'Original Data' folder.

customers_csv_file = os.path.join(path, '02 Data', 'Original Data', 'customers.csv')
df_customers = pd.read_csv(customers_csv_file, index_col=False)

In [19]:
# Checking the raw customers dataframe dimensions (rows, columns).
df_customers.shape

(206209, 10)

In [20]:
# Count of missing values per column in the raw customers data.

df_customers.isna().sum()

user_id             0
First Name      11259
Surnam              0
Gender              0
STATE               0
Age                 0
date_joined         0
n_dependants        0
fam_status          0
income              0
dtype: int64

In [23]:
# Boolean mask (not a dataframe): True where 'First Name' is missing in the raw customers data.
# This reuses the name nan_first_name, replacing the mask built earlier from the merged dataframe.

nan_first_name = df_customers['First Name'].isnull()

In [24]:
# Display the True/False mask (True marks a missing first name).
nan_first_name

0         False
1         False
2         False
3         False
4         False
          ...  
206204    False
206205    False
206206    False
206207    False
206208    False
Name: First Name, Length: 206209, dtype: bool

In [25]:
# NOTE: nan_first_name is already a True/False mask (the result of .isna() on 'First Name'),
# so calling .isna() on it again finds nothing missing and the sum is 0.
# The number of missing first names is nan_first_name.sum().
nan_first_name.isna().sum()

np.int64(0)

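### The result is 0 because `nan_first_name` is already a True/False mask and contains no NaN itself. The missing first names are its `True` entries, which are counted in the next cell.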

In [26]:
# Count the False (name present) and True (name missing) entries in the mask.
nan_first_name.value_counts(dropna = False)

First Name
False    194950
True      11259
Name: count, dtype: int64

### Missing values account for about 5.46% of the column.

In [27]:
# Distinct values in the mask: it only ever holds False and True.
nan_first_name.unique()

array([False,  True])

In [30]:
# Distinct values in the raw 'First Name' column, to see how the missing value is stored.
df_customers['First Name'].unique()

array(['Deborah', 'Patricia', 'Kenneth', 'Michelle', 'Ann', 'Cynthia',
       'Chris', 'Joseph', 'Jeremy', 'Shawn', 'Gloria', 'Roger',
       'Stephanie', 'Peter', 'Brandon', 'Amy', 'Brian', 'Victor',
       'Patrick', 'Randy', 'Wanda', 'Sandra', 'Martha', 'Wayne',
       'Charles', 'Ruth', 'Kimberly', 'Steve', 'Carl', 'Nicholas',
       'Stephen', 'Lisa', 'Judy', 'Doris', 'Barbara', 'Adam', 'Terry',
       'Marie', 'Rebecca', 'Lillian', 'Kathryn', 'Shirley', 'Douglas',
       'Christopher', 'Craig', 'Jimmy', 'Henry', 'Sara', 'Carlos', nan,
       'Jonathan', 'Bonnie', 'Julie', 'Beverly', 'Laura', 'Frank',
       'James', 'Brenda', 'Ashley', 'Bruce', 'Todd', 'Ralph', 'Keith',
       'Johnny', 'Irene', 'Annie', 'Carolyn', 'Gerald', 'Christina',
       'Matthew', 'Steven', 'Harry', 'Ruby', 'Joshua', 'Larry', 'John',
       'Emily', 'Albert', 'Kathy', 'Jacqueline', 'Karen', 'Alice',
       'Janice', 'Jeffrey', 'Benjamin', 'Phyllis', 'Susan', 'Katherine',
       'Jerry', 'Sharon', 'Daniel'

### In the raw data the missing value is a true `nan` (NaN), not the text 'nan' that appears in the merged dataframe.