# Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

# Importing Data

In [3]:
# Creating path to project folder.
# The location can be overridden with the INSTACART_PROJECT_PATH environment variable so the
# notebook can run on another machine; when it is not set, the original local folder is used.

path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'C:\Users\TanaT\(CF) Achievement 4 - Instacart Basket Analysis',
)

In [4]:
# Loading the merged customer/orders dataframe (exported from Part 1, used in 4.9 Part 2).

prepared_pickle = os.path.join(path, '02 Data', 'Prepared Data', 'customer_and_merged.pkl')
df_cust_merge = pd.read_pickle(prepared_pickle)

In [5]:
# Checking the dataframe loaded correctly by looking at its dimensions (rows, columns).

df_cust_merge.shape

(32404859, 34)

# Checking NaN Values (now set as object)

In [6]:
# Listing the column names before running the missing-value checks below.
# Background: the missing 'first_name' values were thought to be handled when the
# mixed-type column was changed to a string (object) column.
# This section re-checks for values that are actually missing but are now stored as objects.

df_cust_merge.columns

Index(['product_id', 'product_name', 'aisle_id', 'department_id', 'prices',
       'order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'add_to_cart_order',
       'reordered', '_merge', 'price_range_loc', 'busiest_day', 'busiest_days',
       'busiest_period_of_day', 'max_order', 'loyalty_flag',
       'avg_product_price', 'spending_flag', 'median_days_between_orders',
       'ordering_behavior_flag', 'first_name', 'last_name', 'gender', 'state',
       'age', 'date_joined', 'n_dependants', 'fam_status', 'income',
       'merge_status'],
      dtype='object')

In [7]:
# Counting the missing values in every column (isna is the same check as isnull).
df_cust_merge.isna().sum()

product_id                          0
product_name                        0
aisle_id                            0
department_id                       0
prices                           5127
order_id                            0
user_id                             0
order_number                        0
orders_day_of_week                  0
order_hour_of_day                   0
days_since_prior_order        2076096
add_to_cart_order                   0
reordered                           0
_merge                              0
price_range_loc                     0
busiest_day                         0
busiest_days                        0
busiest_period_of_day               0
max_order                           0
loyalty_flag                        0
avg_product_price                   0
spending_flag                       0
median_days_between_orders          5
ordering_behavior_flag              5
first_name                          0
last_name                           0
gender      

- The outliers in 'prices' were previously set to missing (NaN) so that they would not affect statistical checks.
- Values in 'days_since_prior_order' are missing because those rows belong to a user's first order.
- Values in the derived columns ('median_days_between_orders' and 'ordering_behavior_flag') are missing because those rows belong to one user's only order.

# Checking days since column

In [8]:
# Boolean mask: True on rows where 'days_since_prior_order' is missing.
nan_days_since = df_cust_merge['days_since_prior_order'].isna()

In [31]:
# nan_days_since is a boolean mask (True = missing), so summing it counts the missing values.
# Calling .isna() on the mask first would always give 0, because True/False are never NaN.
nan_days_since.sum()

np.int64(2076096)

In [32]:
# Showing the rows whose 'days_since_prior_order' value is missing.
df_cust_merge.loc[nan_days_since]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,...,first_name,last_name,gender,state,age,date_joined,n_dependants,fam_status,income,merge_status
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,...,Heather,Myers,Female,Wisconsin,40,2/9/2020,3,married,31308,both
10,1,Chocolate Sandwich Cookies,61,19,5.8,1798115,1379,1,5,11,...,Thomas,Howard,Male,Alaska,61,3/30/2020,1,married,42573,both
34,1,Chocolate Sandwich Cookies,61,19,5.8,2434812,1598,1,2,12,...,Billy,Ray,Male,Kansas,41,1/7/2018,2,married,122060,both
37,1,Chocolate Sandwich Cookies,61,19,5.8,627615,1647,1,1,9,...,Steve,Heath,Male,Indiana,67,4/11/2019,0,divorced/widowed,39869,both
42,1,Chocolate Sandwich Cookies,61,19,5.8,2722928,2833,1,0,14,...,Jonathan,Andersen,Male,North Carolina,37,1/30/2020,2,married,56559,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404829,49688,Fresh Foaming Cleanser,73,11,13.5,2744037,159487,1,2,18,...,Barbara,Harrington,Female,Florida,37,8/11/2018,1,married,87012,both
32404834,49688,Fresh Foaming Cleanser,73,11,13.5,2530161,163632,1,3,12,...,Clarence,Marsh,Male,New Hampshire,31,3/27/2018,2,married,67824,both
32404839,49688,Fresh Foaming Cleanser,73,11,13.5,1400940,166213,1,1,21,...,Lois,Aguilar,Female,Arkansas,52,2/27/2020,2,married,125863,both
32404849,49688,Fresh Foaming Cleanser,73,11,13.5,2251059,184081,1,6,9,...,Gerald,Mcconnell,Male,Nebraska,66,5/18/2019,0,divorced/widowed,106393,both


In [33]:
# Storing the mask as a new column so it can be viewed next to the order details.
# Note: this adds the column to df_cust_merge itself.
df_cust_merge['missing_days_since'] = nan_days_since

In [34]:
# Comparing the missing-value flag with the order number on each row.
days_since_check_cols = ['missing_days_since', 'order_number', 'days_since_prior_order', 'user_id']
df_cust_merge.loc[:, days_since_check_cols]

Unnamed: 0,missing_days_since,order_number,days_since_prior_order,user_id
0,False,28,3.0,138
1,False,30,20.0,138
2,False,2,6.0,709
3,True,1,,764
4,False,3,9.0,764
...,...,...,...,...
32404854,False,2,5.0,200215
32404855,True,1,,200377
32404856,False,5,15.0,200873
32404857,False,9,5.0,200873


In [35]:
# Restricting the same columns to first orders only (order_number == 1).
is_first_order = df_cust_merge['order_number'] == 1
df_cust_merge.loc[is_first_order, ['missing_days_since', 'order_number', 'days_since_prior_order', 'user_id']]


Unnamed: 0,missing_days_since,order_number,days_since_prior_order,user_id
3,True,1,,764
10,True,1,,1379
34,True,1,,1598
37,True,1,,1647
42,True,1,,2833
...,...,...,...,...
32404829,True,1,,159487
32404834,True,1,,163632
32404839,True,1,,166213
32404849,True,1,,184081


### All missing values relate to the first order.

# Checking Missing Prices

In [37]:
# Counting the missing values in the 'prices' column.
df_cust_merge['prices'].isna().sum()

np.int64(5127)

In [39]:
# Making column for missing prices (True where 'prices' is NaN), then previewing the first rows.
# Note: this adds the column to df_cust_merge itself.

df_cust_merge['missing_prices'] = df_cust_merge['prices'].isna()
df_cust_merge.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,...,gender,state,age,date_joined,n_dependants,fam_status,income,merge_status,missing_days_since,missing_prices
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,...,Male,Minnesota,81,8/1/2019,1,married,49620,both,False,False
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,...,Male,Minnesota,81,8/1/2019,1,married,49620,both,False,False
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,...,Female,Vermont,66,6/16/2018,2,married,158302,both,False,False
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,...,Female,Wisconsin,40,2/9/2020,3,married,31308,both,True,False
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,...,Female,Wisconsin,40,2/9/2020,3,married,31308,both,False,False


In [43]:
# Checking only where prices are missing.
# 'missing_prices' is already boolean, so it can be used directly as the row filter.

df_cust_merge.loc[df_cust_merge['missing_prices'], ['product_id', 'department_id', 'product_name', 'order_id', 'user_id']]

Unnamed: 0,product_id,department_id,product_name,order_id,user_id
13100147,21553,16,Lowfat 2% Milkfat Cottage Cheese,912404,17
13100148,21553,16,Lowfat 2% Milkfat Cottage Cheese,603376,17
13100149,21553,16,Lowfat 2% Milkfat Cottage Cheese,3264360,135
13100150,21553,16,Lowfat 2% Milkfat Cottage Cheese,892534,135
13100151,21553,16,Lowfat 2% Milkfat Cottage Cheese,229704,342
...,...,...,...,...,...
21786876,33664,16,2 % Reduced Fat Milk,2249946,204099
21786877,33664,16,2 % Reduced Fat Milk,2363282,204099
21786878,33664,16,2 % Reduced Fat Milk,3181945,204395
21786879,33664,16,2 % Reduced Fat Milk,2486215,205227


### 5127 missing prices - the same number of prices that were previously set to missing as outliers (anything over 100 dollars).

In [46]:
# Listing the unique department IDs.
# NOTE(review): this runs on the whole dataframe, not only on rows where 'missing_prices' is True.
# Filter on that flag first if the goal is to see which departments have missing prices.

df_cust_merge['department_id'].unique()

array([19, 13,  7,  1, 11, 16, 17, 18, 12,  9,  8, 14, 15,  4, 21,  6, 20,
        5,  3,  2, 10])

In [47]:
# Listing the unique product names (whole dataframe, not filtered to missing prices).
df_cust_merge['product_name'].unique()

array(['Chocolate Sandwich Cookies', 'All-Seasons Salt',
       'Robust Golden Unsweetened Oolong Tea', ..., 'Artisan Baguette',
       'Smartblend Healthy Metabolism Dry Cat Food',
       'Fresh Foaming Cleanser'], dtype=object)

In [49]:
# Counting the distinct product names in the whole dataframe.
df_cust_merge['product_name'].nunique()

49661

# Checking first name column

In [9]:
# Checking the 'first_name' column: boolean mask that is True where the value is a real NaN.
# Note: the name nan_first_name is assigned again further down, to a mask built from df_customers.

nan_first_name = df_cust_merge['first_name'].isna()

In [10]:
# Flagging first names that contain the text 'nan'; missing entries count as "no match".
# This is a substring test, so a name that merely contains 'nan' is flagged as well.
nan_string_present = df_cust_merge['first_name'].str.contains(pat='nan', regex=False, na=False)

In [11]:
# Same check for the capitalised spelling 'NaN'; missing entries count as "no match".
NaN_string_present = df_cust_merge['first_name'].str.contains(pat='NaN', regex=False, na=False)

In [12]:
# Trying to find the missing value label by listing every distinct first name.

df_cust_merge['first_name'].unique()

array(['Charles', 'Deborah', 'Heather', 'Christina', 'nan', 'Sandra',
       'Ralph', 'Thomas', 'Randy', 'Kathryn', 'Willie', 'Billy', 'Steve',
       'Jonathan', 'James', 'Amanda', 'Todd', 'Andrea', 'Jimmy',
       'Cynthia', 'Elizabeth', 'Stephanie', 'Daniel', 'Nancy', 'Sarah',
       'Ann', 'Bruce', 'Major', 'Evelyn', 'Lori', 'Katherine', 'Joshua',
       'Mary', 'Lois', 'Alice', 'Merry', 'Johnny', 'Emily', 'Lisa',
       'Robert', 'Janet', 'Norma', 'Louise', 'Stephen', 'Rose', 'Juan',
       'Brian', 'Theresa', 'Samuel', 'Christopher', 'Harry', 'Carl',
       'Kelly', 'Matthew', 'John', 'Steven', 'Peter', 'Henry', 'Nicole',
       'Carolyn', 'Jeremy', 'Kevin', 'Terry', 'Annie', 'Dennis', 'Denise',
       'Beverly', 'Kimberly', 'Jerry', 'Carlos', 'Lawrence', 'Margaret',
       'Betty', 'Paul', 'Dorothy', 'Marie', 'Michael', 'Donald',
       'Roberto', 'Kathleen', 'Amy', 'Arthur', 'Jesse', 'Louis', 'Paula',
       'Patricia', 'Marilyn', 'Victor', 'Rebecca', 'Laura', 'Albert',
       

### The missing values are stored as the string 'nan'.

In [13]:
# Flagging every missing first name: either a real NaN or exactly the placeholder string 'nan'.
# An exact comparison is used because a substring search for 'nan' would also flag any real
# name that happens to contain those letters.
nan_string_present = df_cust_merge['first_name'].isna() | (df_cust_merge['first_name'] == 'nan')

In [14]:
# Find rows where the first_name is a placeholder string for missing values

placeholder_labels = ['nan', 'None']
first_name_placeholders = df_cust_merge['first_name'].isin(placeholder_labels)

In [15]:
# Converting the placeholder strings (and any None/NaN already present) to a real NaN.
# Note: this overwrites the 'first_name' column of df_cust_merge.
missing_labels = ['nan', 'None', None, np.nan]
df_cust_merge['first_name'] = df_cust_merge['first_name'].replace(missing_labels, np.nan)

In [16]:
# Showing the rows where 'first_name' is now a real missing value.
df_cust_merge.loc[df_cust_merge['first_name'].isna()]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,...,first_name,last_name,gender,state,age,date_joined,n_dependants,fam_status,income,merge_status
6,1,Chocolate Sandwich Cookies,61,19,5.8,1871483,825,3,2,14,...,,Parks,Male,District of Columbia,20,5/8/2019,3,living with parents and siblings,34171,both
44,1,Chocolate Sandwich Cookies,61,19,5.8,2311344,2857,24,1,8,...,,Daniels,Female,Alabama,30,9/23/2017,2,married,91407,both
49,1,Chocolate Sandwich Cookies,61,19,5.8,1394659,4006,2,1,10,...,,Gould,Male,North Carolina,28,5/26/2017,0,single,96906,both
50,1,Chocolate Sandwich Cookies,61,19,5.8,2353526,4006,3,1,10,...,,Gould,Male,North Carolina,28,5/26/2017,0,single,96906,both
51,1,Chocolate Sandwich Cookies,61,19,5.8,2269226,4006,5,3,13,...,,Gould,Male,North Carolina,28,5/26/2017,0,single,96906,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404783,49688,Fresh Foaming Cleanser,73,11,13.5,1834067,43508,22,5,11,...,,Warner,Female,California,36,1/24/2017,1,married,98347,both
32404788,49688,Fresh Foaming Cleanser,73,11,13.5,538174,57383,12,1,20,...,,Rasmussen,Female,Delaware,35,6/1/2017,2,married,79354,both
32404797,49688,Fresh Foaming Cleanser,73,11,13.5,1737708,81870,7,6,13,...,,Rasmussen,Female,Indiana,18,4/8/2018,2,living with parents and siblings,80296,both
32404812,49688,Fresh Foaming Cleanser,73,11,13.5,3290206,121646,3,0,9,...,,Huerta,Female,Georgia,45,11/18/2017,1,married,102081,both


In [17]:
# Making a new dataframe to only contain the missing values in 'first_name' column.

first_name_is_missing = df_cust_merge['first_name'].isna()
first_name_missing = df_cust_merge.loc[first_name_is_missing]

In [18]:
# Displaying the rows with a missing first name.
first_name_missing

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,...,first_name,last_name,gender,state,age,date_joined,n_dependants,fam_status,income,merge_status
6,1,Chocolate Sandwich Cookies,61,19,5.8,1871483,825,3,2,14,...,,Parks,Male,District of Columbia,20,5/8/2019,3,living with parents and siblings,34171,both
44,1,Chocolate Sandwich Cookies,61,19,5.8,2311344,2857,24,1,8,...,,Daniels,Female,Alabama,30,9/23/2017,2,married,91407,both
49,1,Chocolate Sandwich Cookies,61,19,5.8,1394659,4006,2,1,10,...,,Gould,Male,North Carolina,28,5/26/2017,0,single,96906,both
50,1,Chocolate Sandwich Cookies,61,19,5.8,2353526,4006,3,1,10,...,,Gould,Male,North Carolina,28,5/26/2017,0,single,96906,both
51,1,Chocolate Sandwich Cookies,61,19,5.8,2269226,4006,5,3,13,...,,Gould,Male,North Carolina,28,5/26/2017,0,single,96906,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404783,49688,Fresh Foaming Cleanser,73,11,13.5,1834067,43508,22,5,11,...,,Warner,Female,California,36,1/24/2017,1,married,98347,both
32404788,49688,Fresh Foaming Cleanser,73,11,13.5,538174,57383,12,1,20,...,,Rasmussen,Female,Delaware,35,6/1/2017,2,married,79354,both
32404797,49688,Fresh Foaming Cleanser,73,11,13.5,1737708,81870,7,6,13,...,,Rasmussen,Female,Indiana,18,4/8/2018,2,living with parents and siblings,80296,both
32404812,49688,Fresh Foaming Cleanser,73,11,13.5,3290206,121646,3,0,9,...,,Huerta,Female,Georgia,45,11/18/2017,1,married,102081,both


In [19]:
# Listing the unique last names among the rows with a missing first name.

first_name_missing['last_name'].unique()

array(['Parks', 'Daniels', 'Gould', 'Mason', 'Sanchez', 'Pratt', 'Grimes',
       'Davenport', 'Koch', 'Dawson', 'Middleton', 'Bean', 'Ingram',
       'Copeland', 'Choi', 'Patterson', 'Hampton', 'Huerta', 'Nicholson',
       'Orr', 'Davila', 'Faulkner', 'Morse', 'Warner', 'Floyd', 'Page',
       'Melton', 'Morton', 'Mcintyre', 'Avery', 'Stanley', 'Owens',
       'Fischer', 'Gilbert', 'Williamson', 'Oconnor', 'Rasmussen',
       'Dejesus', 'Simpson', 'Chen', 'Dixon', 'Frost', 'Brandt', 'Truong',
       'Hutchinson', 'Miranda', 'Snow', 'Zuniga', 'Sims', 'Reese',
       'Obrien', 'Moran', 'Skinner', 'Patton', 'Delgado'], dtype=object)

In [20]:
# Counting the number of unique last names (distinct surnames, not distinct customers).

first_name_missing['last_name'].nunique()

55

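### There are 55 unique last names among the rows with a missing first name. This is not the number of customers with a missing first name: different customers can share a last name (for example, users 57383 and 81870 are both 'Rasmussen'), so unique 'user_id' values would have to be counted for that.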

# Checking Raw Data 

In [21]:
# Importing customers data (raw)

customers_csv = os.path.join(path, '02 Data', 'Original Data', 'customers.csv')
df_customers = pd.read_csv(customers_csv, index_col=False)

In [22]:
# Checking the dimensions (rows, columns) of the raw customers data.
df_customers.shape

(206209, 10)

In [23]:
# Checking for missing values in each column of the raw customers data.

df_customers.isna().sum()

user_id             0
First Name      11259
Surnam              0
Gender              0
STATE               0
Age                 0
date_joined         0
n_dependants        0
fam_status          0
income              0
dtype: int64

In [24]:
# Creating a boolean mask (True where 'First Name' is missing) from the raw customers data.
# Note: this reuses the name nan_first_name, which earlier held the mask for df_cust_merge.

nan_first_name = df_customers['First Name'].isna()

In [25]:
# Displaying the mask (True = missing first name).
nan_first_name

0         False
1         False
2         False
3         False
4         False
          ...  
206204    False
206205    False
206206    False
206207    False
206208    False
Name: First Name, Length: 206209, dtype: bool

In [26]:
# NOTE(review): nan_first_name is already a boolean mask (True = missing), so .isna() on it
# is always False and this sum is 0 whatever the data holds.
# nan_first_name.sum() is the call that counts the missing first names.
nan_first_name.isna().sum()

np.int64(0)

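### Missing values not showing: `nan_first_name` is already a True/False mask, so calling `.isna()` on it always returns 0. The True values need to be counted instead.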

In [27]:
# Counting the True (missing) and False (present) entries in the mask.
nan_first_name.value_counts(dropna = False)

First Name
False    194950
True      11259
Name: count, dtype: int64

### Missing values account for about 5.46% of the column.

In [28]:
# Listing the distinct values held by the mask.
nan_first_name.unique()

array([False,  True])

In [29]:
# Listing the distinct raw first names to see how the missing value appears.
df_customers['First Name'].unique()

array(['Deborah', 'Patricia', 'Kenneth', 'Michelle', 'Ann', 'Cynthia',
       'Chris', 'Joseph', 'Jeremy', 'Shawn', 'Gloria', 'Roger',
       'Stephanie', 'Peter', 'Brandon', 'Amy', 'Brian', 'Victor',
       'Patrick', 'Randy', 'Wanda', 'Sandra', 'Martha', 'Wayne',
       'Charles', 'Ruth', 'Kimberly', 'Steve', 'Carl', 'Nicholas',
       'Stephen', 'Lisa', 'Judy', 'Doris', 'Barbara', 'Adam', 'Terry',
       'Marie', 'Rebecca', 'Lillian', 'Kathryn', 'Shirley', 'Douglas',
       'Christopher', 'Craig', 'Jimmy', 'Henry', 'Sara', 'Carlos', nan,
       'Jonathan', 'Bonnie', 'Julie', 'Beverly', 'Laura', 'Frank',
       'James', 'Brenda', 'Ashley', 'Bruce', 'Todd', 'Ralph', 'Keith',
       'Johnny', 'Irene', 'Annie', 'Carolyn', 'Gerald', 'Christina',
       'Matthew', 'Steven', 'Harry', 'Ruby', 'Joshua', 'Larry', 'John',
       'Emily', 'Albert', 'Kathy', 'Jacqueline', 'Karen', 'Alice',
       'Janice', 'Jeffrey', 'Benjamin', 'Phyllis', 'Susan', 'Katherine',
       'Jerry', 'Sharon', 'Daniel'

### In the raw data the missing value is a true NaN (displayed as `nan` without quotes), not the string 'nan'.