CONTENTS LIST:
- Checking for mixed data types
- Checking for missing data
- Checking for duplicates
- Checking for outliers

#01 Importing libraries

In [3]:
#importing libraries
import pandas as pd
import numpy as np
import os

#02 Import data

In [5]:
path = '/Users/gingermoore/Documents/04-2025 Instacart Basket Analysis'

In [6]:
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'products.csv'), index_col = False)

In [7]:
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'), index_col = False)

#03 Mixed Data Types

In [9]:
# Creating a playground data frame
df_test = pd.DataFrame()

In [10]:
# Creating a mixed type column
df_test['mix'] = ['a', 'b', 1, True]

In [11]:
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [12]:
# Checking for mixed type data
for col in df_test.columns.tolist():
  weird = (df_test[[col]].map(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_test[weird]) > 0:
    print (col)

mix


In [13]:
df_test['mix'] = df_test['mix'].astype('str')

#04 Missing Values

In [15]:
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [16]:
# Creating subset to view null values
df_nan = df_prods[df_prods['product_name'].isnull() == True]

In [17]:
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [18]:
df_prods.shape

(49693, 5)

In [19]:
df_nan.shape

(16, 5)

In [20]:
# Creating new dataframe without missing values
df_prods_clean = df_prods[df_prods['product_name'].isnull() == False]

In [21]:
df_prods_clean.shape

(49677, 5)

#05 Duplicates

In [23]:
# Checking for full duplicates
df_dups = df_prods_clean[df_prods_clean.duplicated()]

In [24]:
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [25]:
# Checking df_prods_clean for comparison
df_prods_clean.shape

(49677, 5)

In [26]:
# Creating new df without duplicates
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [27]:
df_prods_clean_no_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


In [28]:
df_prods_clean_no_dups.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'))

#06 Cleaning the orders dataframe

In [30]:
# Checking the orders df
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,orders_chronological,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


The only thing I see with this data is that the order and user ID columns need to be formatted as strings, not integers. The rest looks fine.

In [32]:
# Looking for mixed data types
for col in df_ords.columns.tolist():
  weird = (df_ords[[col]].map(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_ords[weird]) > 0:
    print (col)

There were no results returned, which I assume means that there are no mixed data types.

In [34]:
#Checking for null values
df_ords.isnull().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
eval_set                       0
orders_chronological           0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [35]:
df_ords.shape

(3421083, 8)

There are over 200k null values in the days_since_prior_order column. If this data is automatically generated when a customer places an order, that might mean there is or was some kind of bug in the app that's causing it not to report the data. If customers report it themselves (which I find unlikely), then it could just be that they're choosing not to report it.

In [37]:
# Reloading the orders df because I decided to impute values before thinking things all the way through.
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'), index_col = False)

In [38]:
df_ords

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,orders_chronological,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,prior,1,2,8,
1,1,2398795,1,prior,2,3,7,15.0
2,2,473747,1,prior,3,3,12,21.0
3,3,2254736,1,prior,4,4,7,29.0
4,4,431534,1,prior,5,4,15,28.0
...,...,...,...,...,...,...,...,...
3421078,3421078,2266710,206209,prior,10,5,18,29.0
3421079,3421079,1854736,206209,prior,11,4,10,30.0
3421080,3421080,626363,206209,prior,12,1,12,18.0
3421081,3421081,2977660,206209,prior,13,1,12,7.0


I ran some code that imputed the mean value of 11 to the days_since_prior_order column when there were null values. However, I did not consider that perhaps these are just first-time customers, which means the null values are not incorrect--there are simply no prior orders to refer to. I decided to add a new column to indicate if this is a new customer or not. This took a little research.

In [40]:
df_ords['is_new'] = np.where(df_ords['days_since_prior_order'].isnull(), True, False)

In [41]:
df_ords.head()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,orders_chronological,orders_day_of_week,order_hour_of_day,days_since_prior_order,is_new
0,0,2539329,1,prior,1,2,8,,True
1,1,2398795,1,prior,2,3,7,15.0,False
2,2,473747,1,prior,3,3,12,21.0,False
3,3,2254736,1,prior,4,4,7,29.0,False
4,4,431534,1,prior,5,4,15,28.0,False


In [42]:
df_ords.drop(columns = 'eval_set')

Unnamed: 0.1,Unnamed: 0,order_id,user_id,orders_chronological,orders_day_of_week,order_hour_of_day,days_since_prior_order,is_new
0,0,2539329,1,1,2,8,,True
1,1,2398795,1,2,3,7,15.0,False
2,2,473747,1,3,3,12,21.0,False
3,3,2254736,1,4,4,7,29.0,False
4,4,431534,1,5,4,15,28.0,False
...,...,...,...,...,...,...,...,...
3421078,3421078,2266710,206209,10,5,18,29.0,False
3421079,3421079,1854736,206209,11,4,10,30.0,False
3421080,3421080,626363,206209,12,1,12,18.0,False
3421081,3421081,2977660,206209,13,1,12,7.0,False


In [43]:
# Checking for duplicates
df_dups = df_ords[df_ords.duplicated()]

In [44]:
df_dups

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,orders_chronological,orders_day_of_week,order_hour_of_day,days_since_prior_order,is_new


There appear to be no duplicate values.

In [46]:
df_dups.shape

(0, 9)

Before exporting my data, I'm going to drop the eval_set column.

In [48]:
df_ords_checked = df_ords.drop(columns = 'eval_set')

In [49]:
df_ords_checked.head()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,orders_chronological,orders_day_of_week,order_hour_of_day,days_since_prior_order,is_new
0,0,2539329,1,1,2,8,,True
1,1,2398795,1,2,3,7,15.0,False
2,2,473747,1,3,3,12,21.0,False
3,3,2254736,1,4,4,7,29.0,False
4,4,431534,1,5,4,15,28.0,False


In [50]:
df_ords_checked.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv'))

In [107]:
#running additional consistency check on df_prods
df_prods_checked = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'), index_col = False)

In [52]:
df_prods_checked.describe()

Unnamed: 0.1,Unnamed: 0,product_id,aisle_id,department_id,prices
count,49672.0,49672.0,49672.0,49672.0,49672.0
mean,24852.005053,24850.349775,67.762442,11.728942,9.993282
std,14342.265579,14340.705287,38.315784,5.850779,453.615536
min,0.0,1.0,1.0,1.0,1.0
25%,12432.75,12432.75,35.0,7.0,4.1
50%,24851.5,24850.5,69.0,13.0,7.1
75%,37272.25,37268.25,100.0,17.0,11.1
max,49692.0,49688.0,134.0,21.0,99999.0


Ah, I see the problem. There's at least one value that's too high. Let's drop that observation.

In [54]:
#Checking frequency to see if there are any other values that may be high
df_prods_checked['prices'].value_counts()

prices
2.5        470
5.3        458
6.2        451
2.6        447
5.4        444
          ... 
15.6         1
21.0         1
99999.0      1
14900.0      1
18.3         1
Name: count, Length: 242, dtype: int64

In [55]:
# Locating the specific rows with these high values
df_prods_checked[df_prods_checked['prices'] >= 99999]

Unnamed: 0.1,Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33649,33666,33664,2 % Reduced Fat Milk,84,16,99999.0


In [56]:
df_prods_checked[df_prods_checked['prices'] == 14900]

Unnamed: 0.1,Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
21538,21554,21553,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0


In [57]:
#Creating new df with these observations dropped
df_prods_cleaned = df_prods_checked.drop([33649,21538])

In [58]:
df_prods_cleaned.describe()

Unnamed: 0.1,Unnamed: 0,product_id,aisle_id,department_id,prices
count,49670.0,49670.0,49670.0,49670.0,49670.0
mean,24851.894,24850.238716,67.761305,11.72877,7.680437
std,14342.492172,14340.93185,38.31606,5.850834,4.199381
min,0.0,1.0,1.0,1.0,1.0
25%,12432.25,12432.25,35.0,7.0,4.1
50%,24851.5,24850.5,69.0,13.0,7.1
75%,37272.75,37268.75,100.0,17.0,11.1
max,49692.0,49688.0,134.0,21.0,25.0


I think this looks much better.

In [60]:
df_prods_cleaned.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'products_clean.csv'))