# Contents in this Notebook
1. Importing libraries and data
2. Data Consistency Checks (exercise)
3. Data Consistency Checks (task)

# Library Imports

In [37]:
# Importing Libraries
import pandas as pd
import numpy as np
import os

In [38]:
# Defining Path
# Project root folder; every input/output location in this notebook is built from it.
# Set the INSTACART_PROJECT_PATH environment variable to run the notebook on another
# machine; when it is not set, the original local folder is used.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'C:\Users\seann\Downloads\Career Foundry Tasks\12-08-22 Instacart Basket Analysis',
)

In [39]:
# Load the original products table from the project's Original Data folder
products_csv = os.path.join(path, '02 Data', 'Original Data', 'products.csv')
df_prods = pd.read_csv(products_csv, index_col=False)

In [74]:
# Load the wrangled orders table from the project's Prepared Data folder
orders_wrangled_csv = os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv')
df_ords = pd.read_csv(orders_wrangled_csv, index_col=False)

# 4.4 Exercise Data Consistency Checks

In [41]:
# Start from an empty frame (no rows, no columns) for the exercise
df_test = pd.DataFrame(data=None)

In [42]:
# Build a column that deliberately mixes str, int and bool values
mixed_values = ['a', 'b', 1, True]
df_test['mix'] = mixed_values

In [43]:
# Preview the first rows of the test frame to confirm the 'mix' column was added
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [44]:
# Check for mixed types: print every column whose values span more than one
# Python type (e.g. str next to int/bool).
# Series.map(type) is used instead of DataFrame.applymap, which is deprecated
# since pandas 2.1. Counting distinct types also avoids indexing row 0, so an
# empty frame is handled without raising.
for col in df_test.columns.tolist():
    distinct_type_count = df_test[col].map(type).nunique()
    if distinct_type_count > 1:
        print(col)

mix


In [45]:
# Cast every value in 'mix' to a string so the column holds a single type
mix_as_text = df_test['mix'].astype(str)
df_test['mix'] = mix_as_text

In [46]:
# Count missing values per column of the products table
df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [47]:
# Subset of products whose product_name is missing.
# isna() already yields a boolean mask, so no '== True' comparison is needed.
df_nan = df_prods[df_prods['product_name'].isna()]

In [48]:
# Display the products that have no product_name
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [49]:
# (rows, columns) of the products table before any cleaning
df_prods.shape

(49693, 5)

In [50]:
# Keep only products that have a product_name; df_prods itself is left untouched.
# dropna(subset=...) replaces the 'isnull() == False' mask and selects the same rows.
df_prods_clean = df_prods.dropna(subset=['product_name'])

In [51]:
# (rows, columns) after removing products with a missing product_name
df_prods_clean.shape

(49677, 5)

In [52]:
# Rows that exactly repeat an earlier row across every column
repeated_row_mask = df_prods_clean.duplicated()
df_dups = df_prods_clean.loc[repeated_row_mask]

In [53]:
# Display the fully duplicated product rows
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [54]:
# (rows, columns) before dropping duplicates, for comparison with the de-duplicated frame
df_prods_clean.shape

(49677, 5)

In [55]:
# Drop fully duplicated rows, keeping the first occurrence of each (the pandas default)
df_prods_clean_no_dups = df_prods_clean.drop_duplicates(keep='first')

In [56]:
# (rows, columns) after dropping duplicated rows
df_prods_clean_no_dups.shape

(49672, 5)

In [57]:
# Export the cleaned, de-duplicated products table to the Prepared Data folder.
# The row index is written as the first column (to_csv default).
products_checked_csv = os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv')
df_prods_clean_no_dups.to_csv(products_checked_csv)

# 4.5 Data Consistency Check (Task)

In [58]:
# Summary statistics for the numeric columns of the original (uncleaned) products table
df_prods.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49693.0,49693.0,49693.0,49693.0
mean,24844.345139,67.770249,11.728433,9.994136
std,14343.717401,38.316774,5.850282,453.519686
min,1.0,1.0,1.0,1.0
25%,12423.0,35.0,7.0,4.1
50%,24845.0,69.0,13.0,7.1
75%,37265.0,100.0,17.0,11.2
max,49688.0,134.0,21.0,99999.0


One thing that should be investigated further is the maximum of the `prices` column. The max is $99,999, which is an extreme outlier compared with the other summary statistics (the 75th percentile is only $11.20). This may be a data-entry mistake and should be checked before any analysis that uses prices.

In [59]:
# Checking for mixed type data in df_ords dataframe: print every column whose
# values span more than one Python type.
# Series.map(type) is used instead of DataFrame.applymap, which is deprecated
# since pandas 2.1. Counting distinct types also avoids indexing row 0, so an
# empty frame is handled without raising.
for col in df_ords.columns.tolist():
    distinct_type_count = df_ords[col].map(type).nunique()
    if distinct_type_count > 1:
        print(col)

No mixed type data found

In [65]:
# Count missing values per column of the orders table
df_ords.isna().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
amount_of_orders_made          0
order_day_of_week              0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

There are 206,209 missing values in the `days_since_prior_order` column of the orders dataframe.

In [61]:
# Subset of orders with no recorded days_since_prior_order.
# isna() already yields a boolean mask, so no '== True' comparison is needed.
df_ords_nan = df_ords[df_ords['days_since_prior_order'].isna()]

In [62]:
# Display the orders whose days_since_prior_order is missing (pandas truncates the long output)
df_ords_nan

Unnamed: 0.1,Unnamed: 0,order_id,user_id,amount_of_orders_made,order_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,1,2,8,
11,11,2168274,2,1,2,11,
26,26,1374495,3,1,1,14,
39,39,3343014,4,1,6,11,
45,45,2717275,5,1,3,12,
...,...,...,...,...,...,...,...
3420930,3420930,969311,206205,1,4,12,
3420934,3420934,3189322,206206,1,3,18,
3421002,3421002,2166133,206207,1,6,19,
3421019,3421019,2227043,206208,1,1,15,


In [64]:
# Show the first 30 orders to see where the missing values fall within a user's order sequence
df_ords.head(30)

Unnamed: 0.1,Unnamed: 0,order_id,user_id,amount_of_orders_made,order_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,1,2,8,
1,1,2398795,1,2,3,7,15.0
2,2,473747,1,3,3,12,21.0
3,3,2254736,1,4,4,7,29.0
4,4,431534,1,5,4,15,28.0
5,5,3367565,1,6,2,7,19.0
6,6,550135,1,7,1,9,20.0
7,7,3108588,1,8,1,14,14.0
8,8,2295261,1,9,1,16,0.0
9,9,2550362,1,10,4,8,30.0


It seems that the missing values only show up for the first order each customer ever made, meaning the NaN is just a placeholder: there is no earlier order to measure the gap from.

In [67]:
# Address Missing Values
# Replace NaN in days_since_prior_order with the flag value 999 rather than
# dropping or imputing those rows.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on df_ords[...] is chained assignment, which emits a FutureWarning in
# pandas 2.2 and leaves df_ords unchanged under Copy-on-Write (the default
# from pandas 3.0).
# NOTE(review): 999 is an ordinary number to pandas, so exclude it before
# computing statistics such as the mean or max of this column.
df_ords['days_since_prior_order'] = df_ords['days_since_prior_order'].fillna(999)

In [68]:
# Re-check the first 20 orders to confirm the missing values were replaced
df_ords.head(20)

Unnamed: 0.1,Unnamed: 0,order_id,user_id,amount_of_orders_made,order_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,1,2,8,999.0
1,1,2398795,1,2,3,7,15.0
2,2,473747,1,3,3,12,21.0
3,3,2254736,1,4,4,7,29.0
4,4,431534,1,5,4,15,28.0
5,5,3367565,1,6,2,7,19.0
6,6,550135,1,7,1,9,20.0
7,7,3108588,1,8,1,14,14.0
8,8,2295261,1,9,1,16,0.0
9,9,2550362,1,10,4,8,30.0


To address the missing values, I replaced each NaN with a flag value of 999 so that these rows stay in the data set and remain easy to identify. The NaN carries important information: it tells us there were no prior orders before that order, so we don't want to remove those rows. We don't want to impute the mean or median either, because that would not give an accurate portrayal of the data in this instance. Note that 999 is still a number, so it must be filtered out before computing statistics (e.g. mean or max) on this column; otherwise it will skew the results.

In [69]:
# Checking for duplicates
# Keep only the rows that exactly repeat an earlier row across every column
repeated_order_mask = df_ords.duplicated()
df_ords_dups = df_ords.loc[repeated_order_mask]

In [70]:
# Display the duplicated order rows (an empty frame means none were found)
df_ords_dups

Unnamed: 0.1,Unnamed: 0,order_id,user_id,amount_of_orders_made,order_day_of_week,order_hour_of_day,days_since_prior_order


In [72]:
# size is rows x columns, so 0 confirms the duplicate subset is empty
df_ords_dups.size

0

There are no duplicates in the df_ords dataframe.

In [73]:
# Export the checked orders table to the Prepared Data folder.
# The row index is written as the first column (to_csv default).
orders_checked_csv = os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv')
df_ords.to_csv(orders_checked_csv)