# 01. Importing Libraries

In [113]:
# Import Libraries
import pandas as pd
import numpy as np
import os

# 02. Importing Data

In [None]:
# Importing 'products.csv' as df_prods from the 'Original Data' folder and 'orders_wrangled.csv'
# as df_ords from the 'Prepared Data' folder

In [114]:
# Location of the raw products file inside the 'Original Data' folder.
# NOTE(review): this is a machine-specific absolute path; adjust it when running elsewhere.
path_prods = (
    '/Users/robertochidiac/Desktop/Instacart Basket Analysis'
    '/Data/Original Data/products.csv'
)

In [115]:
path_prods

'/Users/robertochidiac/Desktop/Instacart Basket Analysis/Data/Original Data/products.csv'

In [116]:
# Location of the wrangled orders file inside the 'Prepared Data' folder.
# NOTE(review): this is a machine-specific absolute path; adjust it when running elsewhere.
path_ords = (
    '/Users/robertochidiac/Desktop/Instacart Basket Analysis'
    '/Data/Prepared Data/orders_wrangled.csv'
)

In [117]:
path_ords

'/Users/robertochidiac/Desktop/Instacart Basket Analysis/Data/Prepared Data/orders_wrangled.csv'

In [118]:
# Load the products file. path_prods is already a complete path, so it is passed
# straight to read_csv (os.path.join with a single argument returns it unchanged).
df_prods = pd.read_csv(path_prods, index_col=False)

In [119]:
# Load the wrangled orders file. path_ords is already a complete path, so it is passed
# straight to read_csv (os.path.join with a single argument returns it unchanged).
df_ords = pd.read_csv(path_ords, index_col=False)

# 03. Data Consistency Checks

In [120]:
# Creating an empty test dataframe to practice handling mixed-type data
df_test = pd.DataFrame(data=None)

In [121]:
# Add a 'mix' column holding str, int and bool values so that it is mixed-type
df_test = df_test.assign(mix=['a', 'b', 1, True])

In [122]:
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [123]:
# Print the name of every column in df_test whose values do not all share a single
# Python type (i.e. a mixed-type column).
# Series.map(type) replaces DataFrame.applymap, which is deprecated since pandas 2.1.
# Counting the distinct types also works on a dataframe with zero rows, where the
# previous comparison against the first row (iloc[0]) raised an IndexError.
for col in df_test.columns:
    if df_test[col].map(type).nunique() > 1:
        print(col)

mix


In [124]:
# Cast every value in the 'mix' column to a string so the column has a single type
df_test['mix'] = df_test['mix'].astype(str)

In [125]:
# Count the missing values in each column of 'df_prods'
df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [126]:
# Subset of 'df_prods' containing only the rows whose 'product_name' is missing
df_nan = df_prods.loc[df_prods['product_name'].isna()]

In [127]:
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [128]:
df_prods.shape

(49693, 5)

In [129]:
# New dataframe built from 'df_prods' that keeps only the rows with a 'product_name'
df_prods_clean = df_prods.loc[df_prods['product_name'].notna()]

In [130]:
df_prods_clean.shape

(49677, 5)

In [131]:
# Rows of 'df_prods_clean' that are full duplicates of an earlier row
df_dups = df_prods_clean.loc[df_prods_clean.duplicated(keep='first')]

In [132]:
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [133]:
df_prods_clean.shape

(49677, 5)

In [134]:
# New dataframe without the full duplicates identified above
# (the first occurrence of each duplicated row is kept)
df_prods_clean_no_dups = df_prods_clean.drop_duplicates(keep='first')

In [135]:
df_prods_clean_no_dups.shape

(49672, 5)

In [136]:
df_prods_clean_no_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
5,6,Dry Nose Oil,11,11,2.6
6,7,Pure Coconut Water With Orange,98,7,4.4
7,8,Cut Russet Potatoes Steam N' Mash,116,1,1.1
8,9,Light Strawberry Blueberry Yogurt,120,16,7.0
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4


In [None]:
# The code below shows the steps for exporting the cleaned 'df_prods' dataframe to a CSV file

In [137]:
# 'Prepared Data' folder that receives the cleaned products export.
# NOTE(review): this is a machine-specific absolute path; adjust it when running elsewhere.
path_deps_clean = (
    '/Users/robertochidiac/Desktop/Instacart Basket Analysis'
    '/Data/Prepared Data'
)

In [138]:
# Export the cleaned products to 'products_checked.csv'.
# index=False keeps the row index out of the file; otherwise it is written as an extra
# unnamed column that shows up as 'Unnamed: 0' when the file is read back.
# NOTE(review): confirm that no later notebook relies on that extra column.
df_prods_clean_no_dups.to_csv(
    os.path.join(path_deps_clean, 'products_checked.csv'),
    index=False,
)

# 04. Data Consistency Check for 'df_ords' DataFrame

In [139]:
# Summary statistics (count, mean, std, min, quartiles, max) for 'df_ords', used as a
# first sanity check on the value ranges of each column
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


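### After investigating the output of df_ords.describe(), I can't see any apparent issue with the value ranges. The only thing that stands out is that 'days_since_prior_order' has a lower count (3,214,874) than the other columns (3,421,083), which points to missing values; these are examined further below.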

In [None]:
# Check for mixed-type data in your df_ords dataframe.

In [140]:
# Print the name of every column in df_ords whose values do not all share a single
# Python type (i.e. a mixed-type column). Nothing is printed when there is none.
# Series.map(type) replaces DataFrame.applymap, which is deprecated since pandas 2.1.
# Counting the distinct types also works on a dataframe with zero rows, where the
# previous comparison against the first row (iloc[0]) raised an IndexError.
for col in df_ords.columns:
    if df_ords[col].map(type).nunique() > 1:
        print(col)

### The loop above prints the name of every column that contains mixed-type data. Since it printed nothing, the dataframe has no mixed-type columns.

In [None]:
#Run a check for missing values in your df_ords dataframe.
#In a markdown cell, report your findings and propose an explanation for any missing values you find.

In [141]:
# Count the missing values in each column of 'df_ords'
df_ords.isna().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
eval_set                       0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

### The missing-values check above found 206209 missing values in the 'days_since_prior_order' column.
### My assumption is that, for these rows, there is simply no information on how many days went by since the customer's previous order (for example, because it is the customer's first order).

In [None]:
# Address the missing values using an appropriate method.
# In a markdown cell, explain why you used your method of choice.

In [142]:
# First step: build a subset of 'df_ords' containing only the rows where
# 'days_since_prior_order' is missing
df_nan_ords = df_ords.loc[df_ords['days_since_prior_order'].isna()]

In [94]:
df_nan_ords

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,prior,1,2,8,
11,11,2168274,2,prior,1,2,11,
26,26,1374495,3,prior,1,1,14,
39,39,3343014,4,prior,1,6,11,
45,45,2717275,5,prior,1,3,12,
50,50,2086598,6,prior,1,5,18,
54,54,2565571,7,prior,1,3,9,
75,75,600894,8,prior,1,6,0,
79,79,280530,9,prior,1,1,17,
83,83,1224907,10,prior,1,2,14,


In [143]:
# The code below gives the number of rows and columns of the 'df_ords' dataframe, which provides
# an overall view and a point of comparison with the 'df_nan_ords' subset created above
df_ords.shape

(3421083, 8)

In [144]:
# Rather than imputing the missing values, flag them: 'ordered_today' is True where
# 'days_since_prior_order' is missing and False everywhere else.
# NOTE(review): the column name suggests a same-day order, but the flag only records
# that 'days_since_prior_order' is missing - confirm the name conveys the intended meaning.
df_ords['ordered_today'] = df_ords['days_since_prior_order'].isna()

In [145]:
df_ords.head()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,ordered_today
0,0,2539329,1,prior,1,2,8,,True
1,1,2398795,1,prior,2,3,7,15.0,False
2,2,473747,1,prior,3,3,12,21.0,False
3,3,2254736,1,prior,4,4,7,29.0,False
4,4,431534,1,prior,5,4,15,28.0,False


In [146]:
df_ords.isnull().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
eval_set                       0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
ordered_today                  0
dtype: int64

In [147]:
df_ords.shape

(3421083, 9)

### In this particular context, simply replacing the missing values with the median or mean would distort the data. So, instead of replacing the NaN values, we create a new column called 'ordered_today' that is True when 'days_since_prior_order' is NaN and False for any other value in that column.

In [None]:
# Run a check for duplicate values in your df_ords data.
# In a markdown cell, report your findings and propose an explanation for any duplicate values you find.

In [148]:
# Rows of 'df_ords' that are full duplicates of an earlier row
df_dups_ords = df_ords.loc[df_ords.duplicated(keep='first')]

In [149]:
df_dups_ords

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,ordered_today


### After running the duplicate check on my dataframe, the result shows only the column headers, which means that the dataframe doesn't have any full duplicates. If any duplicates had been present, the proper way to address them would have been to remove them using the 'df.drop_duplicates()' function.

In [None]:
# Export your final, cleaned df_prods and df_ords data as “.csv” files in your “Prepared Data” folder 
# and give them appropriate, succinct names.

In [None]:
# The code below shows the steps for exporting the cleaned 'df_prods' and 'df_ords' dataframes to CSV files

In [150]:
# 'Prepared Data' folder that receives the cleaned products export.
# NOTE(review): this is a machine-specific absolute path; adjust it when running elsewhere.
path_deps_clean = (
    '/Users/robertochidiac/Desktop/Instacart Basket Analysis'
    '/Data/Prepared Data'
)

In [151]:
# Export the cleaned products to 'products_checked.csv'.
# index=False keeps the row index out of the file; otherwise it is written as an extra
# unnamed column that shows up as 'Unnamed: 0' when the file is read back.
# NOTE(review): confirm that no later notebook relies on that extra column.
df_prods_clean_no_dups.to_csv(
    os.path.join(path_deps_clean, 'products_checked.csv'),
    index=False,
)

In [152]:
# 'Prepared Data' folder that receives the checked orders export.
# NOTE(review): this is a machine-specific absolute path; adjust it when running elsewhere.
path_ords_clean = (
    '/Users/robertochidiac/Desktop/Instacart Basket Analysis'
    '/Data/Prepared Data'
)

In [154]:
# Export the checked orders to 'orders_checked.csv'.
# index=False keeps the row index out of the file; writing it is what produced the
# 'Unnamed: 0' column already present in df_ords, and doing so again would add a
# second redundant column on the next read.
# NOTE(review): confirm that no later notebook relies on that extra column.
df_ords.to_csv(
    os.path.join(path_ords_clean, 'orders_checked.csv'),
    index=False,
)