# Data consistency check
# Contents

## 1. Mixed-type data
## 2. Data prep products
### 2.1 Missing values
### 2.2 Duplicates
### 2.3 Exporting changes
## 3. Data prep orders
### 3.1 Check for mixed-type data in df_ords dataframe
### 3.2 Check for missing values in df_ords dataframe
### 3.3 Run duplicate check
### 3.4 Exporting changes
## 4. Additional data cleaning from 4.9

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Project folder path as string
# NOTE(review): hard-coded absolute path on the author's machine. The literal contains a space
# after 'CareerFoundry' and the spelling 'Phython' - presumably these match the real folder
# names on disk, so they are left untouched. Adjust this path before running elsewhere.
path = r'/Users/sophie/Desktop/CareerFoundry /09 2023 Phython'

In [3]:
path

'/Users/sophie/Desktop/CareerFoundry /09 2023 Phython'

In [5]:
# Load the two inputs from the project folder: the products table from 'original data '
# and the orders table from 'prepared data '.
# index_col=False tells pandas not to use the first CSV column as the row index.
df_prods = pd.read_csv(
    os.path.join(path, 'Data', 'original data ', 'products.csv'),
    index_col=False,
)
df_ords = pd.read_csv(
    os.path.join(path, 'Data', 'prepared data ', 'orders_wrangled.csv'),
    index_col=False,
)

In [6]:
# Print first rows from products
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [7]:
# Print first rows from orders
df_ords.head()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,prior,1,2,8,
1,1,2398795,1,prior,2,3,7,15.0
2,2,473747,1,prior,3,3,12,21.0
3,3,2254736,1,prior,4,4,7,29.0
4,4,431534,1,prior,5,4,15,28.0


# 1. Mixed-type data

In [11]:
# Build a small example data frame whose single column 'mix' holds strings, an int and a bool
df_test = pd.DataFrame({'mix': ['a', 'b', 1, True]})

In [12]:
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [14]:
# Checking whether data frame contains any mixed-type columns
for col in df_test.columns:
    # map(type) replaces every value by its Python type; nunique() counts how many
    # different types occur in the column. More than one type means the column is mixed.
    # Series.map is used because DataFrame.applymap is deprecated since pandas 2.1.
    if df_test[col].map(type).nunique() > 1:
        print(col)

# The if statement is true when a column holds values of more than one type.
# In that case print(col) is executed, which prints the name of the problematic column for you to see.

mix


In [15]:
# How to handle mixed data types
# 1. Decide which single type the column should have, then cast the whole column to it

df_test['mix'] = df_test['mix'].astype('str')

# To convert from string to numeric instead: replace 'str' within the astype() function with 'int64' (for example)


# 2. Data Prep products.csv

## 2.1 Missing values

In [7]:
# Find missing values
df_prods.isnull().sum()

# Assign the function isnull() to the df_prods dataframe, then sum the result with the attached sum() function
# isnull() function is used to find missing observations
# isnull() function by itself would return a value of True or False 
# You need to know how many total missing observations there are, which is where the sum() function comes in
# True values can also be interpreted numerically as 1, and False values can also be interpreted numerically as 0. 
# If every missing observation is equal to 1, then you can simply add them up using the sum() function to obtain the total number of missing observations

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [8]:
# Collect the rows with a missing product_name in their own dataframe
# (isna() already returns a boolean mask, so it can be used for row selection directly)
df_nan = df_prods.loc[df_prods['product_name'].isna()]
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [None]:
# Addressing Missing Values

# Create a new variable that acts like a flag based on the missing value. OR
# Impute the value with the mean or median of the column (if the variable is numeric). OR
# Remove or filter out the missing data.

In [1]:
# Imputing

# Finding the mean
# df.describe()
# Imputing the mean
# df['column with missings'].fillna(mean value, inplace=True)

# Median
# df_prods.median()
# df_prods['product_name'].median()
# df['column with missings'].fillna(median value, inplace=True)

# Linear Interpolation:
# finding the mean of the rows before the missing value occurs,
# finding the mean of the rows after the missing value occurs, 
# and estimating where the missing value should fall between those two means

In [9]:
# Compare rows of original and subset dataframe
df_prods.shape

(49693, 5)

In [10]:
# Subset w/o missings: keep only the rows where product_name is present
df_prods_clean = df_prods[df_prods['product_name'].notnull()]
df_prods_clean.shape

# Alternative: drop every row that has a missing value in any column:
# df_prods.dropna(inplace = True)
# Drop rows with a missing value in one particular column only:
# df_prods.dropna(subset = ['product_name'], inplace = True)

# inplace = True modifies df_prods itself (and returns None), so the rows with missing values are gone from it
# The default, inplace = False, leaves df_prods untouched and returns a new dataframe
# Best practice: assign the result to a new data frame, df_prods_clean

(49677, 5)

In [39]:
df_prods_clean = df_prods.dropna(subset = ['product_name'])
df_prods_clean.shape

In [10]:
df_prods_clean = df_prods.dropna(subset=['product_name']).copy()
df_prods_clean.shape

(49677, 5)

## 2.2 Duplicates

In [11]:
# Best practice: make a record of these occurrences, export examples, 
# and send them to your client for further explanation

# Look for full duplicates
df_dups = df_prods_clean[df_prods_clean.duplicated()]

# Creates a new subset of df_prods_clean—df_dups—containing only rows that are duplicates

In [12]:
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [13]:
# Addressing Duplicates
#drop_duplicates() 

# Check current number of rows
df_prods_clean.shape

(49677, 5)

In [14]:
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()
df_prods_clean_no_dups.shape

(49672, 5)

## 2.3 Exporting changes

In [15]:
# Export the cleaned products table to the 'prepared data ' folder.
# NOTE(review): to_csv writes the row index as an unnamed first column by default; when the file
# is read back without index_col=0 it shows up as an 'Unnamed: 0' column, like the one that has to
# be dropped from df_ords further down. Consider index=False - TODO confirm that no downstream
# notebook expects that column.
df_prods_clean_no_dups.to_csv(os.path.join(path, 'Data', 'prepared data ', 'products_checked.csv')) 


# 3. Data Prep orders.csv

In [16]:
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [17]:
# drop "Unnamed" column

df_ords_new = df_ords.drop(columns = ['Unnamed: 0'])
df_ords_new.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [19]:
# Cast order_id, user_id and order_number to strings so that describe() no longer
# summarises them as if they were numeric measurements
# NOTE(review): as strings, these columns compare and sort lexicographically, not numerically

df_ords_new = df_ords_new.astype({'order_id': str, 'user_id': str, 'order_number': str})
df_ords_new[['order_id', 'user_id', 'order_number']].dtypes

order_id        object
user_id         object
order_number    object
dtype: object

In [20]:
# Investigating data:
df_ords_new.describe()

Unnamed: 0,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3214874.0
mean,2.776219,13.45202,11.11484
std,2.046829,4.226088,9.206737
min,0.0,0.0,0.0
25%,1.0,10.0,4.0
50%,3.0,13.0,7.0
75%,5.0,16.0,15.0
max,6.0,23.0,30.0


In [34]:
# orders_day_of_week looks accurate and consistent: no negative values; min & max (0 and 6) are within the expected range of 7 days; mean and 50% percentile are at the expected level around 3
# order_hour_of_day looks consistent: min & max (0 and 23) are within the expected range of 0 to 23 hours; the mean (13.45) and the percentiles (10, 13, 16) are plausible. (A max of 2.3 would be a misreading of 23, presumably from a display in scientific notation such as 2.3e+01.)
# days_since_prior_order looks consistent: the percentiles increase as expected (4 < 7 < 15) and all lie below the max of 30. The lower count (3,214,874 vs. 3,421,083) indicates missing values, which are checked in section 3.2

## 3.1 Check for mixed-type data in df_ords dataframe

In [24]:
# Check every column of df_ords_new for mixed data types
for col in df_ords_new.columns:
    # map(type) replaces every value by its Python type; nunique() counts how many
    # different types occur in the column. More than one type means the column is mixed.
    # Series.map is used because DataFrame.applymap is deprecated since pandas 2.1.
    if df_ords_new[col].map(type).nunique() > 1:
        print(col)

In [None]:
# No output indicating mixed columns.

## 3.2 Run check for missing values in df_ords dataframe

In [21]:
# Find missing values
df_ords_new.isnull().sum()

order_id                       0
user_id                        0
eval_set                       0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [22]:
# Rows of df_ords_new where days_since_prior_order is missing
# (isna() already returns a boolean mask, so it can be used for row selection directly)
df_ords_nan = df_ords_new.loc[df_ords_new['days_since_prior_order'].isna()]
df_ords_nan

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
11,2168274,2,prior,1,2,11,
26,1374495,3,prior,1,1,14,
39,3343014,4,prior,1,6,11,
45,2717275,5,prior,1,3,12,
...,...,...,...,...,...,...,...
3420930,969311,206205,prior,1,4,12,
3420934,3189322,206206,prior,1,3,18,
3421002,2166133,206207,prior,1,6,19,
3421019,2227043,206208,prior,1,1,15,


In [23]:
df_ords_nan.order_number.max()

'1'

In [None]:
# Order number is '1' in all of these cases: the value is missing because this is the customer's first order, so there is no prior order to count the days from.
# Hence, these are not true missings but meaningful, informative values.
# They should be flagged as they are (or maybe renamed to "first_order"), and this should be noted down in the data dictionary.

In [None]:
# Rename NaN's
# NOT NECESSARY

#df_ords_rename_nan = df_ords_new.fillna("first_order", inplace=False)
#print(df_ords_rename_nan)

In [None]:
# Check renaming

# Define the target value I want to filter for
#target_value = 'first_order'

# Use boolean indexing to filter rows with the target value in any column
#filtered_df = df_ords_rename_nan[df_ords_rename_nan.eq(target_value).any(axis=1)]

# Print the filtered DataFrame
#print(filtered_df)

In [30]:
# Do nothing with the "missings" themselves because they stand for "new customer" (first order)
# Create a Boolean variable 'new_customer' for them, because this is useful information

# .copy() gives df_ords_clean its own data. A plain assignment (df_ords_clean = df_ords_new)
# would only create a second name for the same object, so adding the column below would
# silently change df_ords_new as well.
df_ords_clean = df_ords_new.copy()
df_ords_clean['new_customer'] = df_ords_clean['days_since_prior_order'].isnull()
df_ords_clean

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,new_customer
0,2539329,1,prior,1,2,8,,True
1,2398795,1,prior,2,3,7,15.0,False
2,473747,1,prior,3,3,12,21.0,False
3,2254736,1,prior,4,4,7,29.0,False
4,431534,1,prior,5,4,15,28.0,False
...,...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29.0,False
3421079,1854736,206209,prior,11,4,10,30.0,False
3421080,626363,206209,prior,12,1,12,18.0,False
3421081,2977660,206209,prior,13,1,12,7.0,False


## 3.3 Run duplicate check 

In [29]:
# Look for full duplicates
df_ords_dups = df_ords_clean[df_ords_clean.duplicated()]

In [26]:
df_ords_dups

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,new_customer


In [39]:
# Checking again in original data

df_ords_dups = df_ords[df_ords.duplicated()]
df_ords_dups

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


In [None]:
# There seem to be no duplicates. 

## 3.4 Exporting changes 

In [31]:
# Export the checked orders table to the 'prepared data ' folder.
# NOTE(review): to_csv writes the row index as an unnamed first column by default; when the file
# is read back without index_col=0 it shows up as an 'Unnamed: 0' column, like the one that had to
# be dropped from df_ords earlier in this notebook. Consider index=False - TODO confirm that no
# downstream notebook expects that column.
df_ords_clean.to_csv(os.path.join(path, 'Data', 'prepared data ', 'orders_checked.csv')) 

# 4. Additional data cleaning from 4.9 on 'prices' 
## After noticing suspicious values in histogram

# Largest value in the 'prices' column
# NOTE(review): ords_prods_merge is not created or loaded anywhere in this notebook as shown -
# presumably it comes from the 4.9 notebook; it has to be loaded before this section can run.
ords_prods_merge['prices'].max()

In [None]:
# Check variable distribution: plotting 'prices' against itself puts every value on the
# diagonal, so extreme values stand out at the far end of the line.
# The former print(sns.scatterplot) was removed: it only printed the function object's repr,
# not the plot. As the last expression of the cell, the plot is displayed by the notebook itself.
# NOTE(review): `sns` is not imported in the visible part of this notebook - presumably
# `import seaborn as sns` is required before this cell.
sns.scatterplot(x = 'prices', y = 'prices', data = ords_prods_merge)

In [None]:
# Looks suspicious, what about 20.000 value? Run more checks
# Outliers?

ords_prods_merge.loc[ords_prods_merge['prices'] > 100]

In [None]:
# Mark suspicious outliers as NaNs
ords_prods_merge.loc[ords_prods_merge['prices'] >100, 'prices'] = np.nan

In [None]:
# Check if successful
ords_prods_merge['prices'].max()