 # # Script cleaning and checking consistency of Orders and Products dataframes

### 1. Importing Libraries
### 2. Importing Data
### 3. Data Consistency Checks
		- Mixed Data Types
		- Missing Values
		- Trends and Outliers
		- Duplicates
### 4. Exporting Data

## 1. Importing Libraries

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import os

## 2. Importing Data

In [2]:
# Root folder of the Instacart project; every dataset path below is built from it
path = r'C:\Users\seank\OneDrive\Dokumente\Career Foundry Data Analytics Course\Data Immersion\4 Python\03-2020_Instacart_Basket _Analysis'

# Orders: the already-wrangled file in the Prepared_Data folder
orders_file = os.path.join(path, '02_Data', 'Prepared_Data', 'orders_wrangled.csv')
df_ords = pd.read_csv(orders_file, index_col=False)

# Products: the original file in the Original_Data folder
products_file = os.path.join(path, '02_Data', 'Original_Data', 'products.csv')
df_prods = pd.read_csv(products_file, index_col=False)

In [3]:
# Preview the first rows of the orders dataframe to confirm it loaded as expected
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


## 3. Data Consistency Checks

# Mixed Data Types

In [None]:
# Flag every Orders column whose values are stored as more than one Python type
for col in df_ords.columns:
    types_found = df_ords[col].apply(type)
    if types_found.nunique() > 1:
        print(f"Column '{col}' has mixed data types.")

No columns with mixed data types were found in the Orders dataframe.

# Searching Out Missing Values

In [6]:
    # Products
# isna() marks each missing cell as True; summing those flags counts the missings per column
df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [7]:
# Keep only the product rows whose name is missing so they can be inspected
missing_name_mask = df_prods['product_name'].isnull()
df_prods_nan = df_prods[missing_name_mask]
df_prods_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [8]:
# Creating subset of data without the missings
# Record the (rows, columns) of the full products data first, as a baseline
# for verifying the row count after the missing names are removed
df_prods.shape    # first check shape of data


(49693, 5)

In [9]:
# Create subset with the missing product names excluded.
# .copy() makes df_prods_clean an independent dataframe rather than a slice of
# df_prods, so the price corrections assigned through .loc later in this script
# modify it directly and cannot raise pandas' SettingWithCopyWarning.
df_prods_clean = df_prods[df_prods['product_name'].notna()].copy()
df_prods_clean.shape     # check shape of this new df to make sure only 16 rows fewer

(49677, 5)

In [10]:
    # Now the Orders df
# Count the missing values in each Orders column
df_ords.isna().sum()

order_id                       0
user_id                        0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

    days_since_prior_order has 206209 missing values.
    This isn't an issue in cases where it's somebody's first order.

In [11]:
# Are the missings all first orders? Isolate the rows that have no
# days_since_prior_order value and summarise them
no_prior_mask = df_ords['days_since_prior_order'].isnull()
df_ords_nan = df_ords[no_prior_mask]
df_ords_nan.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,206209.0,206209.0,206209.0,206209.0,206209.0,0.0
mean,1708462.0,103105.0,1.0,2.754118,13.626597,
std,988129.9,59527.555167,0.0,2.076205,4.223769,
min,20.0,1.0,1.0,0.0,0.0,
25%,850730.0,51553.0,1.0,1.0,11.0,
50%,1706246.0,103105.0,1.0,3.0,14.0,
75%,2564292.0,154657.0,1.0,5.0,17.0,
max,3421081.0,206209.0,1.0,6.0,23.0,


The min and max of the order_number column are both 1, so these are not true missing values but valuable information: they represent people's first orders. So we definitely shouldn't delete them.
Imputing values wouldn't make much sense either.
I could create a new binary variable to flag these orders as first orders for future analyses, but order_number == 1 already identifies them, so that is unnecessary.
I will do nothing with these rows and just stay aware of them at the analysis stage.

## Looking for trends and outliers

In [12]:
# Looking for other trends or outliers in Orders df
# describe() gives count, mean, std, min, quartiles and max for each numeric column,
# which makes implausible minimum/maximum values easy to spot
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


The days of the week are coded as 0-6 rather than 1-7. Similarly, the hour of day is coded as 0-23 rather than 1-24.
We can also see that order_id is the unique identifier and that order_number only goes up to 100.

In [13]:
# Now checking for trends in products df
# Summary statistics of the products that still have a name; the min/max rows
# are where miscoded values would show up
df_prods_clean.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49677.0,49677.0,49677.0,49677.0
mean,24850.194235,67.76311,11.728687,9.993164
std,14340.588602,38.316396,5.850651,453.592708
min,1.0,1.0,1.0,1.0
25%,12433.0,35.0,7.0,4.1
50%,24851.0,69.0,13.0,7.1
75%,37267.0,100.0,17.0,11.1
max,49688.0,134.0,21.0,99999.0


In [None]:
# A maximum price of 99999 looks wrong, so pull out every product priced above 100
overpriced_mask = df_prods_clean['prices'] > 100
pricecheck = df_prods_clean[overpriced_mask]
pricecheck.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
21554,21553,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0
33666,33664,2 % Reduced Fat Milk,84,16,99999.0


In [15]:
# Only these two items have clearly been miscoded.
# Since I can't contact the company, I will replace their prices with the prices of similar items

In [None]:
# Collect every product whose name mentions 'cottage cheese' or 'milk' (any letter case)
name_mentions_dairy = df_prods_clean['product_name'].str.contains(
    'cottage cheese|milk', case=False, regex=True, na=False
)
milk_cheese_df = df_prods_clean[name_mentions_dairy]

# Show the matches
milk_cheese_df

In [17]:
# It's clear that a lot of items have 'Milk' in their name without actually being milk, like chocolate items.
# This rules out taking the average or median of all 'milk' items.
# Doing a more specific search: 'milk' as the first word, and 2% milk only

In [18]:
# Narrow the dairy matches to names that start with 'milk' (any letter case)
# and also contain the literal text '2%'
starts_with_milk = milk_cheese_df['product_name'].str.contains(
    '^milk', case=False, regex=True, na=False
)
mentions_two_percent = milk_cheese_df['product_name'].str.contains(
    '2%', case=True, regex=True, na=False
)
df_2milk = milk_cheese_df[starts_with_milk & mentions_two_percent]

In [19]:
# Show up to the first 20 products that matched the 2% milk search
df_2milk.head(20)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2962,2962,"Milk, Reduced Fat, 2% Milkfat",84,16,2.7
4864,4864,"Milk, 2% Reduced Fat, Chocolate",84,16,3.5


In [20]:
# Average price of the 2% milk products in df_2milk, used as the replacement
# price for the miscoded milk item. It is computed from the dataframe rather
# than from hand-typed prices so it always reflects the actual search results.
milk2avg = df_2milk['prices'].mean()

In [21]:
# Overwrite the price of the miscoded milk (product_id 33664) with the milk average
is_miscoded_milk = df_prods_clean['product_id'] == 33664
df_prods_clean.loc[is_miscoded_milk, 'prices'] = milk2avg

In [22]:
# Print the corrected row to confirm the new price was stored
corrected_milk_row = df_prods_clean[df_prods_clean['product_id'] == 33664]
print(corrected_milk_row)

       product_id           product_name  aisle_id  department_id  prices
33666       33664  2 % Reduced Fat  Milk        84             16     3.1


In [23]:
# Now for cottage cheese: gather every product with 'cottage cheese' in its name
is_cottage_cheese = df_prods_clean['product_name'].str.contains(
    'cottage cheese', case=False, regex=True, na=False
)
df_cottagecheese = df_prods_clean[is_cottage_cheese]

# Leave out the miscoded item itself (product_id 21553) so it cannot distort the statistics
not_miscoded = df_cottagecheese['product_id'] != 21553
df_cottagecheese = df_cottagecheese[not_miscoded]

df_cottagecheese.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,70.0,70.0,70.0,70.0
mean,25322.928571,101.114286,16.4,6.757143
std,13856.045083,21.514096,1.988372,3.705639
min,195.0,21.0,6.0,1.1
25%,12037.5,108.0,16.0,3.3
50%,27075.5,108.0,16.0,6.75
75%,36355.5,108.0,16.0,9.4
max,48259.0,120.0,21.0,15.0


In [24]:
# Print the index range, column dtypes and non-null counts of the cottage cheese subset
df_cottagecheese.info()

<class 'pandas.core.frame.DataFrame'>
Index: 70 entries, 194 to 48263
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     70 non-null     int64  
 1   product_name   70 non-null     object 
 2   aisle_id       70 non-null     int64  
 3   department_id  70 non-null     int64  
 4   prices         70 non-null     float64
dtypes: float64(1), int64(3), object(1)
memory usage: 3.3+ KB


There are 70 different items here, so I'll just take the average of their prices and assign it as the replacement value.

In [25]:
# Mean price across the remaining cottage cheese products
cottage_prices = df_cottagecheese['prices']
average_price_cottage = cottage_prices.mean()
print(average_price_cottage)


6.757142857142856


In [26]:
# Replace the miscoded cottage cheese price (product_id 21553) with the average
# cottage cheese price, then print the row to confirm the change
is_miscoded_cheese = df_prods_clean['product_id'] == 21553
df_prods_clean.loc[is_miscoded_cheese, 'prices'] = average_price_cottage
print(df_prods_clean[is_miscoded_cheese])

       product_id                      product_name  aisle_id  department_id  \
21554       21553  Lowfat 2% Milkfat Cottage Cheese       108             16   

         prices  
21554  6.757143  


In [27]:
# Now checking how our changes have affected the main df
# Re-run the summary statistics: the max and std of 'prices' should no longer
# be dominated by the two miscoded values
df_prods_clean.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49677.0,49677.0,49677.0,49677.0
mean,24850.194235,67.76311,11.728687,7.680441
std,14340.588602,38.316396,5.850651,4.199255
min,1.0,1.0,1.0,1.0
25%,12433.0,35.0,7.0,4.1
50%,24851.0,69.0,13.0,7.1
75%,37267.0,100.0,17.0,11.1
max,49688.0,134.0,21.0,25.0


# # Finding and removing duplicates

In [28]:
# Products: collect the rows that are exact copies of an earlier row
is_repeat_row = df_prods_clean.duplicated()
df_prods_dups = df_prods_clean[is_repeat_row]
df_prods_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [29]:
# Removing duplicates
# First record the (rows, columns) of the df, as a baseline for verifying
# how many rows the de-duplication removes
df_prods_clean.shape

(49677, 5)

In [30]:
# Keep only the first occurrence of every row, discarding the exact repeats
df_prods_checked = df_prods_clean[~df_prods_clean.duplicated()]
df_prods_checked.shape    # should be only 5 rows fewer, matching the duplicates found before

(49672, 5)

In [31]:
    # Now for Orders DF
# Collect the Orders rows that are exact copies of an earlier row
repeated_order_rows = df_ords.duplicated()
df_ords_dups = df_ords[repeated_order_rows]
df_ords_dups

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


No duplicates so no action needed

In [32]:
# Renaming df to make clear it's been checked for consistency
# (plain assignment: df_ords_checked is a second name for the same dataframe
# object as df_ords, not an independent copy)
df_ords_checked = df_ords

In [33]:
# Confirm the (rows, columns) of the checked orders dataframe before exporting
df_ords_checked.shape

(3421083, 6)

## 4. Exporting to CSV

In [33]:
# Orders: save the checked dataframe to the Prepared_Data folder, without the index column
orders_out = os.path.join(path, '02_Data', 'Prepared_Data', 'orders_checked.csv')
df_ords_checked.to_csv(orders_out, index=False)

In [34]:
# Products: save the checked dataframe to the Prepared_Data folder, without the index column
products_out = os.path.join(path, '02_Data', 'Prepared_Data', 'products_checked.csv')
df_prods_checked.to_csv(products_out, index=False)