# This script contains the following points:

#### 01. Import libraries
#### 02. Import data 
#### 03. Data Consistency Checks

## 01. Importing libraries

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import os

## 02. Importing data sets (products from Original Data and orders_wrangled from Prepared Data)

In [3]:
# Project root folder. Set the INSTACART_PROJECT_PATH environment variable to run this
# notebook on another machine without editing the cell; when it is unset, the original
# location is used.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'C:\Users\nang6\OneDrive\Bureau\Data Analytics\Data Immersion\Achievement 4\02-2020 Instacart Basket Analysis',
)

In [4]:
# Load the products data set from the 'Original Data' folder of the project
products_file = os.path.join(path, '02 Data', 'Original Data', 'products.csv')
df_prods = pd.read_csv(products_file, index_col = False)

In [5]:
# Load the wrangled orders data set from the 'Prepared Data' folder of the project
# NOTE(review): the describe() output below lists an 'Unnamed: 0' column running from 0 to
# 3421082, which looks like a row index saved into the CSV - confirm whether it should be dropped.
orders_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv')
df_ords = pd.read_csv(orders_file, index_col = False)

## 03. Data Consistency Checks

### 3.1 Describe( ) function

In [29]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns of the orders data;
# a count lower than the others (days_since_prior_order) points to missing values
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


### 3.2 Mixed-Type Data (includes both string values and numeric values)

In [7]:
# Create an empty dataframe that will hold a test column for the mixed-type demonstration
df_test = pd.DataFrame()

In [8]:
# Add a column that mixes strings, an integer and a boolean
mixed_values = ['a', 'b', 1, True]
df_test['mix'] = mixed_values

In [9]:
# Display the first rows of the test dataframe to confirm the mixed column was created
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [27]:
# Check for mixed-type columns
def find_mixed_type_columns(df):
    """Return the names of the columns of `df` that hold more than one Python type.

    A column counts as mixed when at least one of its values has a different
    type than the value in its first row. An empty dataframe yields an empty list.
    """
    mixed_columns = []
    for col in df.columns.tolist():
        # Series.map(type) replaces DataFrame.applymap, which is deprecated since pandas 2.1
        value_types = df[col].map(type)
        # Guard against zero rows: there is no first value to compare against
        if len(value_types) > 0 and (value_types != value_types.iloc[0]).any():
            mixed_columns.append(col)
    return mixed_columns


for col in find_mixed_type_columns(df_test):
    print (col)

In [11]:
# Fix the mixed column by converting every value to a string
mix_as_text = df_test['mix'].astype(str)
df_test['mix'] = mix_as_text

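### How to fix it with a number type - can be int64 or whichever numeric data type we want to use (this only works when every value in the column can be read as a number; the 'mix' column above contains 'a' and 'b', so this conversion would raise a ValueError)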
df_test['mix'] = df_test['mix'].astype('int64')

### 3.3 Missing Values

#### 3.3.1 Finding Missing Values

In [28]:
# Count the missing values per column: isna() flags each missing cell as True,
# and sum() adds them up because True counts as 1 and False as 0.
missing_flags = df_prods.isna()
missing_flags.sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [13]:
# Collect the 16 rows with a missing product_name in their own dataframe so they can be inspected
missing_name_mask = df_prods['product_name'].isna()
df_nan = df_prods[missing_name_mask]

In [14]:
# Display the rows whose product_name is missing
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


#### 3.3.2 Addressing Missing Values

There are a few ways to deal with missing data:

1) Create a new variable that acts like a flag based on the missing value.

2) Impute the value with the mean or median of the column (if the variable is numeric).

3) Remove or filter out the missing data.

1 - See example in course

2 - Imputing data:

    by using the mean: df['column with missings'].fillna(mean value, inplace=True)
    
    by using the median: df['column with missings'].fillna(median value, inplace=True)

In [15]:
# Record the (rows, columns) of the current dataframe so it can be compared with the
# subset obtained once the rows with missing values have been removed
df_prods.shape

(49693, 5)

In [16]:
# Keep only the rows that have a product_name: the missing names are text values,
# so they cannot be imputed with a mean or median and are filtered out instead
has_name_mask = df_prods['product_name'].notna()
df_prods_clean = df_prods[has_name_mask]

In [17]:
# Row count should be 16 lower than df_prods (the rows with a missing product_name)
df_prods_clean.shape

(49677, 5)

3 - Dropping all missing values:

    df_prods.dropna(inplace = True)
    
    or dropping only the NaNs from a particular column:
    
    df_prods.dropna(subset = ['product_name'], inplace = True)


### 3.4 Duplicates

#### 3.4.1 Finding Duplicates

In [18]:
# Rows that repeat an earlier row in every column are flagged as duplicates
duplicate_mask = df_prods_clean.duplicated()
df_dups = df_prods_clean[duplicate_mask]

In [19]:
# Display the duplicated rows that were found
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


#### 3.4.2 Addressing Duplicates

In [20]:
# First, check the number of rows in the current dataframe so it can be compared
# with the number of rows left after the duplicates are removed
df_prods_clean.shape

(49677, 5)

In [21]:
# Build a new dataframe without the duplicates identified above;
# the first occurrence of each repeated row is the one that is kept
df_prods_clean_no_dups = df_prods_clean.drop_duplicates(keep = 'first')

In [22]:
# Row count should be 5 lower than df_prods_clean (the duplicated rows found above)
df_prods_clean_no_dups.shape

(49672, 5)