### This script contains:

#### 1. Import libraries
#### 2. Import dataframes
#### 3. Finding missing values
#### 4. Fixing missing values
#### 5. Duplicates
#### 6. Questions
#### 7. Exporting data

### Importing libraries

In [1]:
#Import libraries 
import pandas as pd
import numpy as np
import os

### Importing Dataframes

In [2]:
# Project root folder, stored as a raw string so the Windows backslashes are not
# treated as escape sequences; the read_csv/to_csv calls in this script join onto it.
path = r'C:\Users\willm\Instacart Basket Analysis'

In [3]:
# Load the raw products table from the 'Original Data' folder
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv')
)

In [4]:
# Load the previously wrangled orders table from the 'Prepared Data' folder
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv')
)

In [5]:
# Preview the first 30 rows of the orders dataframe
df_ords.head(30)

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,1,2,8,
1,1,2398795,1,2,3,7,15.0
2,2,473747,1,3,3,12,21.0
3,3,2254736,1,4,4,7,29.0
4,4,431534,1,5,4,15,28.0
5,5,3367565,1,6,2,7,19.0
6,6,550135,1,7,1,9,20.0
7,7,3108588,1,8,1,14,14.0
8,8,2295261,1,9,1,16,0.0
9,9,2550362,1,10,4,8,30.0


### Finding missing values

In [11]:
# Count the missing values in each column of the products dataframe
df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

### Fixing missing values

In [12]:
# Subset of products whose product_name is missing.
# isnull() already returns a boolean mask, so it is used directly instead of
# being compared to True.
df_nan = df_prods[df_prods['product_name'].isnull()]

In [13]:
# Display the rows that have no product_name
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [14]:
# (rows, columns) of the products dataframe before any rows are removed
df_prods.shape

(49693, 5)

In [15]:
# Keep only the products that have a product_name.
# notna() is the direct complement of isnull() and replaces the
# `isnull() == False` comparison.
df_prods_clean = df_prods[df_prods['product_name'].notna()]

In [16]:
# (rows, columns) after removing the rows with a missing product_name
df_prods_clean.shape

(49677, 5)

### Duplicates

In [17]:
# Collect the rows that exactly repeat an earlier row (the first occurrence is not flagged)
df_dups = df_prods_clean.loc[df_prods_clean.duplicated(keep='first')]

In [18]:
# Display the duplicated rows
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [19]:
# Drop fully duplicated rows, keeping the first occurrence of each
df_prods_clean_no_dups = df_prods_clean.drop_duplicates(keep='first')

In [20]:
# (rows, columns) after dropping the duplicated rows
df_prods_clean_no_dups.shape

(49672, 5)

In [21]:
# Column dtypes and non-null counts of the cleaned, de-duplicated products dataframe
df_prods_clean_no_dups.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49672 entries, 0 to 49692
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     49672 non-null  int64  
 1   product_name   49672 non-null  object 
 2   aisle_id       49672 non-null  int64  
 3   department_id  49672 non-null  int64  
 4   prices         49672 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 2.3+ MB


### Questions

In [28]:
#Are there any missing values in the df_ords dataframe?

In [29]:
# Count the missing values in each column of the orders dataframe
df_ords.isna().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

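#### There are 206,209 orders that show NaN in days_since_prior_order. NaN is different from 0.0, which would mean the order was placed on the same day as the previous one. A NaN means there is no prior order to measure from, so these rows are each customer's first order (for example, user_id 1 has NaN only on order_number 1 and then goes on to place further orders) - not only customers who have ordered just once.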

In [30]:
#How do we mark these specific orders out?

#### Create a new variable that acts as a flag: it shows 'First Order', which the marketing team can filter on for easier targeting.

In [31]:
# Label rows with a missing days_since_prior_order as 'First Order' and all
# other rows as 'Repeat Customer'.
# Series.isna() replaces np.isnan(<series>.values): it gives the same mask for a
# float column but also works for object or pandas nullable dtypes, where
# np.isnan can raise a TypeError.
df_ords['first_order'] = np.where(
    df_ords['days_since_prior_order'].isna(), 'First Order', 'Repeat Customer'
)

In [32]:
# Preview the first 30 rows to check the first_order column
df_ords.head(30)

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order
0,0,2539329,1,1,2,8,,First Order
1,1,2398795,1,2,3,7,15.0,Repeat Customer
2,2,473747,1,3,3,12,21.0,Repeat Customer
3,3,2254736,1,4,4,7,29.0,Repeat Customer
4,4,431534,1,5,4,15,28.0,Repeat Customer
5,5,3367565,1,6,2,7,19.0,Repeat Customer
6,6,550135,1,7,1,9,20.0,Repeat Customer
7,7,3108588,1,8,1,14,14.0,Repeat Customer
8,8,2295261,1,9,1,16,0.0,Repeat Customer
9,9,2550362,1,10,4,8,30.0,Repeat Customer


### Exporting the Dataframes

In [None]:
# Export the cleaned, de-duplicated products dataframe to the 'Prepared Data' folder.
# NOTE(review): to_csv writes the dataframe index by default, which comes back as an
# extra 'Unnamed: 0' column when the file is re-read; consider index=False if
# downstream notebooks do not rely on that column - confirm before changing.
df_prods_clean_no_dups.to_csv(os.path.join(path, '02 Data','Prepared Data', 'products_checked.csv'))

In [37]:
# Export the orders dataframe, including the 'first_order' flag column, to the
# 'Prepared Data' folder.
# NOTE(review): df_ords already carries an 'Unnamed: 0' column (apparently a saved
# index from an earlier export); to_csv writes the index again by default, adding
# one more unnamed column on re-read. Consider index=False - confirm that
# downstream notebooks do not depend on the extra column first.
df_ords.to_csv(os.path.join(path, '02 Data','Prepared Data', 'orders_with_first_order_flag.csv'))