# Data Cleaning and Preparation

## Step 1: Loading the Dataset

In [1]:
#This cell will be used for importing packages needed for any subsequent stages.

import os

import pandas as pd

In [9]:
# Load the raw e-commerce dataset into a DataFrame.
# The location can be overridden with the ECOMMERCE_DATA_PATH environment variable so
# the notebook can run on machines where the CSV lives elsewhere; when the variable is
# not set, the original local path below is used unchanged.
DEFAULT_DATA_PATH = r'C:\Users\GARETH TIROP\Desktop\Data Science and Analytics Docs\Modified_ECommerce_consumer_behaviour_dataset.csv'
DATA_PATH = os.environ.get('ECOMMERCE_DATA_PATH', DEFAULT_DATA_PATH)

df = pd.read_csv(DATA_PATH)

## Step 2: Preview and Basic Information of the Dataset

In [10]:
# pandas abbreviates the display of a frame this large to its first and last rows.
df  # Preview of the dataset

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,department_id,department,product_name
0,2425083,49125,1,2,18,,17,1,0,13,pantry,baking ingredients
1,2425083,49125,1,2,18,,91,2,0,16,dairy eggs,soy lactosefree
2,2425083,49125,1,2,18,,36,3,0,16,dairy eggs,butter
3,2425083,49125,1,2,18,,83,4,0,4,produce,fresh vegetables
4,2425083,49125,1,2,18,,83,5,0,4,produce,fresh vegetables
...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,671875,33287,5,6,15,30.0,69,12,1,15,canned goods,soup broth bouillon
1048571,671875,33287,5,6,15,30.0,91,13,1,16,dairy eggs,soy lactosefree
1048572,671875,33287,5,6,15,30.0,61,14,1,19,snacks,cookies cakes
1048573,671875,33287,5,6,15,30.0,61,15,1,19,snacks,cookies cakes


In [21]:
# Column names, non-null counts, dtypes and memory usage of the frame.
df.info()  # info on datatypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 12 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   order_id                1048575 non-null  int64  
 1   user_id                 1048575 non-null  int64  
 2   order_number            1048575 non-null  int64  
 3   order_dow               1048575 non-null  int64  
 4   order_hour_of_day       1048575 non-null  int64  
 5   days_since_prior_order  983854 non-null   float64
 6   product_id              1048575 non-null  int64  
 7   add_to_cart_order       1048575 non-null  int64  
 8   reordered               1048575 non-null  int64  
 9   department_id           1048575 non-null  int64  
 10  department              1048575 non-null  object 
 11  product_name            1048575 non-null  object 
dtypes: float64(1), int64(9), object(2)
memory usage: 96.0+ MB


In [23]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numerical columns.
df.describe()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,department_id
count,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,983854.0,1048575.0,1048575.0,1048575.0,1048575.0
mean,1704987.0,102995.2,17.19931,2.72871,13.44314,11.431402,71.19057,8.370037,0.5888096,9.928045
std,986758.7,59526.9,17.57635,2.095244,4.237347,8.995322,38.18826,7.134039,0.4920499,6.28869
min,11.0,3.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
25%,849311.0,51394.0,5.0,1.0,10.0,5.0,31.0,3.0,0.0,4.0
50%,1702748.0,102456.0,11.0,3.0,13.0,8.0,83.0,6.0,1.0,9.0
75%,2557490.0,154671.0,24.0,5.0,16.0,15.0,107.0,11.0,1.0,16.0
max,3421074.0,206209.0,100.0,6.0,23.0,30.0,134.0,100.0,1.0,21.0


## Step 3: Understanding Columns 

 
N.B.:
   - The relationship between the columns and the objective of the project is explained in README.md.

   - This stage covers only the cleaning and preparation process.
      
      
The columns :

- order_id - unique identity of order

- user_id -unique identity of user/customer

- order_number - Number of the order

- order_dow - Day of the week the order was made (one of 0, 1, 2, 3, 4, 5, 6)

- order_hour_of_day - Hour of the day order was made

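- days_since_prior_order - Days since the customer's prior order; missing (NaN) in the raw data where there is no prior order (assumed to be new customers) and replaced with 0 in Step 5, otherwise the number of days since their last order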

- product_id - unique ID of product that is part of an order

- add_to_cart_order - Position in which the product was added to the cart within the order (1 for the first product added, 2 for the second, and so on)

- reordered - Whether the re-order took place (binary: 0 or 1)

- department_id - specific department identity that a product is part of

- department - name of department

- product_name - name of product

## Step 4: Checking for duplicates

In [13]:
# One boolean flag per row: True when the row repeats an earlier row, False otherwise.
duplicated_data = df.duplicated()

In [14]:
# Display the flags; for a Series this long pandas prints only the first and last entries.
duplicated_data

0          False
1          False
2          False
3          False
4          False
           ...  
1048570    False
1048571    False
1048572    False
1048573    False
1048574    False
Length: 1048575, dtype: bool

In [16]:
# duplicated() returns False for unique rows and True for duplicated entries.
# The preview of `duplicated_data` shows only the first and last few flags, so it cannot
# prove that every row is unique. Count the True flags to check the whole dataset:
# a count of 0 means there are no duplicate entries.
duplicate_row_count = int(duplicated_data.sum())

print(f"Duplicate rows: {duplicate_row_count} out of {len(duplicated_data)} rows")

## Step 5: Missing Values

In [19]:
# Count the missing (NaN) entries in every column of the frame.
all_missing_values = df.isna().sum()

# Print the heading followed by the per-column counts on the next line.
print("Number of missing values in each column: ", all_missing_values, sep="\n")

Number of missing values in each column: 
order_id                      0
user_id                       0
order_number                  0
order_dow                     0
order_hour_of_day             0
days_since_prior_order    64721
product_id                    0
add_to_cart_order             0
reordered                     0
department_id                 0
department                    0
product_name                  0
dtype: int64


- In the output above, all columns but one have zero missing values.

- The column "days_since_prior_order" has 64721 missing values.

- The column is used to record the number of days since a customer last ordered from the e-commerce store.


- Therefore, it is assumed that the missing (NaN) values should be 0.
- This would mean that the rows with missing values belong to new customers who have just been recorded in the system for the first time.

In [24]:
# Replace NaN with the value 0 in the "days_since_prior_order" column.
# The fill value must be the number 0, not the string '0': filling a float column with
# text turns it into a mixed object column, and the integer cast below would then only
# succeed by parsing that text.
# NOTE(review): 0 is already a real value in this column (describe() reports min 0.0),
# so after this fill, rows that had no prior order cannot be told apart from same-day
# reorders - confirm this is acceptable for the later analysis.
df['days_since_prior_order'] = df['days_since_prior_order'].fillna(0)

# The column is stored as float but represents whole days, so cast it to integers.
df['days_since_prior_order'] = df['days_since_prior_order'].astype(int)

In [25]:
# Preview the frame after the missing values were filled and the column cast to integers.
df

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,department_id,department,product_name
0,2425083,49125,1,2,18,0,17,1,0,13,pantry,baking ingredients
1,2425083,49125,1,2,18,0,91,2,0,16,dairy eggs,soy lactosefree
2,2425083,49125,1,2,18,0,36,3,0,16,dairy eggs,butter
3,2425083,49125,1,2,18,0,83,4,0,4,produce,fresh vegetables
4,2425083,49125,1,2,18,0,83,5,0,4,produce,fresh vegetables
...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,671875,33287,5,6,15,30,69,12,1,15,canned goods,soup broth bouillon
1048571,671875,33287,5,6,15,30,91,13,1,16,dairy eggs,soy lactosefree
1048572,671875,33287,5,6,15,30,61,14,1,19,snacks,cookies cakes
1048573,671875,33287,5,6,15,30,61,15,1,19,snacks,cookies cakes
