## Amazon Delivery Time Prediction - Data Cleaning

In [209]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

### Clean the dataset by handling missing values and duplicates

In [303]:
# Load the prepared dataset that this notebook cleans
df = pd.read_csv(filepath_or_buffer='prepared_data.csv')

In [305]:
# Drop exact duplicate rows and report how many were discarded
initial_rows = df.shape[0]
df = df.drop_duplicates()
removed_rows = initial_rows - df.shape[0]
print(f"Removed {removed_rows} duplicate rows")

Removed 0 duplicate rows


In [307]:
# Count missing values per column before any imputation
df.isna().sum()

Order_ID            0
Agent_Age           0
Agent_Rating       54
Store_Latitude      0
Store_Longitude     0
Drop_Latitude       0
Drop_Longitude      0
Order_Date          0
Order_Time          0
Pickup_Time         0
Weather            91
Traffic             0
Vehicle             0
Area                0
Delivery_Time       0
Category            0
Distance            0
Order_Hour          0
Order_Minute        0
Pickup_Hour         0
Pickup_Minute       0
Order_Day           0
Order_Month         0
dtype: int64

In [309]:
# Summarize each column's dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43739 entries, 0 to 43738
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Order_ID         43739 non-null  object 
 1   Agent_Age        43739 non-null  int64  
 2   Agent_Rating     43685 non-null  float64
 3   Store_Latitude   43739 non-null  float64
 4   Store_Longitude  43739 non-null  float64
 5   Drop_Latitude    43739 non-null  float64
 6   Drop_Longitude   43739 non-null  float64
 7   Order_Date       43739 non-null  object 
 8   Order_Time       43739 non-null  object 
 9   Pickup_Time      43739 non-null  object 
 10  Weather          43648 non-null  object 
 11  Traffic          43739 non-null  object 
 12  Vehicle          43739 non-null  object 
 13  Area             43739 non-null  object 
 14  Delivery_Time    43739 non-null  int64  
 15  Category         43739 non-null  object 
 16  Distance         43739 non-null  float64
 17  Order_Hour  

In [311]:
# Impute missing 'Agent_Rating' and 'Weather' values with each column's
# most frequent value (mode)
for column in ('Agent_Rating', 'Weather'):
    most_frequent = df[column].mode()[0]
    df[column] = df[column].fillna(most_frequent)

In [313]:
# Treat the literal string "NaT" in 'Order_Time' as a missing value (pd.NA)
df['Order_Time'] = df['Order_Time'].replace(to_replace="NaT", value=pd.NA)

In [315]:
# Treat the literal string "NaN " in 'Traffic' as a missing value (pd.NA).
# The trailing space is intentional: the 'Traffic' labels in this dataset
# carry a trailing space (e.g. 'High ', 'Jam ').
df['Traffic'] = df['Traffic'].replace(to_replace="NaN ", value=pd.NA)

In [317]:
# Impute missing values in text (object-dtype) columns with each column's mode.
# Columns without missing values are skipped, so the mode of high-cardinality
# columns such as 'Order_ID' is never computed needlessly.
categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
    if not df[col].isna().any():
        continue
    col_modes = df[col].mode()
    if col_modes.empty:
        # Every value is missing, so there is no mode to impute with;
        # indexing the empty result would raise instead of cleaning.
        print(f"Skipping '{col}': no non-missing values to compute a mode from")
        continue
    df[col] = df[col].fillna(col_modes.iloc[0])

In [319]:
# Re-count missing values per column to confirm the imputation left no gaps
df.isna().sum()

Order_ID           0
Agent_Age          0
Agent_Rating       0
Store_Latitude     0
Store_Longitude    0
Drop_Latitude      0
Drop_Longitude     0
Order_Date         0
Order_Time         0
Pickup_Time        0
Weather            0
Traffic            0
Vehicle            0
Area               0
Delivery_Time      0
Category           0
Distance           0
Order_Hour         0
Order_Minute       0
Pickup_Hour        0
Pickup_Minute      0
Order_Day          0
Order_Month        0
dtype: int64

### Standardize categorical variables (label encoding)

In [321]:
# Keep one fitted LabelEncoder per column so the same mapping can be reused
# (e.g. to decode the integer codes later).
encoders = {}

# Categorical columns to encode
categorical_cols = ['Weather', 'Traffic', 'Vehicle', 'Area', 'Category', 'Order_Day']

# NOTE(review): LabelEncoder numbers the labels in sorted order, so the codes
# carry no domain meaning (e.g. Traffic 'High ' < 'Jam ' < 'Low ' < 'Medium ').
# Confirm the downstream model does not treat them as ordinal.
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    df[f'{col}_Encoded'] = encoders[col].fit_transform(df[col])

    # classes_ is sorted and the i-th class is encoded as i, so enumerate()
    # reproduces the mapping without a second transform() call and yields
    # plain Python ints (printed as 0, 1, ... on every NumPy version).
    mapping = {label: code for code, label in enumerate(encoders[col].classes_)}
    print(f"\nEncoding mapping for {col}:")
    print(mapping)


Encoding mapping for Weather:
{'Cloudy': 0, 'Fog': 1, 'Sandstorms': 2, 'Stormy': 3, 'Sunny': 4, 'Windy': 5}

Encoding mapping for Traffic:
{'High ': 0, 'Jam ': 1, 'Low ': 2, 'Medium ': 3}

Encoding mapping for Vehicle:
{'bicycle ': 0, 'motorcycle ': 1, 'scooter ': 2, 'van': 3}

Encoding mapping for Area:
{'Metropolitian ': 0, 'Other': 1, 'Semi-Urban ': 2, 'Urban ': 3}

Encoding mapping for Category:
{'Apparel': 0, 'Books': 1, 'Clothing': 2, 'Cosmetics': 3, 'Electronics': 4, 'Grocery': 5, 'Home': 6, 'Jewelry': 7, 'Kitchen': 8, 'Outdoors': 9, 'Pet Supplies': 10, 'Shoes': 11, 'Skincare': 12, 'Snacks': 13, 'Sports': 14, 'Toys': 15}

Encoding mapping for Order_Day:
{'Friday': 0, 'Monday': 1, 'Saturday': 2, 'Sunday': 3, 'Thursday': 4, 'Tuesday': 5, 'Wednesday': 6}


In [323]:
# Write the cleaned frame to disk, omitting the index column
df.to_csv(path_or_buf='cleaned_data.csv', index=False)
print("\nCleaned data saved to 'cleaned_data.csv'")


Cleaned data saved to 'cleaned_data.csv'
