### **Treatment of Missing Values (Imputation):**

In [5]:
import os
import warnings

import haversine as hs
import matplotlib.pyplot as plt
import missingno as msn
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
warnings.filterwarnings("ignore")

In [6]:
def time_diff(row):
    """Return pickup time minus order time for a single row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'Time_Order_picked' and 'Time_Orderd'; each value is
        converted with ``pd.to_timedelta``.

    Returns
    -------
    pandas.Timedelta
        'Time_Order_picked' - 'Time_Orderd'. The result is negative when the
        pickup clock time is earlier than the order clock time, and NaT when
        either value is missing.
    """
    return pd.to_timedelta(row['Time_Order_picked']) - pd.to_timedelta(row['Time_Orderd'])

def haversine_row(row):
    """Haversine distance in kilometres between a row's restaurant and delivery point.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'Restaurant_latitude', 'Restaurant_longitude',
        'Delivery_location_latitude' and 'Delivery_location_longitude'.

    Returns
    -------
    The value returned by ``hs.haversine`` for the two (latitude, longitude)
    pairs, requested in kilometres.
    """
    origin = row['Restaurant_latitude'], row['Restaurant_longitude']
    destination = row['Delivery_location_latitude'], row['Delivery_location_longitude']
    distance_km = hs.haversine(origin, destination, unit=hs.Unit.KILOMETERS)
    return distance_km



def basic_data_cleaning(df):
    """Clean the raw delivery frame and derive the distance / pickup-delay features.

    Steps, in order:
      1. Drop the 'ID' column.
      2. Turn restaurant coordinates equal to 0 into NaN, make the remaining
         ones non-negative, and blank the delivery coordinates of every row
         whose restaurant latitude is missing.
      3. Replace the 'NaN ' / 'conditions NaN' placeholders with real NaN and
         keep only the last word of 'Weatherconditions'.
      4. Parse 'Time_taken(min)' to float and cast age, ratings, order date
         and multiple_deliveries to numeric / datetime dtypes.
      5. Insert 'Distance_res_loc_KM' at column position 7 and
         'Time_res_pickup' (minutes between order and pickup) at position 11.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing every column referenced above.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame. The frame passed in is left unmodified, so
        callers must use the return value.

    Raises
    ------
    KeyError
        If 'ID' or another required column is missing, e.g. when an
        already-cleaned frame is passed in a second time.
    """
    minutes_per_day = 24 * 60

    # Work on an explicit copy: none of the writes below can reach the
    # caller's frame, which keeps the raw data reusable.
    df = df.drop(columns=['ID']).copy()

    # The code treats a coordinate of exactly 0 as missing and forces the
    # remaining restaurant coordinates to be non-negative.
    for coord in ('Restaurant_latitude', 'Restaurant_longitude'):
        df[coord] = df[coord].replace(0, np.nan).abs()

    # Without a restaurant position the delivery position cannot be used to
    # compute a distance, so blank it as well.
    delivery_coords = ["Delivery_location_latitude", "Delivery_location_longitude"]
    df.loc[df['Restaurant_latitude'].isnull(), delivery_coords] = np.nan

    # Missing values are encoded as the literal string 'NaN ' (trailing space).
    placeholder_cols = ["Delivery_person_Age", "Delivery_person_Ratings", "Time_Orderd", "Weatherconditions",
                        "Road_traffic_density", "multiple_deliveries", "Festival", "City"]
    for col in placeholder_cols:
        df[col] = df[col].replace('NaN ', np.nan)
    df['Weatherconditions'] = df['Weatherconditions'].replace("conditions NaN", np.nan)
    df['Weatherconditions'] = df['Weatherconditions'].apply(lambda x: x.split()[-1] if isinstance(x, str) else x)

    # Extract the trailing number instead of slicing the last two characters:
    # slicing silently truncates values with three or more digits and raises
    # on non-string values. Entries without a trailing number become NaN.
    df['Time_taken(min)'] = (
        df['Time_taken(min)']
        .astype(str)
        .str.extract(r'(\d+(?:\.\d+)?)\s*$', expand=False)
        .astype("float")
    )

    # Changing datatypes
    df['Delivery_person_Age'] = df['Delivery_person_Age'].astype("float")
    df['Delivery_person_Ratings'] = df['Delivery_person_Ratings'].astype("float")
    # NOTE(review): the date format is inferred by pandas; if the CSV stores
    # day-first dates, pass dayfirst=True / an explicit format - confirm.
    df['Order_Date'] = pd.to_datetime(df['Order_Date'])
    df['multiple_deliveries'] = df['multiple_deliveries'].astype("float")

    # Haversine distance between restaurant and delivery location, placed
    # right after the four coordinate columns (position 7).
    coord_cols = ['Restaurant_latitude', 'Restaurant_longitude',
                  'Delivery_location_latitude', 'Delivery_location_longitude']
    df.insert(7, "Distance_res_loc_KM", df[coord_cols].apply(haversine_row, axis=1))

    # Minutes between placing the order and the pickup at the restaurant.
    pickup_delay = df[['Time_Order_picked', 'Time_Orderd']].apply(time_diff, axis=1).dt.total_seconds() / 60
    # Only clock times are compared, so a pickup after midnight for an order
    # placed before midnight yields a negative difference. Adding one day
    # fixes every such row (the previous mapping covered only -1425, -1430
    # and -1435 and gives the same 15 / 10 / 5 for those). NaN stays NaN.
    pickup_delay = pickup_delay.mask(pickup_delay < 0, pickup_delay + minutes_per_day)
    df.insert(11, "Time_res_pickup", pickup_delay)

    return df

In [7]:
# Location of the raw training data. Set the SWIGGY_TRAIN_CSV environment
# variable to load the file from another location; the default is the
# original author's local path.
TRAIN_CSV_PATH = os.environ.get(
    "SWIGGY_TRAIN_CSV",
    r"E:\DATA SCIENCE & AI\PROJECTS\Swiggy Delivery Dataset\train.csv",
)
df = pd.read_csv(TRAIN_CSV_PATH)

In [8]:
# Run once per load: the cleaning step drops the 'ID' column, so calling it
# again on the already-cleaned `df` raises KeyError. Re-run the load cell
# first if this cell has to be executed again.
df = basic_data_cleaning(df)

In [9]:
# Inspect dtypes and non-null counts of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45593 entries, 0 to 45592
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Delivery_person_ID           45593 non-null  object        
 1   Delivery_person_Age          43739 non-null  float64       
 2   Delivery_person_Ratings      43685 non-null  float64       
 3   Restaurant_latitude          41953 non-null  float64       
 4   Restaurant_longitude         41953 non-null  float64       
 5   Delivery_location_latitude   41953 non-null  float64       
 6   Delivery_location_longitude  41953 non-null  float64       
 7   Distance_res_loc_KM          41953 non-null  float64       
 8   Order_Date                   45593 non-null  datetime64[ns]
 9   Time_Orderd                  43862 non-null  object        
 10  Time_Order_picked            45593 non-null  object        
 11  Time_res_pickup              43862 non-nu

In [10]:
# Number of missing values per column.
df.isna().sum()

Delivery_person_ID                0
Delivery_person_Age            1854
Delivery_person_Ratings        1908
Restaurant_latitude            3640
Restaurant_longitude           3640
Delivery_location_latitude     3640
Delivery_location_longitude    3640
Distance_res_loc_KM            3640
Order_Date                        0
Time_Orderd                    1731
Time_Order_picked                 0
Time_res_pickup                1731
Weatherconditions               616
Road_traffic_density            601
Vehicle_condition                 0
Type_of_order                     0
Type_of_vehicle                   0
multiple_deliveries             993
Festival                        228
City                           1200
Time_taken(min)                   0
dtype: int64

We have to perform these tasks:

- Delivery_person_Age: test different imputation techniques, e.g. KNN and iterative imputation

- Delivery_person_Ratings: impute with the mean

- Distance_res_loc_KM: test different imputation techniques

- Time_Orderd: we may drop this column

- Time_res_pickup: test different imputation techniques

- Weatherconditions: impute with the most frequent value

- Road_traffic_density: impute with the most frequent value

- multiple_deliveries: fill with 0

- City: impute with the most frequent value (Metropolitan)

- Festival: impute with the most frequent value (no)

- The four latitude/longitude columns: we may drop these columns.