In [61]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Location of the raw training CSV; it is downloaded each time this cell runs.
DATA_URL = 'https://raw.githubusercontent.com/Shivan118/New-Machine-Learning-Modular-Coding-projecs/refs/heads/main/Data/finalTrain.csv'

df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min)
0,0xcdcd,DEHRES17DEL01,36.0,4.2,30.327968,78.046106,30.397968,78.116106,12-02-2022,21:55,22:10,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46
1,0xd987,KOCRES16DEL01,21.0,4.7,10.003064,76.307589,10.043064,76.347589,13-02-2022,14:55,15:05,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23
2,0x2784,PUNERES13DEL03,23.0,4.7,18.56245,73.916619,18.65245,74.006619,04-03-2022,17:30,17:40,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21
3,0xc8b6,LUDHRES15DEL02,34.0,4.3,30.899584,75.809346,30.919584,75.829346,13-02-2022,09:20,09:30,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,20
4,0xdb64,KNPRES14DEL02,24.0,4.7,26.463504,80.372929,26.593504,80.502929,14-02-2022,19:50,20:05,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,41


In [62]:
# Remove the 'ID' column. Reassigning with errors='ignore' (instead of
# inplace=True) keeps this cell safe to re-run once the column is already gone.
df = df.drop(columns=['ID'], errors='ignore')

In [63]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45584 entries, 0 to 45583
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Delivery_person_ID           45584 non-null  object 
 1   Delivery_person_Age          43730 non-null  float64
 2   Delivery_person_Ratings      43676 non-null  float64
 3   Restaurant_latitude          45584 non-null  float64
 4   Restaurant_longitude         45584 non-null  float64
 5   Delivery_location_latitude   45584 non-null  float64
 6   Delivery_location_longitude  45584 non-null  float64
 7   Order_Date                   45584 non-null  object 
 8   Time_Orderd                  43853 non-null  object 
 9   Time_Order_picked            45584 non-null  object 
 10  Weather_conditions           44968 non-null  object 
 11  Road_traffic_density         44983 non-null  object 
 12  Vehicle_condition            45584 non-null  int64  
 13  Type_of_order   

## Steps:
1. Calculate distance using lat/long
2. Calculate Delivery city and Ratings
3. Extract day/month/year from order date
4. Preprocess Time ordered
5. Preprocess Time order picked

### 1. Extracting distance using Restaurant lat/long and delivery lat/long

In [64]:
import math

def distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    The haversine formula is evaluated on a sphere of radius 6371.0 km.

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.

    Returns:
        The distance in kilometres as a float.
    """
    earth_radius_km = 6371.0

    # math.sin / math.cos work in radians, so convert all four angles first.
    phi1, lam1, phi2, lam2 = (
        math.radians(angle) for angle in (lat1, lon1, lat2, lon2)
    )
    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1

    # Haversine term, followed by the central angle between the two points.
    half_chord_sq = (
        math.sin(delta_phi / 2) ** 2
        + math.cos(phi1) * math.cos(phi2) * math.sin(delta_lam / 2) ** 2
    )
    central_angle = 2 * math.atan2(
        math.sqrt(half_chord_sq), math.sqrt(1 - half_chord_sq)
    )

    return earth_radius_km * central_angle

# Haversine distance between each restaurant and its delivery location,
# evaluated one row at a time with distance().
df['distance'] = [
    distance(rest_lat, rest_lon, drop_lat, drop_lon)
    for rest_lat, rest_lon, drop_lat, drop_lon in zip(
        df['Restaurant_latitude'],
        df['Restaurant_longitude'],
        df['Delivery_location_latitude'],
        df['Delivery_location_longitude'],
    )
]

### 2. Calculate city and delivery ratings

In [65]:
def cal_ratings(df, col1, col2):
    """Add a ``Delivery_city`` column and fill missing ``col2`` values.

    ``Delivery_city`` is the part of ``df[col1]`` that precedes the first
    ``'RES'``. Missing values in ``df[col2]`` are replaced by the mean of
    ``col2`` over rows sharing the same ``col1`` value, rounded to one decimal.

    Args:
        df: DataFrame to update; it is modified in place.
        col1: Name of the string column used for the split and the grouping.
        col2: Name of the numeric column whose missing values are filled.

    Returns:
        The same DataFrame object that was passed in.
    """
    id_parts = df[col1].str.split('RES', expand=True)
    df['Delivery_city'] = id_parts[0]

    # One rounded mean per distinct col1 value, looked up for every row.
    mean_by_id = df.groupby(col1)[col2].mean().round(1)
    fallback = df[col1].map(mean_by_id)
    df[col2] = df[col2].fillna(fallback)

    return df

# Derive Delivery_city from the person ID and fill missing ratings.
df = cal_ratings(
    df,
    col1='Delivery_person_ID',
    col2='Delivery_person_Ratings',
)

In [66]:
df.head()

Unnamed: 0,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min),distance,Delivery_city
0,DEHRES17DEL01,36.0,4.2,30.327968,78.046106,30.397968,78.116106,12-02-2022,21:55,22:10,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46,10.280582,DEH
1,KOCRES16DEL01,21.0,4.7,10.003064,76.307589,10.043064,76.347589,13-02-2022,14:55,15:05,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23,6.242319,KOC
2,PUNERES13DEL03,23.0,4.7,18.56245,73.916619,18.65245,74.006619,04-03-2022,17:30,17:40,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21,13.78786,PUNE
3,LUDHRES15DEL02,34.0,4.3,30.899584,75.809346,30.919584,75.829346,13-02-2022,09:20,09:30,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,20,2.930258,LUDH
4,KNPRES14DEL02,24.0,4.7,26.463504,80.372929,26.593504,80.502929,14-02-2022,19:50,20:05,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,41,19.396618,KNP


### 3. Extract day/month/year from order date

In [67]:
def preprocess_date(df, column):
    """Parse a ``DD-MM-YYYY`` column and add ``year``, ``month`` and ``day``.

    Args:
        df: DataFrame to update; it is modified in place.
        column: Name of the column holding dates formatted as ``%d-%m-%Y``.

    Returns:
        The same DataFrame object, with ``column`` converted to datetime and
        the three derived columns added.
    """
    parsed = pd.to_datetime(df[column], format='%d-%m-%Y')
    df[column] = parsed

    # Derived parts are appended in this fixed order: year, month, day.
    for part in ('year', 'month', 'day'):
        df[part] = getattr(parsed.dt, part)

    return df

# Convert Order_Date to datetime and add the year/month/day columns.
df = preprocess_date(df, column='Order_Date')

In [68]:
df.head()

Unnamed: 0,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min),distance,Delivery_city,year,month,day
0,DEHRES17DEL01,36.0,4.2,30.327968,78.046106,30.397968,78.116106,2022-02-12,21:55,22:10,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46,10.280582,DEH,2022,2,12
1,KOCRES16DEL01,21.0,4.7,10.003064,76.307589,10.043064,76.347589,2022-02-13,14:55,15:05,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23,6.242319,KOC,2022,2,13
2,PUNERES13DEL03,23.0,4.7,18.56245,73.916619,18.65245,74.006619,2022-03-04,17:30,17:40,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21,13.78786,PUNE,2022,3,4
3,LUDHRES15DEL02,34.0,4.3,30.899584,75.809346,30.919584,75.829346,2022-02-13,09:20,09:30,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,20,2.930258,LUDH,2022,2,13
4,KNPRES14DEL02,24.0,4.7,26.463504,80.372929,26.593504,80.502929,2022-02-14,19:50,20:05,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,41,19.396618,KNP,2022,2,14


### 4. Time Ordered

In [69]:
def process_time_ordered(df, time_column):
    """Clean the order-time column and derive hour/minute features.

    Processing steps:

    1. Drop rows whose ``time_column`` is missing.
    2. Replace literal ``'.'`` separators with ``':'``.
    3. Keep only the ``HH:MM`` part of each value; values without any ``':'``
       become ``'00:00'``.
    4. Parse with format ``%H:%M``; rows that fail to parse are dropped.
    5. Add ``TimeOrder_Hour`` (hour 0 is stored as 24) and ``TimeOrder_min``.

    Args:
        df: DataFrame to update; it is modified in place (rows are dropped and
            ``time_column`` is overwritten with datetimes).
        time_column: Name of the column holding the time strings.

    Returns:
        The same DataFrame object that was passed in.
    """
    df.dropna(subset=[time_column], inplace=True)

    # regex=False makes '.' a literal dot. The default of ``regex`` differs
    # between pandas versions, and as a regular expression '.' would match
    # every character.
    df[time_column] = df[time_column].str.replace('.', ':', regex=False)

    def extract_time(x):
        """Return ``HH:MM`` built from the first two ':'-separated fields."""
        parts = x.split(':')
        try:
            # Only the first two characters of the minutes field are kept.
            return parts[0] + ':' + parts[1][:2]
        except IndexError:
            # No ':' separator at all: fall back to midnight.
            return '00:00'

    df[time_column] = df[time_column].apply(extract_time)

    # Unparseable values become NaT and are removed right after.
    df[time_column] = pd.to_datetime(df[time_column], format='%H:%M', errors='coerce')
    df.dropna(subset=[time_column], inplace=True)

    # Hour 0 is recorded as 24.
    df['TimeOrder_Hour'] = df[time_column].dt.hour.replace(0, 24).astype(int)
    df['TimeOrder_min'] = df[time_column].dt.minute

    return df

# Parse Time_Orderd and add the TimeOrder_Hour / TimeOrder_min columns.
df = process_time_ordered(df, time_column='Time_Orderd')

In [70]:
df.head()

Unnamed: 0,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min),distance,Delivery_city,year,month,day,TimeOrder_Hour,TimeOrder_min
0,DEHRES17DEL01,36.0,4.2,30.327968,78.046106,30.397968,78.116106,2022-02-12,1900-01-01 21:55:00,22:10,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46,10.280582,DEH,2022,2,12,21,55
1,KOCRES16DEL01,21.0,4.7,10.003064,76.307589,10.043064,76.347589,2022-02-13,1900-01-01 14:55:00,15:05,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23,6.242319,KOC,2022,2,13,14,55
2,PUNERES13DEL03,23.0,4.7,18.56245,73.916619,18.65245,74.006619,2022-03-04,1900-01-01 17:30:00,17:40,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21,13.78786,PUNE,2022,3,4,17,30
3,LUDHRES15DEL02,34.0,4.3,30.899584,75.809346,30.919584,75.829346,2022-02-13,1900-01-01 09:20:00,09:30,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,20,2.930258,LUDH,2022,2,13,9,20
4,KNPRES14DEL02,24.0,4.7,26.463504,80.372929,26.593504,80.502929,2022-02-14,1900-01-01 19:50:00,20:05,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,41,19.396618,KNP,2022,2,14,19,50


### 5. Time Order Picked

In [71]:
def process_time_order_picked(df, time_column):
    """Clean the pick-up time column and derive hour/minute features.

    Follows the same steps as ``process_time_ordered`` but writes the results
    to ``Time_Order_picked_Hour`` and ``Time_Order_picked_min``:

    1. Drop rows whose ``time_column`` is missing.
    2. Replace literal ``'.'`` separators with ``':'``.
    3. Keep only the ``HH:MM`` part of each value; values without any ``':'``
       become ``'00:00'``.
    4. Parse with format ``%H:%M``; rows that fail to parse are dropped.
    5. Add the hour column (hour 0 is stored as 24) and the minute column.

    Args:
        df: DataFrame to update; it is modified in place (rows are dropped and
            ``time_column`` is overwritten with datetimes).
        time_column: Name of the column holding the time strings.

    Returns:
        The same DataFrame object that was passed in.
    """
    df.dropna(subset=[time_column], inplace=True)

    # '.' has to be treated as a literal dot, not as a regular expression
    # (where it matches any character); the ``regex`` default depends on the
    # installed pandas version, so it is pinned explicitly.
    df[time_column] = df[time_column].str.replace('.', ':', regex=False)

    def extract_time(x):
        """Return ``HH:MM`` built from the first two ':'-separated fields."""
        fields = x.split(':')
        try:
            # Minutes are truncated to their first two characters.
            return fields[0] + ':' + fields[1][:2]
        except IndexError:
            # Value contains no ':' at all: fall back to midnight.
            return '00:00'

    df[time_column] = df[time_column].apply(extract_time)

    # errors='coerce' turns unparseable strings into NaT, dropped just below.
    df[time_column] = pd.to_datetime(df[time_column], format='%H:%M', errors='coerce')
    df.dropna(subset=[time_column], inplace=True)

    # Hour 0 is recorded as 24.
    picked_hour = df[time_column].dt.hour
    df['Time_Order_picked_Hour'] = picked_hour.replace(0, 24).astype(int)
    df['Time_Order_picked_min'] = df[time_column].dt.minute

    return df

# Parse Time_Order_picked and add its hour / minute columns.
df = process_time_order_picked(df, time_column='Time_Order_picked')

In [72]:
df.head()

Unnamed: 0,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min),distance,Delivery_city,year,month,day,TimeOrder_Hour,TimeOrder_min,Time_Order_picked_Hour,Time_Order_picked_min
0,DEHRES17DEL01,36.0,4.2,30.327968,78.046106,30.397968,78.116106,2022-02-12,1900-01-01 21:55:00,1900-01-01 22:10:00,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46,10.280582,DEH,2022,2,12,21,55,22,10
1,KOCRES16DEL01,21.0,4.7,10.003064,76.307589,10.043064,76.347589,2022-02-13,1900-01-01 14:55:00,1900-01-01 15:05:00,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23,6.242319,KOC,2022,2,13,14,55,15,5
2,PUNERES13DEL03,23.0,4.7,18.56245,73.916619,18.65245,74.006619,2022-03-04,1900-01-01 17:30:00,1900-01-01 17:40:00,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21,13.78786,PUNE,2022,3,4,17,30,17,40
3,LUDHRES15DEL02,34.0,4.3,30.899584,75.809346,30.919584,75.829346,2022-02-13,1900-01-01 09:20:00,1900-01-01 09:30:00,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,20,2.930258,LUDH,2022,2,13,9,20,9,30
4,KNPRES14DEL02,24.0,4.7,26.463504,80.372929,26.593504,80.502929,2022-02-14,1900-01-01 19:50:00,1900-01-01 20:05:00,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,41,19.396618,KNP,2022,2,14,19,50,20,5


In [73]:
# Columns removed before modelling: raw coordinates, raw time columns, the
# person ID and the date/time parts that are not kept as features.
columns_to_drop = [
    'Restaurant_latitude', 'Restaurant_longitude',
    'Delivery_location_latitude', 'Delivery_location_longitude',
    'year', 'month', 'day',
    'TimeOrder_min', 'Time_Order_picked_Hour', 'Time_Order_picked_min',
    'Time_Orderd', 'Time_Order_picked', 'Delivery_person_ID',
]

# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run after the columns have already been removed.
df = df.drop(columns=columns_to_drop, errors='ignore')

In [74]:
df.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Order_Date,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min),distance,Delivery_city,TimeOrder_Hour
0,36.0,4.2,2022-02-12,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46,10.280582,DEH,21
1,21.0,4.7,2022-02-13,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23,6.242319,KOC,14
2,23.0,4.7,2022-03-04,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21,13.78786,PUNE,17
3,34.0,4.3,2022-02-13,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,20,2.930258,LUDH,9
4,24.0,4.7,2022-02-14,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,41,19.396618,KNP,19


In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37614 entries, 0 to 45583
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Delivery_person_Age      37432 non-null  float64       
 1   Delivery_person_Ratings  37614 non-null  float64       
 2   Order_Date               37614 non-null  datetime64[ns]
 3   Weather_conditions       37614 non-null  object        
 4   Road_traffic_density     37614 non-null  object        
 5   Vehicle_condition        37614 non-null  int64         
 6   Type_of_order            37614 non-null  object        
 7   Type_of_vehicle          37614 non-null  object        
 8   multiple_deliveries      36798 non-null  float64       
 9   Festival                 37419 non-null  object        
 10  City                     36622 non-null  object        
 11  Time_taken (min)         37614 non-null  int64         
 12  distance                 37614 non-nu

In [76]:
df.shape

(37614, 15)

## Feature Engineering

In [77]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer

In [78]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
train_data, test_data = train_test_split(
    df,
    test_size=0.20,
    random_state=42,
)

In [79]:
df.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Order_Date,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min),distance,Delivery_city,TimeOrder_Hour
0,36.0,4.2,2022-02-12,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46,10.280582,DEH,21
1,21.0,4.7,2022-02-13,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23,6.242319,KOC,14
2,23.0,4.7,2022-03-04,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21,13.78786,PUNE,17
3,34.0,4.3,2022-02-13,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,20,2.930258,LUDH,9
4,24.0,4.7,2022-02-14,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,41,19.396618,KNP,19


In [80]:
df.shape

(37614, 15)

#### Creating the feature engineering pipeline

In [81]:
def create_preprocessor(numerical_columns, categorical_columns, ordinal_columns, ordinal_categories):
    """Build the unfitted ColumnTransformer used for feature engineering.

    Three sub-pipelines are combined, in this output order:

    * ``numerical_pipeline``: fill missing values with the constant 0, then
      scale.
    * ``categorical_pipeline``: fill missing values with the most frequent
      value, one-hot encode (``handle_unknown='ignore'``), then scale.
    * ``ordinal_pipeline``: fill missing values with the most frequent value,
      ordinal-encode using ``ordinal_categories``, then scale.

    Every scaler is created with ``with_mean=False``, so values are scaled
    without being centred.

    Args:
        numerical_columns: Column names routed to the numerical pipeline.
        categorical_columns: Column names routed to the one-hot pipeline.
        ordinal_columns: Column names routed to the ordinal pipeline.
        ordinal_categories: One list of ordered category values per entry of
            ``ordinal_columns``, passed to ``OrdinalEncoder(categories=...)``.

    Returns:
        An unfitted ``ColumnTransformer`` configured to return dense output.
    """
    numerical_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', StandardScaler(with_mean=False)),
    ])

    categorical_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ('scaler', StandardScaler(with_mean=False)),
    ])

    ordinal_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(categories=ordinal_categories)),
        ('scaler', StandardScaler(with_mean=False)),
    ])

    # The one-hot branch yields a sparse matrix, and ColumnTransformer stacks
    # the result as sparse whenever the overall density is below
    # sparse_threshold (0.3 by default). sparse_threshold=0 always returns a
    # dense array, which transform_data needs for pd.DataFrame(..., columns=...).
    preprocessor = ColumnTransformer(
        [
            ('numerical_pipeline', numerical_pipeline, numerical_columns),
            ('categorical_pipeline', categorical_pipeline, categorical_columns),
            ('ordinal_pipeline', ordinal_pipeline, ordinal_columns),
        ],
        sparse_threshold=0,
    )

    return preprocessor

In [82]:
# Category orderings handed to the ordinal encoder, in the order listed.
Road_traffic_density = ['Low', 'Medium', 'High', 'Jam']
Weather_conditions = ['Sunny', 'Cloudy', 'Windy', 'Fog', 'Sandstorms', 'Stormy']

# Column groups routed to the numerical, one-hot and ordinal pipelines.
numerical_columns = [
    'Delivery_person_Age',
    'Delivery_person_Ratings',
    'Vehicle_condition',
    'multiple_deliveries',
    'TimeOrder_Hour',
    'distance',
]
categorical_columns = [
    'Type_of_order',
    'Type_of_vehicle',
    'Festival',
    'City',
    'Delivery_city',
]
ordinal_columns = ['Road_traffic_density', 'Weather_conditions']
# One category list per ordinal column, in the same order as ordinal_columns.
ordinal_categories = [Road_traffic_density, Weather_conditions]

preprocessor = create_preprocessor(
    numerical_columns,
    categorical_columns,
    ordinal_columns,
    ordinal_categories,
)

#### Function for feature engineering

In [83]:
def transform_data(preprocessor, df_train, df_test, numerical_columns, categorical_columns, ordinal_columns, ordinal_categories):
    """Fit the preprocessor on the training data and transform both splits.

    Args:
        preprocessor: Transformer exposing ``fit_transform``, ``transform`` and
            ``named_transformers_['categorical_pipeline']['onehot']``.
        df_train: Training data; the preprocessor is fitted on it.
        df_test: Test data; it is only transformed.
        numerical_columns: Names of the numerical columns, in output order.
        categorical_columns: Names of the one-hot encoded columns.
        ordinal_columns: Names of the ordinal columns, in output order.
        ordinal_categories: Not used by this function; kept in the signature
            for existing callers.

    Returns:
        A ``(train, test)`` tuple of DataFrames whose columns are the numerical
        names, then the one-hot feature names, then the ordinal names. Both
        frames get a fresh default index, not the index of the inputs.
    """
    def _to_dense(matrix):
        # Sparse results expose toarray(); dense arrays pass through unchanged.
        return matrix.toarray() if hasattr(matrix, 'toarray') else matrix

    train_array = _to_dense(preprocessor.fit_transform(df_train))
    test_array = _to_dense(preprocessor.transform(df_test))

    # One-hot feature names are only known after fitting.
    onehot = preprocessor.named_transformers_['categorical_pipeline']['onehot']
    ohe_columns = list(onehot.get_feature_names_out(categorical_columns))

    all_columns = list(numerical_columns) + ohe_columns + list(ordinal_columns)

    df_train_transformed = pd.DataFrame(train_array, columns=all_columns)
    df_test_transformed = pd.DataFrame(test_array, columns=all_columns)

    return df_train_transformed, df_test_transformed

In [84]:
# Pull the target out of each split, then remove it from the feature frames.
target_col_train = train_data['Time_taken (min)']
train_data.drop(columns=['Time_taken (min)'], inplace=True)

target_col_test = test_data['Time_taken (min)']
test_data.drop(columns=['Time_taken (min)'], inplace=True)

In [85]:
# Build a fresh, unfitted preprocessor; transform_data fits it on the training
# split and applies it to both splits.
preprocessor = create_preprocessor(
    numerical_columns,
    categorical_columns,
    ordinal_columns,
    ordinal_categories,
)

train_data_transformed, test_data_transformed = transform_data(
    preprocessor,
    train_data,
    test_data,
    numerical_columns,
    categorical_columns,
    ordinal_columns,
    ordinal_categories,
)

In [86]:
# Lift pandas' display limits: None means no cap on the columns / rows shown.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [87]:
train_data_transformed.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Vehicle_condition,multiple_deliveries,TimeOrder_Hour,distance,Type_of_order_Buffet,Type_of_order_Drinks,Type_of_order_Meal,Type_of_order_Snack,Type_of_vehicle_electric_scooter,Type_of_vehicle_motorcycle,Type_of_vehicle_scooter,Festival_No,Festival_Yes,City_Metropolitian,City_Semi-Urban,City_Urban,Delivery_city_AGR,Delivery_city_ALH,Delivery_city_AURG,Delivery_city_BANG,Delivery_city_BHP,Delivery_city_CHEN,Delivery_city_COIMB,Delivery_city_DEH,Delivery_city_GOA,Delivery_city_HYD,Delivery_city_INDO,Delivery_city_JAP,Delivery_city_KNP,Delivery_city_KOC,Delivery_city_KOL,Delivery_city_LUDH,Delivery_city_MUM,Delivery_city_MYS,Delivery_city_PUNE,Delivery_city_RANCHI,Delivery_city_SUR,Delivery_city_VAD,Road_traffic_density,Weather_conditions
0,3.441411,14.152959,2.452161,1.745049,3.848087,0.03468,0.0,0.0,0.0,2.300475,0.0,2.030393,0.0,7.451937,0.0,0.0,0.0,2.400481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.014561,0.0,0.0,0.0,0.806045,1.759786
1,4.588548,12.866326,1.22608,1.745049,4.275652,0.039629,2.322275,0.0,0.0,0.0,0.0,2.030393,0.0,7.451937,0.0,2.386711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.014561,0.0,0.0,0.0,2.418134,1.759786
2,6.227315,14.796275,2.452161,1.745049,1.710261,0.010041,0.0,2.314837,0.0,0.0,0.0,0.0,2.121179,7.451937,0.0,2.386711,0.0,0.0,0.0,0.0,0.0,3.898309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.586595
3,6.063438,16.082908,0.0,1.745049,2.565391,0.020185,0.0,0.0,0.0,2.300475,0.0,2.030393,0.0,7.451937,0.0,2.386711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.949078,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.612089,1.173191
4,5.899562,13.187984,0.0,1.745049,4.275652,0.065258,2.322275,0.0,0.0,0.0,0.0,2.030393,0.0,7.451937,0.0,2.386711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.957227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.418134,2.346382


In [88]:
# pd.concat(axis=1) aligns rows on index labels. The transformed frames carry a
# fresh 0..n-1 index while the target Series keep the row labels of the split,
# so both sides are reset to positional order first. Without this the rows are
# mismatched and padded with NaN.
train_data_transformed = pd.concat(
    [
        train_data_transformed.reset_index(drop=True),
        target_col_train.reset_index(drop=True),
    ],
    axis=1,
)
test_data_transformed = pd.concat(
    [
        test_data_transformed.reset_index(drop=True),
        target_col_test.reset_index(drop=True),
    ],
    axis=1,
)

## Model Training

In [89]:
# Features: every transformed column except the target.
X_train = train_data_transformed.drop(columns=['Time_taken (min)'])
# Target, kept as a single-column DataFrame (double brackets).
y_train = train_data_transformed[['Time_taken (min)']]

In [90]:
from sklearn.ensemble import RandomForestRegressor

In [91]:
def evaluate_reg(true, predicted):
    """Compute regression metrics for a set of predictions.

    Args:
        true: Ground-truth target values.
        predicted: Predicted target values, same length as ``true``.

    Returns:
        A tuple ``(r2, MAE, MSE, rmse)``: R² score, mean absolute error,
        mean squared error and the square root of the mean squared error.
    """
    r2 = r2_score(true, predicted)
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    # RMSE is derived from the MSE computed above instead of recomputing it.
    rmse = np.sqrt(mse)

    return r2, mae, mse, rmse

In [None]:
def train_model(X_train, y_train, params=None) -> "RandomForestRegressor":
    """Train a random forest regressor.

    Args:
        X_train: Training features.
        y_train: Training targets. A single-column 2-D input (for example a
            one-column DataFrame) is flattened to 1-D before fitting.
        params: Optional dict of keyword arguments forwarded to
            ``RandomForestRegressor``. ``None`` (the default) uses the
            estimator's defaults.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    # The return annotation is quoted so it is not evaluated when the
    # function is defined.
    rf = RandomForestRegressor(**(params or {}))

    targets = np.asarray(y_train)
    if targets.ndim == 2 and targets.shape[1] == 1:
        # Column vector -> 1-D array, the shape expected for a single target.
        targets = targets.ravel()

    rf.fit(X_train, targets)
    return rf

def save_model(model, file_path: str) -> None:
    """Save the trained model to a file using pickle.

    Args:
        model: Object to serialise.
        file_path: Destination path. Its parent directory is created when it
            does not exist yet.
    """
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        # open() does not create missing directories, so make them first.
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'wb') as file:
        pickle.dump(model, file)

In [None]:
# Train with the estimator's default hyperparameters and persist the result.
clf = train_model(X_train, y_train)

save_model(clf, 'models/model.pkl')