In [14]:
import os
import pandas as pd
# Artifact locations. Each path component is passed to os.path.join separately
# so the platform's separator is used throughout, instead of mixing a
# hard-coded "/" with the separator that os.path.join inserts.
preprocessor_path = os.path.join("artifacts", "preprocessor", "preprocessor.pkl")
train_path = os.path.join("artifacts", "Data", "train.csv")
test_path = os.path.join("artifacts", "Data", "test.csv")
prediction_path = os.path.join("artifacts", "prediction", "prediction.csv")

# Load the train, test and prediction CSV files into DataFrames.
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
df_prediction = pd.read_csv(prediction_path)

In [4]:
# main.py
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, OrdinalEncoder
from src.components.custom_transformer import FilterOutBigValuesTransformer
import joblib

# Column groups routed to each sub-pipeline of the ColumnTransformer below.
# NOTE(review): 'Delivery_person_Age' and 'Delivery_person_Ratings' are listed
# in both mean_features and standardize_features, and 'Road_traffic_density'
# and 'Type_of_vehicle' are listed in both ohe_categories and
# ordinal_categories, so those columns are emitted twice in the output matrix.
# Left unchanged here because removing them would change the output width —
# confirm whether the duplication is intended.
mean_features = ['Delivery_person_Age', 'Delivery_person_Ratings', 'multiple_deliveries', 'Hour_order', 'Min_order']
ohe_categories = ['Road_traffic_density', 'Type_of_vehicle']
ordinal_categories = ['Road_traffic_density', 'Weatherconditions', 'Type_of_vehicle', 'City']
robust_features = ['translogi_latitude', 'translogi_longitude', 'Delivery_location_latitude', 'Delivery_location_longitude']
standardize_features = ['Temperature', 'Traffic_Index', 'Delivery_person_Age', 'Delivery_person_Ratings']

# Numeric: fill missing values with the column mean, then standardize.
mean_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical (one-hot): fill missing values with the most frequent category;
# categories unseen during fit are ignored at transform time.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])

# Categorical (ordinal): fill missing values with the most frequent category,
# map categories to integer codes, then scale the codes.
# Categories unseen during fit are encoded as -1 instead of raising, so that
# inference tolerates new values the same way the one-hot branch does.
# NOTE(review): no explicit `categories` are given, so the integer codes are
# presumably assigned in sorted label order rather than a semantic ranking
# (e.g. traffic density) — confirm that this ordering is acceptable.
ordinal_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ('scaler', RobustScaler())
])

# Numeric: mean-impute then standardize (same steps as mean_pipeline, kept as
# a separate object so each ColumnTransformer entry owns its own estimator).
num_pipeline_standardize = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Numeric (coordinates): mean-impute then scale with RobustScaler.
num_pipeline_robust = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler())
])

# Assemble the per-group pipelines. Columns not named in any group are dropped
# (ColumnTransformer's default `remainder='drop'`).
preprocessor = ColumnTransformer(
    transformers=[
        ('num_mean', mean_pipeline, mean_features),
        ('num_standardize', num_pipeline_standardize, standardize_features),
        ('num_robust', num_pipeline_robust, robust_features),
        ('cat', cat_pipeline, ohe_categories),
        ('ordinal', ordinal_pipeline, ordinal_categories),
    ])

In [9]:
# Fit the preprocessor on the training split and display the transformed matrix.
# `df` is only assigned in a later cell, so referencing it here would raise
# NameError on a fresh kernel; the training frame loaded at the top is used.
# NOTE(review): assumes train.csv contains every column named in the feature
# lists — confirm.
preprocessor.fit_transform(df_train)

array([[ 1.30184564,  0.50588378,  0.4534272 , ..., -0.66666667,
         0.        ,  0.        ],
       [-0.27832441, -1.32449319,  0.4534272 , ...,  0.33333333,
         0.        ,  0.        ],
       [-0.98062221, -0.10424187, -1.31608473, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 1.65299455,  0.81094661, -1.31608473, ...,  1.        ,
         0.        ,  0.        ],
       [-0.80504776,  0.81094661,  0.4534272 , ...,  1.        ,
         1.        ,  0.        ],
       [ 0.24839894, -0.10424187,  0.4534272 , ...,  0.33333333,
         0.        ,  0.        ]])

In [10]:
# Persist the fitted preprocessor. The target directory is created first so the
# dump does not fail when it does not exist yet.
os.makedirs(os.path.dirname(preprocessor_path), exist_ok=True)
joblib.dump(preprocessor, preprocessor_path)

['artifacts/preprocessor\\preprocessor.pkl']

In [15]:
# Reload the persisted preprocessor from disk, rebinding `preprocessor`.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
preprocessor = joblib.load(preprocessor_path)

In [26]:
# Bind `df` to the prediction frame (an alias, not a copy) and display it.
df = df_prediction
df

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,translogi_latitude,translogi_longitude,Delivery_location_latitude,Delivery_location_longitude,Weatherconditions,Road_traffic_density,Vehicle_condition,Type_of_vehicle,...,Time_taken,Order_day,Order_month,Order_year,Hour_order,Min_order,avg_delivery_time_area,traffic_weather_impact,vehicle_capacity_utilization,distance
0,30.0,5.0,22.744648,75.894377,22.824648,75.974377,Fog,Medium,0,motorcycle,...,44,20,3,2022,18.0,20.0,27.315226,28.044816,0.0,12.081765


In [19]:
# Apply the preprocessor to the prediction rows (transform only, no refitting).
data = preprocessor.transform(df_prediction)

In [20]:
# Sanity check: dimensions of the transformed matrix as (rows, columns).
data.shape

(1, 25)