In [1]:
import os

In [2]:
%pwd

'c:\\Users\\Sanju\\WORKSPACE\\Late-Delivery-Classification-Machine-Learning-Project\\research'

In [3]:
os.chdir('../')

In [4]:
%pwd

'c:\\Users\\Sanju\\WORKSPACE\\Late-Delivery-Classification-Machine-Learning-Project'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``
    from the ``data_transformation`` section of the loaded config and from the
    schema's ``TARGET_COLUMN`` entry.

    NOTE(review): the fields are annotated as ``Path``, but the values are
    passed through unchanged from the loaded YAML, so they may actually be
    plain strings -- confirm before relying on ``Path`` methods.
    """
    # Output directory of the stage; created before this config is built, and
    # the transformed train/test CSVs are written into it.
    root_dir: Path
    # Destination file for the preprocessor object saved via ``save_object``.
    preprocessor_path: Path
    # CSV file read with ``pd.read_csv`` as the training split.
    train_data_path: Path
    # CSV file read with ``pd.read_csv`` as the test split.
    test_data_path: Path
    # Name of the label column, taken from ``schema.TARGET_COLUMN.name``.
    target_column: str

In [6]:
from MLProject.constants import *
from MLProject.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read the config, params and schema files and create the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Every stage writes below this directory, so make sure it exists.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the settings object for the data-transformation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataTransformationConfig`` populated from the
            ``data_transformation`` config section, with the target column
            name taken from the schema's ``TARGET_COLUMN`` entry.
        """
        stage_cfg = self.config.data_transformation
        target_spec = self.schema.TARGET_COLUMN

        create_directories([stage_cfg.root_dir])

        return DataTransformationConfig(
            root_dir=stage_cfg.root_dir,
            preprocessor_path=stage_cfg.preprocessor_path,
            train_data_path=stage_cfg.train_data_path,
            test_data_path=stage_cfg.test_data_path,
            target_column=target_spec.name,
        )

In [8]:
import os
from MLProject import logger
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from MLProject.entity.config_entity import DataTransformationConfig
from MLProject.utils.common import save_object
import pandas as pd
import scipy
import numpy as np
import pickle

In [9]:
class DataTransformation:
    """Fits the feature preprocessor and writes the transformed train/test sets."""

    # Numeric features: missing values are median-imputed, no scaling is applied.
    NUMERICAL_COLS = [
        'Days_for_shipment_(scheduled)', 'Benefit_per_order',
        'Sales_per_customer', 'Latitude', 'Longitude',
        'Order_Item_Discount', 'Order_Item_Discount_Rate',
        'Order_Item_Product_Price', 'Order_Item_Profit_Ratio',
        'Order_Item_Quantity', 'Sales', 'Order_Item_Total',
        'Order_Profit_Per_Order', 'Product_Price',
    ]

    # Nominal categorical features: mode-imputed, then one-hot encoded.
    NOMINAL_CAT_COLS = ['Type', 'Order_Status', 'Shipping_Mode']

    def __init__(self, config: DataTransformationConfig):
        """Store the stage settings (input paths, output paths, target column)."""
        self.config = config

    def get_data_transformation_object(self):
        """Build the (unfitted) preprocessing ``ColumnTransformer``.

        Returns:
            A ``ColumnTransformer`` that median-imputes ``NUMERICAL_COLS`` and
            mode-imputes then one-hot encodes (dropping the first category of
            each feature) ``NOMINAL_CAT_COLS``. Columns not listed are dropped,
            which is the ``ColumnTransformer`` default.
        """
        num_pipeline = Pipeline(
            steps=[
                ('imputer', SimpleImputer(strategy='median')),
            ]
        )

        # Nominal categorical pipeline
        nom_cat_pipeline = Pipeline(
            steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('OneHotEncoder', OneHotEncoder(drop='first')),
            ]
        )

        return ColumnTransformer([
            ('num_pipeline', num_pipeline, list(self.NUMERICAL_COLS)),
            ('nom_cat_pipeline', nom_cat_pipeline, list(self.NOMINAL_CAT_COLS)),
        ])

    @staticmethod
    def _clean_raw_frame(df):
        """Apply the value fixes shared by both splits and return a new frame."""
        return df.assign(
            # 'Same Day' shipments are folded into the 'Premium Class' category.
            Shipping_Mode=df['Shipping_Mode'].replace('Same Day', 'Premium Class'),
            # Discount rate is rescaled by a factor of 100.
            Order_Item_Discount_Rate=df['Order_Item_Discount_Rate'] * 100,
        )

    @staticmethod
    def _to_frame(matrix, feature_names, index):
        """Wrap a transformed matrix in a DataFrame with un-prefixed column names."""
        # ColumnTransformer may return a scipy sparse matrix when the output
        # is sparse enough; densify so the DataFrame holds plain columns.
        if hasattr(matrix, 'toarray'):
            matrix = matrix.toarray()
        # Strip the '<transformer>__' prefix added by ColumnTransformer, so
        # 'nom_cat_pipeline__Type_DEBIT' becomes 'Type_DEBIT'.
        columns = [name.split('__', 1)[-1] for name in feature_names]
        return pd.DataFrame(matrix, columns=columns, index=index)

    def initiate_data_transformation(self):
        """Transform the train/test CSVs and persist the fitted preprocessor.

        Reads both splits, applies the shared value fixes, fits the
        preprocessor on the training features only, transforms both splits
        with it, appends the target column, and writes ``trans_train.csv`` and
        ``trans_test.csv`` into ``config.root_dir``. The preprocessor is then
        saved to ``config.preprocessor_path``.

        Returns:
            Tuple ``(train_frame, test_frame, preprocessor_path)``.

        Raises:
            KeyError: If the target column or a cleaned column is missing from
                an input file.
        """
        # NOTE(review): assumes the schema's target column is
        # 'Late_delivery_risk', the name this stage previously hard-coded.
        target = self.config.target_column

        train_df = self._clean_raw_frame(pd.read_csv(self.config.train_data_path))
        test_df = self._clean_raw_frame(pd.read_csv(self.config.test_data_path))

        preprocessing_obj = self.get_data_transformation_object()

        # Fit on the training split only. The test split must be transformed
        # with the statistics and categories learned from train; refitting on
        # it leaks test information, can change the one-hot column layout, and
        # would make the saved preprocessor a test-fitted one.
        train_matrix = preprocessing_obj.fit_transform(train_df.drop(columns=[target]))
        test_matrix = preprocessing_obj.transform(test_df.drop(columns=[target]))
        feature_names = preprocessing_obj.get_feature_names_out()

        # Reusing each source frame's index keeps the features and the target
        # row-aligned in the column-wise concat.
        train_arr = pd.concat(
            [self._to_frame(train_matrix, feature_names, train_df.index), train_df[target]],
            axis=1,
        )
        test_arr = pd.concat(
            [self._to_frame(test_matrix, feature_names, test_df.index), test_df[target]],
            axis=1,
        )

        train_arr.to_csv(os.path.join(self.config.root_dir, 'trans_train.csv'), index=False)
        test_arr.to_csv(os.path.join(self.config.root_dir, 'trans_test.csv'), index=False)

        save_object(
            file_path=self.config.preprocessor_path,
            obj=preprocessing_obj,
        )

        return (
            train_arr,
            test_arr,
            self.config.preprocessor_path,
        )

In [10]:
# Run the data-transformation stage end to end. Exceptions propagate
# unchanged, so no try/except wrapper is needed around these calls.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
# initiate_data_transformation() builds its own preprocessor, so a separate
# get_data_transformation_object() call is not needed here. The result is
# bound to names so the cell does not echo two full DataFrames.
train_arr, test_arr, preprocessor_path = data_transformation.initiate_data_transformation()

[2024-02-29 22:21:17,773: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-02-29 22:21:17,776: INFO: common: yaml file: params.yaml loaded successfully]
[2024-02-29 22:21:17,778: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-02-29 22:21:17,779: INFO: common: created directory at: artifacts]
[2024-02-29 22:21:17,780: INFO: common: created directory at: artifacts/data_transformation]
