In [1]:
import os

In [2]:
%pwd

'c:\\Users\\PASCAL\\flight_price_prediction\\research'

In [3]:
# Step up to the project root only while the `src` package directory is not
# visible from the working directory. Without this guard, re-running the cell
# climbs one level further each time and the `src.flightprice` imports break.
if not os.path.isdir("src"):
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\PASCAL\\flight_price_prediction'

In [15]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Read-only settings bundle for the data-transformation stage.

    Instances are immutable (``frozen=True``): assigning to a field after
    construction raises an error.
    """

    # Working directory of this stage.
    root_dir: Path
    # Configured location of the stage's input data.
    data_path: Path
    # Configured location of the serialized preprocessor.
    preprocessor_path: Path

In [16]:
from src.flightprice.constants import *
from src.flightprice.utils.common import read_yaml, create_directories

In [17]:
class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: YAML file loaded into ``self.config``.
            params_filepath: YAML file loaded into ``self.params``.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config object for the data-transformation stage.

        Reads the ``data_transformation`` section of the loaded configuration
        and creates that section's ``root_dir`` as a side effect.

        Returns:
            A ``DataTransformationConfig`` carrying the section's values exactly
            as loaded (they are not converted to ``Path`` here).
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
            preprocessor_path=section.preprocessor_path,
        )

In [18]:
import os
from src.flightprice.logger import logging
import sys
from dataclasses import dataclass

import numpy as np 
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
import pickle
'''from src.flightprice.utils.common import save_object'''

'from src.flightprice.utils.common import save_object'

In [33]:


class DataTransformation:
    """Builds, fits and persists the feature preprocessor for the flight data.

    The preprocessor imputes and scales the numerical columns, and imputes,
    one-hot encodes and scales the categorical columns. Any other input column
    is dropped.
    """

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration.

        Args:
            config: Settings object for this stage.
        """
        self.config = config

    def get_data_transformer_object(self):
        """Create the unfitted preprocessing ``ColumnTransformer``.

        Returns:
            A ``ColumnTransformer`` combining the numerical and categorical
            pipelines; it always produces a dense array.

        Raises:
            Exception: Any error raised while building the pipelines is logged
                and then re-raised.
        """
        try:
            numerical_columns = ['Journey_day', 'Journey_month',
                                 'Dep_hour', 'Dep_min',
                                 'Arrival_hour', 'Arrival_min',
                                 'Duration_hours', 'Duration_mins']
            categorical_columns = ['Airline', 'Source', 'Destination', 'Total_Stops']

            num_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="median")),
                    ("scaler", StandardScaler())
                ]
            )

            cat_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    # Categories that were absent from the training split are
                    # encoded as all zeros instead of raising during transform.
                    ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore")),
                    # with_mean=False because the encoder output is sparse and
                    # cannot be centered.
                    ("scaler", StandardScaler(with_mean=False))
                ]
            )

            logging.info(f"Categorical columns: {categorical_columns}")
            logging.info(f"Numerical columns: {numerical_columns}")

            preprocessor = ColumnTransformer(
                transformers=[
                    ("num_pipeline", num_pipeline, numerical_columns),
                    ("cat_pipelines", cat_pipeline, categorical_columns)
                ],
                remainder="drop",  # Ignore any columns not explicitly specified
                # Always return a dense array: the caller concatenates the
                # result with the target column through np.c_, which does not
                # handle scipy sparse matrices.
                sparse_threshold=0
            )

            return preprocessor

        except Exception as e:
            logging.error(f"Error in get_data_transformer_object: {str(e)}")
            # Re-raise so the caller never receives None in place of a
            # preprocessor.
            raise

    def initiate_data_transformation(
        self,
        train_data_path='artifacts/data_ingestion/unzipped_data/train_data.csv',
        test_data_path='artifacts/data_ingestion/unzipped_data/test_data.csv',
    ):
        """Fit the preprocessor on the train split and transform both splits.

        Args:
            train_data_path: CSV file holding the training split, including
                the ``Price`` column.
            test_data_path: CSV file holding the test split, including the
                ``Price`` column.

        Returns:
            A tuple ``(train_arr, test_arr, preprocessing_obj_file)``. Each
            array holds the transformed features with the ``Price`` values
            appended as the last column; the third item is the path of the
            pickled, fitted preprocessor.

        Raises:
            Exception: Any error raised while reading, transforming or saving
                is logged and then re-raised.
        """
        # NOTE(review): self.config.data_path and self.config.preprocessor_path
        # are not consulted; the input and output locations below are kept as
        # they were so existing artifacts stay where downstream code expects.
        try:
            train_df = pd.read_csv(train_data_path)
            test_df = pd.read_csv(test_data_path)

            logging.info("Read train and test data completed")

            logging.info("Obtaining preprocessing object")

            preprocessing_obj = self.get_data_transformer_object()

            target_column_name = "Price"

            input_feature_train_df = train_df.drop(columns=[target_column_name])
            target_feature_train_df = train_df[target_column_name]

            input_feature_test_df = test_df.drop(columns=[target_column_name])
            target_feature_test_df = test_df[target_column_name]

            logging.info("Applying preprocessing object on training dataframe and testing dataframe.")

            # Fit on the training split only; the test split is transformed
            # with the statistics learned from training.
            input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)

            train_arr = np.c_[
                input_feature_train_arr, np.array(target_feature_train_df)
            ]
            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

            # Save preprocessing object
            preprocessing_obj_file = os.path.join("artifacts", 'data_transformation', 'preprocessing_obj.pkl')
            # Do not rely on another component having created the directory.
            os.makedirs(os.path.dirname(preprocessing_obj_file), exist_ok=True)
            with open(preprocessing_obj_file, 'wb') as file:
                pickle.dump(preprocessing_obj, file)

            logging.info("Saved preprocessing object.")
            logging.info("Transformation of the data is completed")

            return (
                train_arr,
                test_arr,
                preprocessing_obj_file
            )
        except Exception as e:
            logging.error(f"Error in initiate_data_transformation: {str(e)}")
            # Re-raise so a failed run is not mistaken for a successful one
            # that returned None.
            raise


In [34]:
# Run the data-transformation stage end to end. Exceptions propagate on their
# own, so no try/except wrapper is needed around these calls.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(data_transformation_config)
# Bind the result so it stays available to later cells and the cell does not
# echo it as output.
transformation_result = data_transformation.initiate_data_transformation()


[2023-07-16 04:37:23,334: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-07-16 04:37:23,336: INFO: common: yaml file: params.yaml loaded successfully]
[2023-07-16 04:37:23,338: INFO: common: created directory at: artifacts]
[2023-07-16 04:37:23,339: INFO: common: created directory at: artifacts/data_transformation]
[2023-07-16 04:37:23,340: INFO: 82110484: Read train and test data completed]
[2023-07-16 04:37:23,341: INFO: 82110484: Obtaining preprocessing object]
[2023-07-16 04:37:23,342: INFO: 82110484: Categorical columns: ['Airline', 'Source', 'Destination', 'Total_Stops']]
[2023-07-16 04:37:23,343: INFO: 82110484: Numerical columns: ['Journey_day', 'Journey_month', 'Dep_hour', 'Dep_min', 'Arrival_hour', 'Arrival_min', 'Duration_hours', 'Duration_mins']]
[2023-07-16 04:37:23,376: INFO: 82110484: Applying preprocessing object on training dataframe and testing dataframe.]
[2023-07-16 04:37:23,490: INFO: 82110484: Saved preprocessing object.]
[2023-07-16 04:37: