In [1]:
import os

In [2]:
%pwd

'c:\\Users\\PASCAL\\flight_price_prediction\\research'

In [3]:
# Step up one directory so that `src.` package imports in later cells resolve
# (per the recorded %pwd outputs: from .../research to the project root).
# NOTE: the path is relative, so re-running this cell without restarting the
# kernel climbs one more level each time.
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\PASCAL\\flight_price_prediction'

In [22]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings bundle for the data-transformation stage.

    Instances are frozen: attributes cannot be reassigned after construction.

    Attributes:
        root_dir: Directory belonging to this stage (created by the
            configuration manager before the config object is built).
        data_train_path: Location of the training CSV that is read with
            ``pd.read_csv``.
        preprocessor_path: Path configured for the preprocessor artifact.
    """

    root_dir: Path
    data_train_path: Path
    preprocessor_path: Path

In [23]:
from src.flightprice.constants import *
from src.flightprice.utils.common import read_yaml, create_directories

In [24]:
class ConfigurationManager:
    """Reads the project's YAML settings and builds per-stage config objects.

    Constructing an instance loads both YAML files through ``read_yaml`` and
    asks ``create_directories`` to create the ``artifacts_root`` directory
    named in the main config.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load configuration and parameters.

        Args:
            config_filepath: Path of the main config YAML
                (defaults to ``CONFIG_FILE_PATH``).
            params_filepath: Path of the params YAML
                (defaults to ``PARAMS_FILE_PATH``).
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the settings object for the data-transformation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataTransformationConfig`` populated from the
            ``data_transformation`` section of the config file.
        """
        section = self.config.data_transformation

        create_directories([section.root_dir])

        # The dataclass declares its fields as Path; coerce the raw config
        # values so the runtime types match the annotations.
        return DataTransformationConfig(
            root_dir=Path(section.root_dir),
            data_train_path=Path(section.data_train_path),
            preprocessor_path=Path(section.preprocessor_path),
        )

In [25]:
import os
from src.flightprice.logger import logging
import sys
from dataclasses import dataclass

import numpy as np 
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
import pickle
'''from src.flightprice.utils.common import save_object'''

'from src.flightprice.utils.common import save_object'

In [26]:

class DataTransformation:
    """Builds, fits and pickles the feature preprocessor for the training data."""

    def __init__(self, config):
        """Keep a reference to the stage settings.

        Args:
            config: Object exposing ``data_train_path`` (CSV that is read for
                fitting) and ``preprocessor_path`` (file the fitted
                preprocessor is pickled to).
        """
        self.data_transformation_config = config

    def get_data_transformer_object(self, data=None):
        """Assemble the unfitted ``ColumnTransformer`` for the flight data.

        Numeric candidates are median-imputed and standard-scaled; categorical
        columns are imputed with the most frequent value and one-hot encoded.
        Columns not listed in either group are dropped by the transformer's
        default ``remainder`` setting.

        Args:
            data: Optional training DataFrame used only to decide which
                numeric candidates are usable. When omitted, the CSV at
                ``data_train_path`` is read.

        Returns:
            The unfitted ``ColumnTransformer``.

        Raises:
            Exception: Any error from reading the CSV or building the
                pipelines propagates to the caller.
        """
        if data is None:
            data = pd.read_csv(self.data_transformation_config.data_train_path)

        numeric_candidates = ["Dep_Time", "Arrival_Time", "Duration"]
        categorical_columns = [
            'Airline', 'Source',
            'Destination', 'Route',
            'Total_Stops', 'Additional_Info'
        ]

        # Keep only candidates that exist in the data and are not stored as
        # object (string) columns; the median imputer cannot handle those.
        numerical_columns = [
            col for col in numeric_candidates
            if col in data.columns and data[col].dtype != object
        ]

        num_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler())
            ]
        )

        cat_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
                # Categories unseen during fit are encoded as all-zeros
                # instead of raising when the pickled object is reused.
                ("encoder", OneHotEncoder(handle_unknown="ignore"))
            ]
        )

        return ColumnTransformer(
            transformers=[
                ("numeric", num_pipeline, numerical_columns),
                ("categorical", cat_pipeline, categorical_columns)
            ]
        )

    def initiate_data_transformation(self):
        """Fit the preprocessor on the training features and pickle it.

        Reads the training CSV, drops the ``Price`` target column, fits the
        preprocessor on the remaining columns and writes the fitted object to
        ``preprocessor_path`` from the config (creating its parent directory
        when needed).

        Raises:
            Exception: Any failure is logged and re-raised so the caller sees
                it instead of a silent "success".
        """
        try:
            settings = self.data_transformation_config
            data = pd.read_csv(settings.data_train_path)

            # Reuse the frame just loaded so the CSV is not read twice.
            preprocessor = self.get_data_transformer_object(data)

            target_column_name = "Price"
            input_feature_train_df = data.drop(columns=[target_column_name])

            logging.info(
                "Applying preprocessing object on training dataframe and testing dataframe.")

            preprocessor.fit(input_feature_train_df)

            # Save where the config says, rather than under a second
            # hard-coded "artifacts/data_transformation" inside root_dir.
            output_file = settings.preprocessor_path
            output_dir = os.path.dirname(output_file)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

            with open(output_file, "wb") as file:
                pickle.dump(preprocessor, file)

            print("Data transformation completed and saved as a pickle file.")

        except Exception:
            logging.exception("Error in initiate_data_transformation")
            raise



In [27]:
# Run the data-transformation stage end to end. Exceptions are left to
# propagate unchanged, so a failure stops the cell with its own traceback.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(data_transformation_config)
data_transformation.initiate_data_transformation()


[2023-07-11 06:33:50,067: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-07-11 06:33:50,069: INFO: common: yaml file: params.yaml loaded successfully]
[2023-07-11 06:33:50,071: INFO: common: created directory at: artifacts]
[2023-07-11 06:33:50,072: INFO: common: created directory at: artifacts/data_transformation]
[2023-07-11 06:33:50,154: INFO: 1338166216: Applying preprocessing object on training dataframe and testing dataframe.]
Data transformation completed and saved as a pickle file.
