In [1]:
import os

In [2]:
%pwd

'c:\\Users\\ainao\\Downloads\\Projects\\Bulldozer-Prediction\\research'

In [3]:
# The notebook is stored in <project>/research, but config and artifact paths
# are resolved relative to the project root. Only step up when we are still
# inside "research" so that re-running this cell (or starting the kernel at the
# project root) does not climb one directory too far.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\ainao\\Downloads\\Projects\\Bulldozer-Prediction'

In [5]:
from dataclasses import dataclass
from pathlib import Path
@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable set of filesystem locations used by the data-transformation stage."""

    # Working directory of the stage (created by ConfigurationManager).
    root_dir: Path
    # CSV files read as the raw train / test inputs.
    train_path: Path
    test_path: Path
    # Not referenced in the visible code; presumably output locations for the
    # transformed train / test sets -- TODO confirm against config.yaml.
    train_data: Path
    test_data: Path
    # File the fitted preprocessor object is saved to.
    preprocessor: Path


In [6]:
from bullprediction.utils.common import create_directories, read_yaml
from bullprediction.constants import *

In [7]:
class ConfigurationManager:
    """Loads the project YAML configuration and hands out per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH):
        """Read the YAML config and make sure the artifacts root directory exists.

        Args:
            config_filepath: Location of the YAML configuration file; defaults
                to the project-wide CONFIG_FILE_PATH constant.
        """
        self.config = read_yaml(config_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation(self) -> DataTransformationConfig:
        """Build the configuration for the data-transformation stage.

        Ensures the stage's ``root_dir`` exists, then copies the entries of the
        ``data_transformation`` section into a DataTransformationConfig.

        Returns:
            DataTransformationConfig whose fields are ``pathlib.Path`` objects.
        """
        section = self.config.data_transformation

        create_directories([section.root_dir])

        # The dataclass declares every field as Path, while the YAML loader
        # yields plain values; convert here so the annotations hold at runtime.
        return DataTransformationConfig(
            root_dir=Path(section.root_dir),
            train_path=Path(section.train_path),
            test_path=Path(section.test_path),
            train_data=Path(section.train_data),
            test_data=Path(section.test_data),
            preprocessor=Path(section.preprocessor),
        )

In [8]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
import logging
from bullprediction.utils.common import save_object



In [9]:
class DataTransformation:
    """Splits the training data, fits the feature preprocessor and saves it."""

    def __init__(self,config: DataTransformationConfig):
        """Store the stage configuration (input CSV paths and preprocessor path)."""
        self.config = config

    def build_preprocessor(self, df: pd.DataFrame):
        """Create an unfitted ColumnTransformer matching the columns of ``df``.

        Args:
            df: Feature frame used only to discover column names and dtypes.
                A ``SalePrice`` column, if present, is ignored.

        Returns:
            ColumnTransformer that median-imputes int64/float64 columns and, for
            object/category columns, fills missing values with ``"missing"``
            and ordinal-encodes them (unseen categories become -1).
        """
        features = df.drop(columns=["SalePrice"], errors="ignore")

        # Identify numerical and categorical columns.
        # NOTE: columns of any other dtype (e.g. bool, datetime, int32) land in
        # neither list and are discarded by ColumnTransformer's default
        # remainder="drop".
        num_cols = features.select_dtypes(include=["int64", "float64"]).columns.tolist()
        cat_cols = features.select_dtypes(include=["object", "category"]).columns.tolist()

        # Define transformers
        num_transformer = SimpleImputer(strategy="median")
        cat_transformer = Pipeline([
            ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
            ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
        ])

        # Build column transformer
        return ColumnTransformer([
            ("num", num_transformer, num_cols),
            ("cat", cat_transformer, cat_cols),
        ])

    def initiate_data_transformation_and_split(self, valid_year: int = 2012, sample_size=100):
        """Split train/validation by sale year, fit the preprocessor and save it.

        Rows of the training CSV whose ``saleYear`` equals ``valid_year`` form
        the validation set; all remaining rows form the training set. The
        preprocessor is fitted on the (optionally truncated) training features
        only and then applied to the validation and test features.

        Args:
            valid_year: Value of ``saleYear`` that marks validation rows.
            sample_size: Number of leading rows of each split to process.
                The default of 100 keeps the quick-experiment behaviour; pass
                ``None`` to fit and transform the complete data.

        Returns:
            Tuple ``(train_features, valid_features, test_features,
            train_target, valid_target, preprocessor_path)`` where the feature
            entries are the ColumnTransformer outputs, the targets are the
            ``SalePrice`` series, and ``preprocessor_path`` is the location the
            fitted preprocessor was saved to.

        Raises:
            ValueError: If ``sample_size`` is given but not a positive number.
        """
        if sample_size is not None and sample_size <= 0:
            raise ValueError("sample_size must be a positive integer or None")

        train_df = pd.read_csv(self.config.train_path, low_memory=False)
        test_df = pd.read_csv(self.config.test_path, low_memory=False)

        valid_data = train_df[train_df.saleYear == valid_year]
        train_data = train_df[train_df.saleYear != valid_year]

        target_column_name = 'SalePrice'

        # ``iloc[:None]`` selects every row, so one code path serves both the
        # sampled and the full-data case.
        rows = slice(None, sample_size)

        features_train = train_data.drop(columns=[target_column_name]).iloc[rows]
        features_valid = valid_data.drop(columns=[target_column_name]).iloc[rows]
        target_train = train_data[target_column_name].iloc[rows]
        target_valid = valid_data[target_column_name].iloc[rows]
        # The test frame is used as-is: no target column is removed from it.
        features_test = test_df.iloc[rows]

        logging.info("Applying preprocessing pipeline to train and test data.")

        # Fit on training features only; validation and test are transformed
        # with the already-fitted preprocessor.
        preprocessing_obj = self.build_preprocessor(features_train)
        train_processed = preprocessing_obj.fit_transform(features_train)
        valid_processed = preprocessing_obj.transform(features_valid)
        test_processed = preprocessing_obj.transform(features_test)

        save_object(
            file_path=self.config.preprocessor,
            obj=preprocessing_obj
        )

        return (train_processed, valid_processed,
                test_processed, target_train,
                target_valid, self.config.preprocessor)

    

In [10]:
# Run the data-transformation stage end to end. Any exception propagates
# unchanged; the former `except Exception as e: raise e` wrapper added nothing.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation()
data_transformation = DataTransformation(config=data_transformation_config)
# Trailing semicolon keeps the notebook from echoing the returned arrays.
data_transformation.initiate_data_transformation_and_split();

[2025-06-28 14:30:20,907: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-06-28 14:30:20,911: INFO: common: created directory at: artifacts]
[2025-06-28 14:30:20,914: INFO: common: created directory at: artifacts/data_transformation]
[2025-06-28 14:30:23,964: INFO: 2144867838: Applying preprocessing pipeline to train and test data.]


