In [2]:
import os

In [1]:
# Show the kernel's current working directory before changing it.
%pwd

'd:\\ML_Projects\\Resume Project\\Project_6_insurance_claim_prediction\\reasearch'

In [3]:
# Move up to the project root, but only while the kernel is still inside the
# notebook folder: an unconditional chdir climbs one level higher every time
# this cell is re-run, which breaks the relative paths and imports used later.
# NOTE(review): assumes the project root is the directory that contains `src/`
# (the package imported further down) — confirm.
if not os.path.isdir("src"):
    os.chdir("../")

In [4]:
# Confirm the working directory after the chdir above.
%pwd

'd:\\ML_Projects\\Resume Project\\Project_6_insurance_claim_prediction'

In [None]:
# Prepare Entity class
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage."""

    # Directory the transformation stage writes its output files into.
    root_dir: Path
    # Location of the CSV file the transformation stage reads as input.
    data_path: Path
  
    

In [15]:
# Prepare configuration manager class
from src.mlproject.constants import *
from src.mlproject.utils.common import read_yaml,create_directories

In [None]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects.

    The loaded documents are kept on ``self.config``, ``self.params`` and
    ``self.schema``.
    """

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH,
                 schema_filepath = SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the settings object for the data-transformation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            DataTransformationConfig populated from the ``data_transformation``
            section of the main configuration.
        """
        stage_config = self.config.data_transformation

        create_directories([stage_config.root_dir])

        # The dataclass declares both fields as Path, while the YAML loader
        # hands back plain values; coerce so the declared types actually hold.
        return DataTransformationConfig(
            root_dir=Path(stage_config.root_dir),
            data_path=Path(stage_config.data_path),
        )
        

In [17]:

import os
from src.mlproject.logging import logger
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
import numpy as np

import joblib


In [35]:
class DataTransformation:
    """Splits the raw dataset and applies feature preprocessing.

    All files are written into ``config.root_dir``; the input CSV is read
    from ``config.data_path``.
    """

    def __init__(self,config: DataTransformationConfig):
        self.config = config

    # Note: further transformation techniques (scaling, PCA, and so on) can be
    # added here; any preparation the model needs belongs in this stage.

    def train_test_splitting(self, test_size=0.25, random_state=42):
        """Split the input CSV into train and test sets and save both.

        Args:
            test_size: Share of rows assigned to the test set. The default
                of 0.25 matches scikit-learn's default for a bare call.
            random_state: Seed for the shuffle, so repeated runs produce the
                same split. Pass ``None`` for a different split on every run.

        Returns:
            Tuple ``(train, test)`` of DataFrames. The same frames are written
            to ``train.csv`` and ``test.csv`` under ``config.root_dir``.
        """
        data = pd.read_csv(self.config.data_path)

        train, test = train_test_split(
            data, test_size=test_size, random_state=random_state
        )

        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        logger.info("Splitted data into training and test sets")
        logger.info(train.shape)
        logger.info(test.shape)

        return train, test

    def get_data_transformer_object(self):
        """Build the (unfitted) preprocessing transformer.

        The ``sex``, ``smoker`` and ``region`` columns are one-hot encoded;
        every other input column is passed through unchanged.

        Returns:
            A ``ColumnTransformer`` ready to be fitted.
        """
        categorical_columns = ["sex", "smoker", "region"]

        cat_pipeline = Pipeline(
            steps=[
                # handle_unknown="ignore": a category that was absent from the
                # fitted data is encoded as all zeros instead of raising at
                # transform time.
                ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore")),
            ]
        )

        preprocessor = ColumnTransformer(
            [
                ("cat_pipeline", cat_pipeline, categorical_columns),
            ],
            remainder="passthrough",
        )

        logger.info("Preprocessor object created")

        return preprocessor

    def initiate_data_transformation(self, train_data, test_data,
                                     target_column_name="charges"):
        """Fit the preprocessor on the train set and transform both sets.

        Writes ``transformed_train.csv``, ``transformed_test.csv`` and
        ``preprocessor.joblib`` into ``config.root_dir``.

        Args:
            train_data: DataFrame holding the training rows, target included.
            test_data: DataFrame holding the test rows, target included.
            target_column_name: Name of the column to predict.

        Returns:
            Tuple ``(train_arr, test_arr)`` of arrays in which the transformed
            features come first and the target is the last column.

        Raises:
            KeyError: If ``target_column_name`` is missing from either frame.
        """
        preprocessor_obj = self.get_data_transformer_object()

        input_feature_train_df = train_data.drop(columns=[target_column_name])
        target_feature_train_df = train_data[target_column_name]

        input_feature_test_df = test_data.drop(columns=[target_column_name])
        target_feature_test_df = test_data[target_column_name]

        logger.info(
            "Applying preprocessing object on training dataframe and testing dataframe"
        )

        # Fit on the training features only; the test set is merely transformed
        # so nothing about it leaks into the fitted encoder.
        input_feature_train_arr = preprocessor_obj.fit_transform(input_feature_train_df)
        input_feature_test_arr = preprocessor_obj.transform(input_feature_test_df)

        # Append the target as the final column of each array.
        train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]
        test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

        # The frames are built from bare arrays, so the CSV headers are the
        # integer column positions rather than feature names.
        transformed_train = pd.DataFrame(train_arr)
        transformed_test = pd.DataFrame(test_arr)

        transformed_train.to_csv(
            os.path.join(self.config.root_dir, "transformed_train.csv"), index=False
        )
        transformed_test.to_csv(
            os.path.join(self.config.root_dir, "transformed_test.csv"), index=False
        )

        logger.info("Saved train and test data.")

        joblib.dump(
            preprocessor_obj, os.path.join(self.config.root_dir, "preprocessor.joblib")
        )
        logger.info("Saved Preprocessing objects.")

        return train_arr, test_arr




In [36]:
# Run the data-transformation stage end to end: load the configuration, split
# the raw data, then fit and apply the preprocessor. Errors propagate unchanged,
# so no try/except wrapper is needed.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
train, test = data_transformation.train_test_splitting()
# Bind the result so the notebook does not echo the full arrays as cell output.
train_arr, test_arr = data_transformation.initiate_data_transformation(
    train_data=train, test_data=test
)
    

[2025-04-10 00:04:10,618: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-04-10 00:04:10,621: INFO: common: yaml file: params.yaml loaded successfully]
[2025-04-10 00:04:10,624: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-04-10 00:04:10,626: INFO: common: Created directory at: artifacts]
[2025-04-10 00:04:10,628: INFO: common: Created directory at: artifacts/data_transformation]
[2025-04-10 00:04:10,656: INFO: 2760259681: Splitted data into training and test sets]
[2025-04-10 00:04:10,658: INFO: 2760259681: (2079, 7)]
[2025-04-10 00:04:10,660: INFO: 2760259681: (693, 7)]
(2079, 7)
(693, 7)
[2025-04-10 00:04:10,661: INFO: 2760259681: Preprocessor object created]
[2025-04-10 00:04:10,664: INFO: 2760259681: Applying preprocessing object on training dataframe and testing dataframe]
[2025-04-10 00:04:10,715: INFO: 2760259681: Saved train and test data.]
[2025-04-10 00:04:10,715: INFO: 2760259681: Saved Preprocessing objects.]
