In [1]:
import os

In [2]:
%pwd

'a:\\mlflow\\Heart-Failure-Prediction-with-Machine-Learning-MLflow\\research'

In [3]:
# Step up to the project root only while still inside research/, so that
# re-running this cell (or starting the kernel at the root) does not climb
# one directory too far and break the relative config paths used below.
if os.path.basename(os.getcwd()) == 'research':
    os.chdir('../')

In [4]:
%pwd

'a:\\mlflow\\Heart-Failure-Prediction-with-Machine-Learning-MLflow'

In [5]:
from dataclasses import dataclass
from pathlib import Path

In [6]:
@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage."""
    # Directory that receives the generated train.csv and test.csv files.
    root_dir: Path
    # Location of the input CSV file that is loaded for preprocessing.
    data_path: Path

In [7]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [None]:
class ConfigrationManger:
    """Load the project's YAML files and build per-stage configuration objects."""

    def __init__(
        self,
        config_file_path = CONFIG_FILE_PATH,
        params_file_path = PARAMS_FILE_PATH,
        schema_file_path = SCHEMA_FILE_PATH,
    ):
        """Read the config, params and schema YAML files, then create the artifacts root.

        Args:
            config_file_path: YAML file loaded into ``self.config``.
            params_file_path: YAML file loaded into ``self.params``.
            schema_file_path: YAML file loaded into ``self.schema``.
        """
        yaml_sources = (
            ("config", config_file_path),
            ("params", params_file_path),
            ("schema", schema_file_path),
        )
        # Files are read, and attributes assigned, in the order listed above.
        for attribute, yaml_path in yaml_sources:
            setattr(self, attribute, read_yaml(yaml_path))

        artifacts_root = self.config.artifacts_root_dir
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings and create the stage directory.

        Returns:
            A ``DataTransformationConfig`` whose paths come from the
            ``data_transformation`` section of the loaded config.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=Path(section.root_dir),
            data_path=Path(section.data_path),
        )
        



In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from mlProject import logger

In [None]:
class DataTransformation:
    """Preprocess the input CSV and split it into train and test files.

    Further transformation steps (normalization, encoding, scaling, PCA, ...)
    can be added to this class.
    """

    # Integer code assigned to each label of every categorical column.
    # Kept as a table so all columns share one encoding and validation path.
    _CATEGORY_ENCODINGS = {
        'Sex': {'M': 1, 'F': 0},
        'ExerciseAngina': {'Y': 1, 'N': 0},
        'ChestPainType': {'ATA': 0, 'NAP': 1, 'ASY': 2, 'TA': 3},
        'RestingECG': {'Normal': 0, 'ST': 1, 'LVH': 2},
        'ST_Slope': {'Up': 0, 'Flat': 1, 'Down': 2},
    }

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration (input CSV path and output directory)."""
        self.config = config

    def data_preprocessing(self) -> pd.DataFrame:
        """Load the CSV, drop incomplete and duplicate rows, and encode categoricals.

        Each column listed in ``_CATEGORY_ENCODINGS`` is replaced by its integer
        code. A label that has no entry in the table is encoded as NaN (the
        behaviour of ``Series.map`` with a dict); when that happens a warning
        naming the column and the offending labels is logged so the problem
        does not pass silently into the train/test files.

        Returns:
            The cleaned and encoded DataFrame.

        Raises:
            KeyError: If an expected categorical column is missing from the CSV.
        """
        logger.info("Starting data preprocessing...")

        # Load the data, then remove rows with missing values and exact duplicates.
        data = pd.read_csv(self.config.data_path).dropna().drop_duplicates()

        encoded_columns = {}
        for column, encoding in self._CATEGORY_ENCODINGS.items():
            encoded = data[column].map(encoding)
            # Rows with missing values were dropped above, so any NaN here
            # comes from a label that is absent from the encoding table.
            unmapped = data.loc[encoded.isna(), column].unique().tolist()
            if unmapped:
                logger.warning(
                    f"Column '{column}' contains labels without an encoding: "
                    f"{unmapped}; they are encoded as NaN"
                )
            encoded_columns[column] = encoded

        # assign() replaces the columns in place positionally, keeping column order.
        return data.assign(**encoded_columns)

    def train_test_split(
        self,
        data: pd.DataFrame,
        test_size: float = 0.25,
        random_state: int = 42,
    ) -> None:
        """Split ``data`` and write ``train.csv`` / ``test.csv`` into ``root_dir``.

        Args:
            data: Frame to split.
            test_size: Share of rows held out for testing; the default keeps
                the 75% train / 25% test split.
            random_state: Seed for the shuffle, so the split is reproducible.
        """
        # The bare name below resolves to the module-level scikit-learn
        # function, not to this method: method bodies do not see class scope.
        train, test = train_test_split(
            data, test_size=test_size, random_state=random_state
        )

        train.to_csv(os.path.join(self.config.root_dir, 'train.csv'), index=False)
        test.to_csv(os.path.join(self.config.root_dir, 'test.csv'), index=False)

        logger.info(f"Train and test data saved at {self.config.root_dir}")
        logger.info(f"Train data shape: {train.shape}")
        logger.info(f"Test data shape: {test.shape}")

        


In [11]:
# Run the stage end to end: load config, preprocess the CSV, write the splits.
try:
    config_manager = ConfigrationManger()
    data_transformation_config = config_manager.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    processed_data = data_transformation.data_preprocessing()
    data_transformation.train_test_split(processed_data)
except Exception as e:
    # logger.exception records the message together with the traceback.
    logger.exception(f"An error occurred during data transformation: {e}")
    # Bare `raise` re-raises the exception currently being handled, unchanged.
    raise

[2025-07-21 20:12:16,846 : INFO: yaml file: config\config.yaml loaded successfully]
[2025-07-21 20:12:16,849 : INFO: yaml file: params.yaml loaded successfully]
[2025-07-21 20:12:16,854 : INFO: yaml file: schema.yaml loaded successfully]
[2025-07-21 20:12:16,857 : INFO: created directory at: artifacts]
[2025-07-21 20:12:16,857 : INFO: created directory at: artifacts/data_transformation]
[2025-07-21 20:12:16,857 : INFO: Starting data preprocessing...]
[2025-07-21 20:12:16,935 : INFO: Train and test data saved at artifacts\data_transformation]
[2025-07-21 20:12:16,935 : INFO: Train data shape: (688, 12)]
[2025-07-21 20:12:16,935 : INFO: Test data shape: (230, 12)]
