In [2]:
import os

In [1]:
%pwd

'a:\\Machine-Learning-Project-with-MLflow\\research'

In [3]:
# Move from the notebook folder (`research`, per the `%pwd` output above) up to
# the project root so that project-relative paths resolve from there.
# The guard makes this cell idempotent: re-running it will not climb a second
# level once the working directory is no longer `research`.
if os.path.basename(os.getcwd()) == 'research':
    os.chdir('../')

In [4]:
%pwd

'a:\\Machine-Learning-Project-with-MLflow'

In [5]:
from dataclasses import dataclass
from pathlib import Path

In [21]:
@dataclass(frozen=True)
class DataTransformationConfig:
    """Read-only settings consumed by the data-transformation stage.

    Instances are frozen, so fields cannot be reassigned after construction.

    Attributes:
        root_dir: Directory that holds this stage's output files.
        data_path: Location of the input CSV to be transformed.
    """

    root_dir: Path
    data_path: Path

In [8]:
from ml_project.constants import *
from ml_project.utils.common import read_yaml, create_directories

In [22]:
class ConfigrationManger:
    """Loads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_file_path=CONFIG_FILE_PATH,
        params_file_path=PARAMS_FILE_PATH,
        schema_file_path=SCHEMA_FILE_PATH,
    ):
        """Read the three YAML files and make sure the artifacts root exists.

        Args:
            config_file_path: Path of the main configuration YAML.
            params_file_path: Path of the parameters YAML.
            schema_file_path: Path of the schema YAML.
        """
        # The loaded objects are accessed by attribute below, so `read_yaml`
        # is presumably returning an attribute-style mapping -- verify in
        # ml_project.utils.common.
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)
        self.schema = read_yaml(schema_file_path)

        # Top-level artifacts folder shared by the pipeline stages.
        create_directories([self.config.artifacts_root_dir])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings from the loaded config.

        Creates the stage's root directory as a side effect.

        Returns:
            A DataTransformationConfig whose fields are converted to Path.
        """
        stage_cfg = self.config.data_transformation
        create_directories([stage_cfg.root_dir])

        return DataTransformationConfig(
            root_dir=Path(stage_cfg.root_dir),
            data_path=Path(stage_cfg.data_path),
        )
        



In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from ml_project import logger

In [None]:
class DataTransformation:
    """Data-transformation stage; currently it only splits the dataset."""

    def __init__(self, config: DataTransformationConfig):
        """Keep the stage configuration (input CSV path and output directory)."""
        self.config = config

    # Further transformation techniques (normalization, encoding, scaling,
    # PCA, etc.) can be implemented here. For now the data is read and used
    # as-is because it is already in a good format.

    def train_test_split(self, test_size: float = 0.25, random_state: int = 42):
        """Split the input CSV into train/test sets and save both as CSV files.

        Reads `config.data_path`, splits it, and writes `train.csv` and
        `test.csv` (without the index column) into `config.root_dir`.

        Args:
            test_size: Forwarded to sklearn's `train_test_split`. The default
                of 0.25 keeps 75% of the rows for training.
            random_state: Seed forwarded to sklearn so the split is
                reproducible between runs.

        Returns:
            None. The results are the two files on disk and three log lines.
        """
        data = pd.read_csv(self.config.data_path)

        # Inside a method body the bare name `train_test_split` resolves to the
        # module-level sklearn function, not to this method: class attributes
        # are not part of a function's name-lookup scope.
        train, test = train_test_split(
            data, test_size=test_size, random_state=random_state
        )

        train_path = os.path.join(self.config.root_dir, 'train.csv')
        test_path = os.path.join(self.config.root_dir, 'test.csv')
        train.to_csv(train_path, index=False)
        test.to_csv(test_path, index=False)

        logger.info(f"Train and test data saved at {self.config.root_dir}")
        logger.info(f"Train data shape: {train.shape}")
        logger.info(f"Test data shape: {test.shape}")

        


In [24]:
# Run the data-transformation stage end to end: load the configuration, build
# the stage object, then split the data and write the train/test CSV files.
try:
    config_manager = ConfigrationManger()
    data_transformation_config = config_manager.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.train_test_split()
except Exception as e:
    # Log the failure first (assuming `logger` is a standard logging.Logger,
    # `exception` also records the active traceback), then let it propagate.
    logger.exception(f"An error occurred during data transformation: {e}")
    # A bare `raise` re-raises the active exception as-is, which is the
    # idiomatic way to propagate after logging.
    raise

[2025-07-18 06:34:57,309 : INFO: yaml file: config\config.yaml loaded successfully]
[2025-07-18 06:34:57,310 : INFO: yaml file: params.yaml loaded successfully]
[2025-07-18 06:34:57,313 : INFO: yaml file: schema.yaml loaded successfully]
[2025-07-18 06:34:57,313 : INFO: created directory at: artifacts]
[2025-07-18 06:34:57,318 : INFO: created directory at: artifacts/data_transformation]
[2025-07-18 06:34:57,367 : INFO: Train and test data saved at artifacts\data_transformation]
[2025-07-18 06:34:57,369 : INFO: Train data shape: (1199, 12)]
[2025-07-18 06:34:57,370 : INFO: Test data shape: (400, 12)]
