In [1]:
import os

In [4]:
pwd

'c:\\Users\\user\\Desktop\\End-to end- ML-project-with-ML-flow\\End-to-end-Machine-Learning-Project-with-Mlflow'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    The dataclass is frozen, so fields cannot be reassigned after
    construction. The ``Path`` annotations are not enforced at runtime.
    """

    # Directory that belongs to the transformation stage.
    root_dir: Path
    # Location of the dataset the stage reads.
    data_path: Path
    

In [6]:
from Mlproject.constants import *
from Mlproject.utils.common import read_yaml, create_directories

In [7]:
import logging
from pathlib import Path
from box import ConfigBox

# Ask the root logger for INFO-and-above records in a timestamped format.
# NOTE(review): basicConfig is a no-op when the root logger already has
# handlers; the log lines recorded in this notebook use a different format,
# so logging may already be configured elsewhere - confirm.
logging.basicConfig(
    format='[%(asctime)s]: %(message)s:',
    level=logging.INFO,
)

# Relative locations of the YAML files used as the default arguments of
# ConfigurationManager.
CONFIG_FILE_PATH = Path("config/config.yaml")
PARAMS_FILE_PATH = Path("params.yaml")
SCHEMA_FILE_PATH = Path("schema.yaml")

In [8]:
class ConfigurationManager:
    """Load the project's YAML files and build stage configuration objects.

    Each YAML file is read with ``read_yaml`` and wrapped in a ``ConfigBox``
    so its keys can be reached with attribute access.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH
    ):
        """Read the config, params and schema files (in that order).

        After loading, ``create_directories`` is called with the
        ``artifacts_root`` entry of the main config.
        """
        self.config, self.params, self.schema = (
            ConfigBox(read_yaml(yaml_path))
            for yaml_path in (config_filepath, params_filepath, schema_filepath)
        )

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the ``DataTransformationConfig`` from the loaded config.

        Reads the ``data_transformation`` section, passes its ``root_dir`` to
        ``create_directories`` and returns the section's two paths wrapped in
        ``Path`` objects.

        Raises:
            Exception: whatever the section lookup raised; it is logged
                together with the available top-level keys and re-raised.
        """
        try:
            config = self.config.data_transformation
            logging.info(f"Data transformation config: {config}")
        except Exception as e:
            # Log what is available to make a missing section easy to spot,
            # then let the original exception propagate.
            logging.error(f"Error reading data_transformation config: {str(e)}")
            logging.error(f"Available config keys: {self.config.keys()}")
            raise

        create_directories([config.root_dir])

        return DataTransformationConfig(
            root_dir=Path(config.root_dir),
            data_path=Path(config.data_path),
        )

In [9]:
from Mlproject.constants import *
from Mlproject.utils.common import read_yaml, create_directories
from Mlproject.entity.config_entity import DataTransformationConfig
from pathlib import Path

class ConfigurationManager:
    """Load the project's YAML files and build stage configuration objects.

    The values returned by ``read_yaml`` are stored as-is and are accessed
    with attribute syntax (for example ``self.config.artifacts_root``).
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH
    ):
        """Read the config, params and schema files (in that order).

        Afterwards ``create_directories`` is called with the main config's
        ``artifacts_root`` entry.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the ``DataTransformationConfig`` from the loaded config.

        Passes the ``root_dir`` of the ``data_transformation`` section to
        ``create_directories`` and returns the section's ``root_dir`` and
        ``data_path`` as ``Path`` objects.
        """
        section = self.config.data_transformation
        stage_dir = section.root_dir

        create_directories([stage_dir])

        return DataTransformationConfig(
            root_dir=Path(stage_dir),
            data_path=Path(section.data_path),
        )

In [10]:
def update_config():
    """Ensure the config file at CONFIG_FILE_PATH has a data_transformation section.

    If the section is missing it is added with the default ``root_dir`` and
    ``data_path`` values and the file is written back. If the section is
    already present the file is left untouched, so its formatting is not
    re-serialised for nothing.

    This is a best-effort helper: any exception is reported with ``print``
    rather than raised, and the function always returns ``None``.
    """
    try:
        # Imported locally so the function works no matter which cells
        # (or import guards) have been executed before it is called.
        import yaml

        # Read existing config; safe_load yields None for an empty file,
        # which is treated here as an empty mapping.
        with open(CONFIG_FILE_PATH, 'r') as f:
            config = yaml.safe_load(f) or {}

        # Nothing to do when the section already exists.
        if 'data_transformation' in config:
            print("Config file already contains a data_transformation section; nothing to update.")
            return

        config['data_transformation'] = {
            'root_dir': 'artifacts/data_transformation',
            'data_path': 'artifacts/data_ingestion/winequality-red.csv'
        }

        # Write updated config
        with open(CONFIG_FILE_PATH, 'w') as f:
            yaml.dump(config, f, default_flow_style=False)

        print("Config file updated successfully!")

    except Exception as e:
        print(f"Error updating config: {str(e)}")

# Run this to update your config file.
# The guard passes when this code runs as the main module; the output recorded
# for this notebook cell shows that update_config() was executed.
if __name__ == "__main__":
    # Make PyYAML available at module level before update_config() is called.
    import yaml
    update_config()

Config file updated successfully!


In [11]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import logging
from Mlproject.entity.config_entity import DataTransformationConfig



In [12]:
class DataTransformation:
    """Split the configured dataset into training and test CSV files."""

    def __init__(self, config: DataTransformationConfig):
        """Keep the stage configuration (``data_path`` and ``root_dir`` are used)."""
        self.config = config

    def train_test_spliting(self, test_size: float = 0.2, random_state: int = 42):
        """Read the dataset, split it, and save both parts under ``root_dir``.

        The CSV at ``config.data_path`` is loaded with pandas and split with
        ``train_test_split``. The parts are written, without the index
        column, to ``train.csv`` and ``test.csv`` inside ``config.root_dir``.
        Their shapes are logged and printed.

        Args:
            test_size: forwarded to ``train_test_split``; the default 0.2
                matches the value that was previously hard-coded.
            random_state: forwarded to ``train_test_split``; the default 42
                matches the value that was previously hard-coded.

        Raises:
            Exception: any error raised while reading, splitting or writing
                is logged and re-raised unchanged.
        """
        try:
            # Read the data
            data = pd.read_csv(self.config.data_path)

            # Split the data into training and test sets
            train, test = train_test_split(
                data, test_size=test_size, random_state=random_state
            )

            # Save the split datasets
            train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
            test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

            # Log the information
            logging.info("Split data into training and test sets")
            logging.info(f"Training set shape: {train.shape}")
            logging.info(f"Test set shape: {test.shape}")

            # Print shapes
            print(f"Training set shape: {train.shape}")
            print(f"Test set shape: {test.shape}")

        except Exception as e:
            logging.error(f"Error in train_test_splitting: {str(e)}")
            # Bare raise re-raises the active exception as-is.
            raise

In [13]:
class DataTransformationTrainingPipeline:
    """Run the data-transformation stage once data validation has passed."""

    def __init__(self, status_file=Path("artifacts/data_validation/status.txt")):
        """Remember which file holds the validation status.

        Args:
            status_file: path of the validation status file; defaults to the
                location that was previously hard-coded in ``main``.
        """
        self.status_file = Path(status_file)

    def main(self):
        """Check the validation status and, if it is ``True``, split the data.

        The last whitespace-separated token of the status file is taken as
        the status, so a trailing newline does not invalidate a passing
        status. An empty file counts as not valid.

        Raises:
            Exception: with the message "Your data schema is not valid" when
                the status is anything other than ``True``. Errors from
                reading the file or from the transformation propagate
                unchanged.
        """
        # Check validation status
        with open(self.status_file, "r") as f:
            tokens = f.read().split()
        status = tokens[-1] if tokens else ""

        if status != "True":
            raise Exception("Your data schema is not valid")

        # Initialize configuration and transformation
        config = ConfigurationManager()
        data_transformation_config = config.get_data_transformation_config()
        data_transformation = DataTransformation(config=data_transformation_config)
        data_transformation.train_test_spliting()

In [14]:
# Run the data-transformation stage: load the configuration, build the stage
# config, then split the dataset. Errors propagate unchanged; the former
# try/except only re-raised them, so it added nothing.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.train_test_spliting()



[2025-02-26 01:32:56,825: INFO: common: YAML file: config\config.yaml loaded successfully]
[2025-02-26 01:32:56,829: INFO: common: YAML file: params.yaml loaded successfully]
[2025-02-26 01:32:56,850: INFO: common: YAML file: schema.yaml loaded successfully]
[2025-02-26 01:32:56,860: INFO: common: Created directory at: artifacts]
[2025-02-26 01:32:56,865: INFO: common: Created directory at: artifacts/data_transformation]
[2025-02-26 01:32:57,301: INFO: 1281790633: Split data into training and test sets]
[2025-02-26 01:32:57,309: INFO: 1281790633: Training set shape: (1279, 12)]
[2025-02-26 01:32:57,309: INFO: 1281790633: Test set shape: (320, 12)]
Training set shape: (1279, 12)
Test set shape: (320, 12)
