In [None]:
## Update the config.yaml

data_transformation:
  root_dir : artifacts/data_transformation
  data_path : artifacts/data_ingestion/winequality-red.csv

In [None]:
## Update the entity --- config.entity
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataTransformationConfig:
    """Paths consumed by the data transformation stage.

    Attributes:
        root_dir: Directory the stage writes its output files into.
        data_path: Location of the CSV file the stage reads.
    """

    root_dir: Path
    data_path: Path

In [None]:
## Update Configuration manager --- configuration.py


from src.end_to_end_ML_project_1.constants import *
from src.end_to_end_ML_project_1.utils.common import read_yaml,create_directories

class ConfigurationManager:
    """Load the project's YAML files and build per-stage config objects.

    On construction the config, params and schema files are read and the
    top-level artifacts directory named in the config is created.
    """

    def __init__(self, config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main config YAML.
            params_filepath: Path of the params YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    ## Perform the data transformation

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data transformation config and create its output directory.

        Returns:
            A DataTransformationConfig populated from the
            ``data_transformation`` section of the config file.
        """
        config = self.config.data_transformation
        create_directories([config.root_dir])
        # The dataclass declares both fields as Path, so convert the raw
        # config values instead of passing them through unchanged.
        return DataTransformationConfig(
            root_dir=Path(config.root_dir),
            data_path=Path(config.data_path),
        )

In [None]:
## Update the Components --> data_transformation.py

import os
from src.end_to_end_ML_project_1 import logger
from sklearn.model_selection import train_test_split
import pandas as pd


class DataTransformation:
    """Split the ingested dataset into training and test CSV files.

    Different data transformation techniques can be added here, such as
    scaling or PCA, and different kinds of EDA in the ML cycle can also be
    performed here before passing the data to the model. This data does not
    go through the whole data transformation process because the aim is to
    demonstrate the end-to-end machine learning ETL pipeline; the data is
    already clean and can be used for modeling.
    """

    def __init__(self, config: DataTransformationConfig):
        """Store the config holding the input CSV path and output directory."""
        self.config = config

    def train_test_splitting(self, *, test_size=None, random_state=None):
        """Read the input CSV, split it, and write both parts to ``root_dir``.

        Writes ``train.csv`` and ``test.csv`` (without the index column) into
        ``self.config.root_dir`` and logs the shape of each split.

        Args:
            test_size: Forwarded to ``train_test_split``; ``None`` keeps
                scikit-learn's default split proportion.
            random_state: Forwarded to ``train_test_split``; pass an int to
                make the split reproducible across runs.
        """
        data = pd.read_csv(self.config.data_path)

        # Split the data into training and test datasets.
        train, test = train_test_split(
            data, test_size=test_size, random_state=random_state
        )

        # Both outputs carry a .csv extension so downstream stages can find
        # them under matching names.
        train.to_csv(os.path.join(self.config.root_dir, 'train.csv'), index=False)
        test.to_csv(os.path.join(self.config.root_dir, 'test.csv'), index=False)

        logger.info('Splitted data into training and test sets')
        logger.info(train.shape)
        logger.info(test.shape)


In [None]:
## Update the Pipeline

# Run the data transformation stage: load the configuration, build the
# component, and write the train/test split.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.train_test_splitting()
except Exception:
    # Record the failure with its traceback, then re-raise the same exception
    # with a bare ``raise`` so no extra frame is added to the traceback.
    logger.exception('Data transformation stage failed')
    raise

