In [1]:
import os
# Step up one directory so the notebook runs from the parent folder; the
# relative paths used further down are resolved against this working directory.
# NOTE(review): the path is relative, so re-running this cell moves up one more
# level each time -- restart the kernel before re-running.
os.chdir('../')
# Show the resulting working directory as a sanity check.
%pwd

'c:\\PEC-26\\MLOPS - Journey\\Projects\\End-to-End-Wine-Quality-Prediction'

In [2]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data transformation stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    """

    # Directory the stage writes its outputs (train.csv / test.csv) into.
    root_dir: Path
    # CSV file the stage reads as its input dataset.
    data_path: Path

In [3]:
from wine_quality.constants import *
from wine_quality.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Read the project's YAML settings and build per-stage config objects.

    The config, params and schema files are each loaded once when the manager
    is constructed, and the top-level artifacts directory named in the config
    file is created at the same time.
    """

    def __init__(self, config_filepath: Path = CONFIG_FILE_PATH, params_filepath: Path = PARAMS_FILE_PATH, schema_filepath: Path = SCHEMA_FILE_PATH):
        """Load the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def getDataTransformationConfig(self) -> DataTransformationConfig:
        """Build the configuration object for the data transformation stage.

        Creates the stage's root directory as a side effect.

        Returns:
            A DataTransformationConfig whose fields are ``Path`` objects.
        """
        stage = self.config.data_transformation

        create_directories([stage.root_dir])

        # The dataclass declares both fields as Path, so coerce the values
        # read from YAML instead of passing them through unchanged.
        return DataTransformationConfig(
            root_dir=Path(stage.root_dir),
            data_path=Path(stage.data_path),
        )

In [4]:
import os
from wine_quality import logger
from sklearn.model_selection import train_test_split
import pandas as pd

class DataTransformation:
    """
    Split the raw dataset into a training set and a test set and save both as CSV.

    The raw data is assumed to be already cleaned.
    NOTE: You can add more data transformation methods in this class if you need to. Such as scaling, encoding, PCA, etc.
          You can also perform all kinds of EDA in Machine Learning cycle here before passing the data to the Model.
    """
    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration (input CSV path and output directory)."""
        self.config = config

    def testTrainSplitting(self, test_size: float = 0.2, random_state: int = 42):
        """Read the input CSV, split it, and write train.csv / test.csv.

        Args:
            test_size: Forwarded to ``train_test_split``; the default of 0.2
                matches the previously hard-coded value.
            random_state: Seed forwarded to ``train_test_split`` so the split
                is reproducible; the default of 42 matches the previous value.

        Both output files are written into ``self.config.root_dir`` without the
        DataFrame index. The resulting shapes are logged and printed.
        """
        data = pd.read_csv(self.config.data_path)

        # Split the data into test and train
        train, test = train_test_split(data, test_size=test_size, random_state=random_state)

        train.to_csv(os.path.join(self.config.root_dir, 'train.csv'), index=False)
        test.to_csv(os.path.join(self.config.root_dir, 'test.csv'), index=False)

        logger.info("Data Transformation: Test Train Splitting is done.")
        logger.info(f"Train shape: {train.shape} \nTest shape: {test.shape}")

        print(train.shape)
        print(test.shape)

In [5]:
# Run the data transformation stage end to end: load configuration, build the
# stage object, and write the train/test CSV files.
try:
    config = ConfigurationManager()
    data_transformation_config = config.getDataTransformationConfig()
    data_transformation = DataTransformation(data_transformation_config)
    data_transformation.testTrainSplitting()
except Exception:
    # Record the failure together with its traceback, then re-raise the same
    # exception unchanged (a bare `raise` keeps the original traceback).
    # NOTE(review): assumes `logger` is a standard logging.Logger -- confirm.
    logger.exception("Data Transformation stage failed.")
    raise

[2025-01-09 11:40:44,335: INFO: common yaml file: config\config.yaml loaded successfully]
[2025-01-09 11:40:44,336: INFO: common yaml file: params.yaml loaded successfully]
[2025-01-09 11:40:44,339: INFO: common yaml file: schema.yaml loaded successfully]
[2025-01-09 11:40:44,340: INFO: common created directory at: artifacts]
[2025-01-09 11:40:44,341: INFO: common created directory at: artifacts/data_transformation]
[2025-01-09 11:40:44,362: INFO: 3799609105 Data Transformation: Test Train Splitting is done.]
[2025-01-09 11:40:44,362: INFO: 3799609105 Train shape: (1279, 12) 
Test shape: (320, 12)]
(1279, 12)
(320, 12)
