In [1]:
import os

In [2]:
%pwd

'e:\\ML_Projects\\Wine_Quality_Project\\research'

In [3]:
# Move from the research/ folder up to the project root so the relative
# "artifacts/..." and "config/..." paths used below resolve correctly.
# The guard keeps the cell idempotent: without it every re-run climbs one
# more directory and the later cells can no longer find their files.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'e:\\ML_Projects\\Wine_Quality_Project'

In [5]:
import pandas as pd

In [6]:
# Forward slashes are used on purpose: the previous backslash-separated literal
# contained the invalid escape sequences "\d" and "\w" (a warning on modern
# Python) and only resolved on Windows. This form works on every platform.
data = pd.read_csv("artifacts/data_ingestion/winequality-red.csv")
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [7]:
# This code belongs in the entity/config_entity.py file

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data transformation stage."""

    # Directory into which the stage writes its train/test CSV files.
    root_dir: Path
    # Location of the CSV file the stage reads as its input.
    data_path: Path

In [8]:
# This code belongs in the config/configuration.py file

# from mlProject.constants import *
# from mlProject.utils.common import read_yaml, create_directories

# class ConfigurationManager:
#     def __init__(
#         self,
#         config_filepath = CONFIG_FILE_PATH,
#         params_filepath = PARAMS_FILE_PATH,
#         schema_filepath = SCHEMA_FILE_PATH):
        
#         self.config = read_yaml(config_filepath)
#         self.params = read_yaml(params_filepath)
#         self.schema = read_yaml(schema_filepath)
        
#         create_directories([self.config.artifacts_root])
        
#     def get_data_transformation_config(self) -> DataTransformationConfig:
#         config = self.config.data_transformation
        
#         create_directories([config.root_dir])
        
#         data_transformation_config = DataTransformationConfig(
#             root_dir = config.root_dir,
#             data_path = config.data_path, 
#         )
        
#         return data_transformation_config
        
from mlProject.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH, SCHEMA_FILE_PATH
from mlProject.utils.common import read_yaml, create_directories
# from mlProject.entity.config_entity import DataTransformationConfig  # assuming this is the correct import

class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath: str = CONFIG_FILE_PATH,
        params_filepath: str = PARAMS_FILE_PATH,
        schema_filepath: str = SCHEMA_FILE_PATH
    ):
        """
        Read the config, params and schema YAML files and create the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # The artifacts root must exist before any stage writes below it.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """
        Build the configuration entity for the data transformation stage.

        Creates the stage's root directory as a side effect.

        Returns:
            A DataTransformationConfig whose fields are ``pathlib.Path`` objects,
            matching the types declared on the dataclass.
        """
        stage_cfg = self.config.data_transformation

        # Ensure the stage's output directory exists before it is handed out.
        create_directories([stage_cfg.root_dir])

        # The YAML values are wrapped in Path so the entity really carries the
        # Path types its annotations promise (previously raw values were passed).
        return DataTransformationConfig(
            root_dir=Path(stage_cfg.root_dir),
            data_path=Path(stage_cfg.data_path),
        )
       
         

In [None]:
# This code belongs in the component/data_transformation.py file

# import os 
# from mlProject import logger
# from sklearn.model_selection import train_test_split
# import pandas as pd

# class DataTransformation:
#     def __init__(self, config: DataTransformationConfig):
#         self.config = config
        
#     ## Note: you can add different data transformation techniques such as Scaler, PCA and all
#     ## you can perform all kinds of EDA in ML cycle here before passing this data to the model
    
#     # I am only adding train_test_spliting  because this data is already cleaned
    
#     def train_test_spliting(self):
#         data = pd.read_csv(self.config.data_path)
        
#         # Split the data into training and test sets. (0.75, 0.25) split.
#         train, test = train_test_split(data)
        
#         train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index = False)
#         test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index = False)
        
#         logger.info("Splited data into training and test sets")
#         logger.info(train.shape)
#         logger.info(test.shape)
        
#         print(train.shape)
#         print(test.shape)

import os
from mlProject import logger
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

class DataTransformation:
    """Turns the ingested CSV into scaled, PCA-reduced train and test CSV files."""

    def __init__(self, config: DataTransformationConfig):
        """Keep the stage configuration (``data_path`` to read, ``root_dir`` to write)."""
        self.config = config

    def train_test_spliting(
        self,
        test_size: float = 0.25,
        random_state: int = 42,
        n_components: int = 2,
    ):
        """
        Read the dataset, split it, scale and PCA-reduce it, and save both parts.

        The rows are split into train and test *before* the scaler and PCA are
        fitted. Both transformers are fitted on the training rows only and then
        applied to the test rows, so no test-set statistics leak into the
        training features.

        Args:
            test_size: Fraction of rows placed in the test set.
            random_state: Seed for the shuffle, so the split is reproducible.
            n_components: Number of principal components to keep.

        Side effects:
            Writes ``train.csv`` and ``test.csv`` into ``self.config.root_dir``.
            Each file holds the columns ``PCA1`` .. ``PCAk`` followed by ``target``.
        """
        # Step 1: Read Data
        data = pd.read_csv(self.config.data_path)
        logger.info("Data loaded successfully")

        # Step 2: Basic EDA
        logger.info(f"Data Head:\n{data.head()}")
        logger.info(f"Data Summary:\n{data.describe()}")
        logger.info(f"Missing Values:\n{data.isnull().sum()}")

        # Step 3: Drop rows that contain missing values
        data = data.dropna()
        logger.info("Missing values dropped")

        # Step 4: Separate features and target (assumes the last column is the target)
        X = data.iloc[:, :-1]
        y = data.iloc[:, -1]

        # Step 5: Split first, so the transformers below never see the test rows
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        # Step 6: Fit the scaler on the training rows only, then apply it to both.
        # Test values may fall slightly outside [0, 1]; that is expected.
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        logger.info("Features scaled using MinMaxScaler")

        # Step 7: Fit PCA on the training rows only, then project both parts
        pca = PCA(n_components=n_components)
        X_train_pca = pca.fit_transform(X_train_scaled)
        X_test_pca = pca.transform(X_test_scaled)
        kept_components = X_train_pca.shape[1]
        logger.info(f"PCA applied, reduced to {kept_components} features")

        # Step 8: Re-attach the target to each part
        pca_columns = [f"PCA{i}" for i in range(1, kept_components + 1)]
        train = pd.DataFrame(X_train_pca, columns=pca_columns)
        train["target"] = y_train.to_numpy()
        test = pd.DataFrame(X_test_pca, columns=pca_columns)
        test["target"] = y_test.to_numpy()

        # Step 9: Save the train and test datasets
        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        logger.info("Splitted and saved train/test data successfully")
        logger.info(f"Train Shape: {train.shape}")
        logger.info(f"Test Shape: {test.shape}")

        print(train.shape)
        print(test.shape)


In [None]:
# This code belongs in the pipeline/data_transformation.py file

# try:
#     config = ConfigurationManager()
#     data_transformation_config = config.get_data_transformation_config()
#     data_transformation = DataTransformation(config = data_transformation_config)
#     data_transformation.train_test_spliting()
# except Exception as e:
#     raise e

from mlProject import logger

try:
    # Load the YAML-backed settings and pull out this stage's configuration.
    config_manager = ConfigurationManager()
    data_transform_config = config_manager.get_data_transformation_config()

    # Build the transformer from that configuration and run it.
    data_transformer = DataTransformation(config=data_transform_config)
    data_transformer.train_test_spliting()

    logger.info("Data Transformation pipeline completed successfully.")
except Exception:
    # Record the traceback through the project logger, then let the error propagate.
    logger.exception("Error occurred during data transformation pipeline.")
    raise


[2025-05-27 02:57:49,794: INFO: common: YAML file: config\config.yaml loaded successfully]
[2025-05-27 02:57:49,811: INFO: common: YAML file: params.yaml loaded successfully]
[2025-05-27 02:57:49,829: INFO: common: YAML file: schema.yaml loaded successfully]
[2025-05-27 02:57:49,843: INFO: common: Created directory at: artifacts]
[2025-05-27 02:57:49,854: INFO: common: Created directory at: artifacts/data_transformation]
[2025-05-27 02:57:49,927: INFO: 1769577190: ✅ Data loaded successfully]
[2025-05-27 02:57:49,964: INFO: 1769577190: 🔍 Data Head:
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00         