In [2]:
import os

In [3]:
# Show the kernel's working directory before it is changed; the saved output
# below shows the notebook starting inside the project's research/ folder.
%pwd

'c:\\Users\\DELL\\OneDrive\\Desktop\\Projects\\Projects\\Machine learning\\Red_wine_project\\research'

In [4]:
# Step up from the notebook folder (research/) to the project root so that the
# relative paths used in later cells (artifacts/..., config/...) resolve.
# The guard makes the cell idempotent: re-running it without restarting the
# kernel no longer climbs one directory higher on every execution.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [8]:
# Confirm the working directory is now the project root; the relative paths
# in the following cells are resolved against it.
%pwd

'c:\\Users\\DELL\\OneDrive\\Desktop\\Projects\\Projects\\Machine learning\\Red_wine_project'

In [6]:
import pandas as pd
from pathlib import Path


In [9]:
# Load the ingested dataset relative to the project root.
# The path is assembled from its parts instead of a backslash-separated
# literal: in a normal (non-raw) string "\d" and "\w" are invalid escape
# sequences, and backslash separators only resolve on Windows.
data = pd.read_csv(Path("artifacts") / "data_ingestion" / "winequality-red.csv")


In [None]:
# Preview the first rows to sanity-check that the CSV parsed as expected.
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [None]:
# List the column labels; DataValidation.validate_all_columns() later compares
# these labels with the keys of the COLUMNS section of schema.yaml.
data.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [None]:
# Summarise dtypes and non-null counts for every column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [10]:
from dataclasses import dataclass
from pathlib import Path

# Typed settings bundle for the data-validation stage.
# @dataclass supplies __init__, __repr__ and __eq__; frozen=True additionally
# makes every field read-only after construction, so the settings cannot be
# altered by accident while the stage is running.
@dataclass(frozen=True)
class DataValidationConfig:
    """Settings consumed by the data-validation stage.

    Attributes:
        root_dir: Folder holding this stage's artifacts
            (for example ``artifacts/data_validation/``).
        STATUS_FILE: File the validation outcome (success / failure) is
            written to.
        unzip_data_dir: Dataset file to validate, produced by the data
            ingestion stage.
        all_schema: Expected-column definitions loaded from schema.yaml.
    """

    root_dir: Path
    STATUS_FILE: str
    unzip_data_dir: Path
    all_schema: dict


In [11]:
# Import all constant values defined in the constants module
# These typically include fixed file paths such as:
# CONFIG_FILE_PATH, PARAMS_FILE_PATH, SCHEMA_FILE_PATH
# Centralizing them avoids hardcoding paths across the project
from Red_Wine_Prediction.constants import *

# Import reusable utility/helper functions
# read_yaml         → Reads YAML configuration files (config, params, schema)
# create_directories → Safely creates directories needed for artifacts
# These utilities are shared across multiple pipeline stages
from Red_Wine_Prediction.utils.common import read_yaml, create_directories


In [13]:
class ConfigurationManager:
    """Load the project's YAML files and build per-stage config objects.

    On construction the three YAML files are read and the root artifacts
    directory is created. Each ``get_*_config`` method then creates the
    directory for its stage and returns that stage's settings as a dataclass.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read config, params and schema, then create the artifacts root.

        Args:
            config_filepath: Path to config.yaml (pipeline structure & paths).
            params_filepath: Path to params.yaml (model parameters).
            schema_filepath: Path to schema.yaml (data schema definitions).
        """
        # The loaded objects are accessed with attribute syntax below
        # (self.config.artifacts_root), so read_yaml presumably returns an
        # attribute-accessible mapping -- confirm against utils.common.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Every stage writes its outputs somewhere under this directory.
        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the settings object for the data-validation stage.

        Reads the ``data_validation`` section of config.yaml and the
        ``COLUMNS`` section of schema.yaml, creates the stage's artifact
        directory, and packs the values into a DataValidationConfig.

        Returns:
            DataValidationConfig: Immutable settings for data validation.
        """
        config = self.config.data_validation

        # COLUMNS is keyed by expected column name, e.g.
        #     fixed_acidity: float
        #     alcohol: float
        schema = self.schema.COLUMNS

        create_directories([config.root_dir])

        # DataValidationConfig declares root_dir and unzip_data_dir as Path,
        # so the raw YAML values are converted here instead of being passed
        # through untouched. STATUS_FILE is declared as str and is left as is.
        return DataValidationConfig(
            root_dir=Path(config.root_dir),
            STATUS_FILE=config.STATUS_FILE,
            unzip_data_dir=Path(config.unzip_data_dir),
            all_schema=schema,
        )


In [14]:
import os

from Red_Wine_Prediction   import logger 



In [15]:
class DataValidation:
    """Check that the ingested dataset's columns agree with the schema.

    The component reads the CSV at ``config.unzip_data_dir``, compares its
    column labels with the keys of ``config.all_schema`` and records the
    outcome in ``config.STATUS_FILE``.
    """

    def __init__(self, config: "DataValidationConfig"):
        """Store the stage configuration.

        Args:
            config (DataValidationConfig): Supplies ``unzip_data_dir`` (CSV to
                check), ``all_schema`` (mapping keyed by expected column
                name) and ``STATUS_FILE`` (where the outcome is written).
        """
        # The annotation is written as a string so that defining this class
        # does not depend on DataValidationConfig already being defined.
        self.config = config

    def validate_all_columns(self) -> bool:
        """Compare the dataset's columns with the schema and record the result.

        Validation passes only when the two sets of column names are equal:
        the dataset has no column that the schema does not list, and no
        schema column is absent from the dataset. The verdict is computed
        over all columns first and written to the status file exactly once,
        so an unexpected column can no longer be masked by a valid column
        that happens to come after it.

        Returns:
            bool: True if the dataset's columns match the schema, else False.

        Raises:
            Any exception raised while reading the CSV or writing the status
            file propagates unchanged to the caller.
        """
        data = pd.read_csv(self.config.unzip_data_dir)

        actual_cols = set(data.columns)
        expected_cols = set(self.config.all_schema.keys())

        unexpected_cols = actual_cols - expected_cols  # in data, not in schema
        missing_cols = expected_cols - actual_cols     # in schema, not in data
        validation_status = not unexpected_cols and not missing_cols

        # The text format is kept exactly as before ("Validation status: <bool>")
        # in case other code parses this file.
        with open(self.config.STATUS_FILE, 'w') as f:
            f.write(f"Validation status: {validation_status}")

        return validation_status


In [16]:
# Run the data-validation stage end to end.
# Errors are simply allowed to propagate: wrapping these calls in a
# try/except that only re-raises adds nothing in a notebook.

# Loads config.yaml, params.yaml and schema.yaml and makes sure the root
# artifacts directory exists.
config = ConfigurationManager()

# Settings specific to the data-validation stage (a DataValidationConfig).
data_validation_config = config.get_data_validation_config()

# Component that performs the check.
data_validation = DataValidation(config=data_validation_config)

# Compare the dataset's columns with the schema; the outcome is also written
# to STATUS_FILE. The result is kept and left as the cell's last expression
# so the notebook shows whether validation passed.
validation_status = data_validation.validate_all_columns()
validation_status


[2026-01-02 21:40:58,822: INFO: common: YAML file loaded successfully: config\config.yaml]
[2026-01-02 21:40:58,824: INFO: common: YAML file loaded successfully: params.yaml]
[2026-01-02 21:40:58,826: INFO: common: YAML file loaded successfully: schema.yaml]
[2026-01-02 21:40:58,828: INFO: common: Directory created at: artifacts]
[2026-01-02 21:40:58,829: INFO: common: Directory created at: artifacts/data_validation]
