In [3]:
import os

In [4]:
%pwd

'e:\\project\\Wine_quality_prediction\\research'

In [5]:
# Step up one directory so the relative paths used by later cells resolve
# against the parent folder (per the %pwd outputs: research/ -> project root).
# NOTE: not idempotent - every re-run of this cell moves up one more level.
os.chdir("../")

In [6]:
%pwd

'e:\\project\\Wine_quality_prediction'

In [7]:
import pandas as pd

In [8]:
# Load the ingested dataset for a quick look before building the validation step.
# Forward slashes are used on purpose: "\d" inside a normal string literal is an
# invalid escape sequence (a SyntaxWarning on recent Python versions), and a
# backslash-separated path only resolves on Windows, whereas "/" works everywhere.
data = pd.read_csv("artifacts/data_ingestion/data.csv")
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.17,0.29,1.4,0.047,23.0,107.0,0.9939,3.52,0.65,10.4,6
1,5.3,0.31,0.38,10.5,0.031,53.0,140.0,0.99321,3.34,0.46,11.7,6
2,4.7,0.145,0.29,1.0,0.042,35.0,90.0,0.9908,3.76,0.49,11.3,6
3,6.9,0.26,0.29,4.2,0.043,33.0,114.0,0.9902,3.16,0.31,12.5,6
4,6.4,0.45,0.07,1.1,0.03,10.0,131.0,0.9905,2.97,0.28,10.8,5


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32485 entries, 0 to 32484
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         32485 non-null  float64
 1   volatile acidity      32485 non-null  float64
 2   citric acid           32485 non-null  float64
 3   residual sugar        32485 non-null  float64
 4   chlorides             32485 non-null  float64
 5   free sulfur dioxide   32485 non-null  float64
 6   total sulfur dioxide  32485 non-null  float64
 7   density               32485 non-null  float64
 8   pH                    32485 non-null  float64
 9   sulphates             32485 non-null  float64
 10  alcohol               32485 non-null  float64
 11  quality               32485 non-null  int64  
dtypes: float64(11), int64(1)
memory usage: 3.0 MB


In [10]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings consumed by the data-validation step.

    Instances are built by ``ConfigurationManager.get_data_validation_config``.
    """

    # Directory for this step; the configuration manager creates it before use.
    root_dir: Path
    # Path of the text file that receives the "Validation status: ..." line.
    STATUS_FILE: str
    # CSV file whose columns are checked (taken from the data-ingestion config).
    data_file: Path
    # Mapping whose keys are the column names the CSV is required to contain.
    all_schema: dict

In [11]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [12]:
class ConfigurationManager:
    """Load the project's YAML files and build per-step configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the config, params and schema files and create the artifacts root.

        Args:
            config_filepath: YAML file holding paths/settings for each step.
            params_filepath: YAML file holding parameters (stored on ``self.params``).
            schema_filepath: YAML file holding the dataset schema.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Assemble the settings for the data-validation step.

        Creates the step's ``root_dir`` as a side effect.

        Returns:
            A ``DataValidationConfig`` whose ``root_dir`` and ``data_file`` are
            ``Path`` objects, matching the dataclass annotations, and whose
            ``all_schema`` is the ``COLUMNS`` section of the schema file.
        """
        config = self.config.data_validation
        schema = self.schema.COLUMNS

        create_directories([config.root_dir])

        # The dataclass declares root_dir/data_file as Path; the YAML values are
        # wrapped here so consumers actually receive the type that is advertised.
        return DataValidationConfig(
            root_dir=Path(config.root_dir),
            STATUS_FILE=config.STATUS_FILE,
            data_file=Path(self.config.data_ingestion.local_data_file),
            all_schema=schema,
        )


In [13]:
import os
from mlProject import logger

In [14]:
class DataValidation:
    """Check that the ingested CSV contains every column named in the schema."""

    def __init__(self, config: DataValidationConfig):
        """Keep the validation settings (file paths and expected schema)."""
        self.config = config

    def validate_all_columns(self) -> bool:
        """Compare the CSV header against the schema and record the outcome.

        Writes ``Validation status: <bool>`` to ``config.STATUS_FILE`` and logs
        either the missing column names or a success message. Columns that are
        in the CSV but not in the schema are not treated as a failure.

        Returns:
            True when every schema column is present in the CSV, else False.

        Raises:
            Exception: any error raised while reading the CSV or writing the
                status file is logged and then re-raised unchanged.
        """
        try:
            # Only the column names are needed, so parse just the header row
            # (nrows=0) instead of loading the entire dataset into memory.
            header = pd.read_csv(self.config.data_file, nrows=0)
            expected_cols = set(self.config.all_schema.keys())

            missing_cols = expected_cols - set(header.columns)
            validation_status = not missing_cols

            with open(self.config.STATUS_FILE, 'w') as f:
                f.write(f"Validation status: {validation_status}")

            if validation_status:
                logger.info("All columns are present!")
            else:
                logger.warning(f"Missing columns: {missing_cols}")

            return validation_status

        except Exception as e:
            logger.error(f"Error in Data Validation: {e}")
            # Bare raise propagates the active exception with its traceback intact.
            raise

In [15]:
# Run the validation stage end to end: load configuration, build the step, validate.
# There is no try/except wrapper because a handler that only re-raises adds nothing;
# validate_all_columns logs failures itself before propagating them.
# As the last expression in the cell, the boolean result is shown as the cell output.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config()
data_validation = DataValidation(config=data_validation_config)
data_validation.validate_all_columns()

[2025-04-11 10:17:11,889: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-04-11 10:17:11,891: INFO: common: yaml file: params.yaml loaded successfully]
[2025-04-11 10:17:11,894: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-04-11 10:17:11,895: INFO: common: created directory at: artifacts]
[2025-04-11 10:17:11,895: INFO: common: created directory at: artifacts/data_validation]
[2025-04-11 10:17:11,921: INFO: 2830697464: All columns are present!]
