In [1]:
import os
# Move the kernel's working directory up one level.
# NOTE(review): presumably this is so project-relative paths (config file,
# artifacts directory) resolve from the repository root — confirm.
# The path is relative, so re-running this cell without restarting the kernel
# moves up one more level each time.
os.chdir('../')

In [2]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable bundle of settings consumed by the data-validation step.

    NOTE(review): ConfigurationManager.get_validation_config passes the values
    read from the YAML config through unchanged, so the ``Path``-annotated
    fields may be plain strings at runtime — confirm.
    """
    root_dir: Path  # directory created for this stage by get_validation_config
    STATUS_FILE: str  # path of the text file the validation results are written to
    unzip_data_dir: Path  # despite the name, handed to pd.read_csv, i.e. the CSV to validate
    all_schema: dict  # COLUMNS section of the schema file: column name -> expected dtype name

In [3]:
from MLProject.constants import *
from MLProject.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    """Read the project's YAML files and hand out per-stage config objects.

    The three files (config, params, schema) are loaded once in the
    constructor and kept on the instance; the stage getter below picks the
    pieces it needs out of them.
    """

    def __init__(
            self,
            config_filepath = CONFIG_FILE_PATH,
            params_filepath = PARAMS_FILE_PATH,
            schema_filepath = SCHEMA_FILE_PATH
    ):
        """Load the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: location of the main config YAML.
            params_filepath: location of the params YAML.
            schema_filepath: location of the schema YAML.
        """
        # Loaded in this order: config, params, schema.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # The root artifacts directory must exist before any stage writes to it.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_validation_config(self) -> DataValidationConfig:
        """Build the settings object for the data-validation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A DataValidationConfig filled from the ``data_validation`` section
            of the config file and the ``COLUMNS`` section of the schema file.
        """
        validation_section = self.config.data_validation
        column_schema = self.schema.COLUMNS
        create_directories([validation_section.root_dir])

        return DataValidationConfig(
            root_dir=validation_section.root_dir,
            STATUS_FILE=validation_section.STATUS_FILE,
            unzip_data_dir=validation_section.unzip_data_dir,
            all_schema=column_schema,
        )

In [12]:
# Scratch check: load the schema file directly and list the column names
# declared under its COLUMNS section.
test = read_yaml(SCHEMA_FILE_PATH)
test.COLUMNS.keys()

| 2024-01-22 22:22:37,971 | INFO | common | yaml file: schema.yaml loaded successfully |


dict_keys(['Loan ID', 'Customer ID', 'Loan Status', 'Current Loan Amount', 'Term', 'Credit Score', 'Annual Income', 'Years in current job', 'Home Ownership', 'Purpose', 'Monthly Debt', 'Years of Credit History', 'Months since last delinquent', 'Number of Open Accounts', 'Number of Credit Problems', 'Current Credit Balance', 'Maximum Open Credit', 'Bankruptcies', 'Tax Liens'])

In [65]:
# Scratch check: look up the dtype name the schema declares for one column.
test.COLUMNS['Loan ID']

'object'

In [80]:
import os
from MLProject import logger
import pandas as pd

class DataValidation:
    """Check a CSV dataset against the column schema carried by the config.

    The constructor loads the CSV and empties the status file; each
    ``validate_*`` method then appends its findings to that file and returns
    an overall pass/fail flag.
    """

    def __init__(self, config: "DataValidationConfig"):
        """Load the dataset and start a fresh status report.

        Args:
            config: settings object providing ``unzip_data_dir`` (the CSV to
                read), ``STATUS_FILE`` (the report path) and ``all_schema``
                (mapping of column name to expected dtype name).
        """
        self.config = config
        self.data = pd.read_csv(self.config.unzip_data_dir)
        # Opening in 'w' mode creates or truncates the file, so every run
        # starts from an empty report that the validators append to.
        with open(self.config.STATUS_FILE, 'w'):
            pass

    def validate_all_columns(self) -> bool:
        """Verify that every column named in the schema exists in the data.

        One line is appended to the status file per missing column, followed
        by a final summary line.

        Returns:
            True when no schema column is missing from the data, else False.

        Raises:
            Exception: anything raised while checking or writing is logged
                and re-raised unchanged.
        """
        try:
            data_columns = set(self.data.columns)
            validation_status = True  # stays True until a column is missing

            with open(self.config.STATUS_FILE, 'a') as f:
                for col in self.config.all_schema.keys():
                    if col not in data_columns:
                        validation_status = False
                        # The column comes from the schema and is absent from
                        # the data, so the report must say "not in data".
                        f.write(f"Validation status: {validation_status}, Because {col} not in data\n")

                # Write the final validation status to the file
                f.write(f"Final Validation Columns status: {validation_status}\n")

            return validation_status

        except Exception as e:
            logger.exception(e)
            raise

    def validate_all_type(self, initial = True) -> bool:
        """Verify that every data column has the dtype the schema declares.

        One line is appended to the status file per offending column,
        followed by a final summary line.

        Args:
            initial: starting value of the result; pass False to carry a
                failure from an earlier check into this one.

        Returns:
            ``initial`` when every data column is declared in the schema with
            a matching dtype, else False.
        """
        validation_status = initial
        schema = self.config.all_schema

        with open(self.config.STATUS_FILE, 'a') as f:
            for col in self.data.columns.to_list():
                if col not in schema:
                    # A data column without a schema entry has no expected
                    # dtype; report it as a failure instead of letting the
                    # schema lookup raise KeyError and abort the report.
                    validation_status = False
                    f.write(f"Validation status: {validation_status}, Because {col} not in schema\n")
                elif schema[col] != self.data[col].dtype:
                    validation_status = False
                    f.write(f"Validation status: {validation_status}, Because {col} type different from schema\n")

            # Write the final validation status to the file
            f.write(f"Final Validation type status: {validation_status}\n")
        return validation_status

In [82]:
# Run the validation stage end to end: build its config, then check column
# presence followed by column dtypes.
try:
    config = ConfigurationManager()
    data_validation_config = config.get_validation_config()
    data_validation = DataValidation(config=data_validation_config)
    valid_1 = data_validation.validate_all_columns()
    # Seed the dtype check with the column-check result so that a failed
    # column check keeps the final type status False.
    valid_2 = data_validation.validate_all_type(initial=valid_1)

except Exception as e:
    raise e

| 2024-01-22 22:59:54,090 | INFO | common | yaml file: config\config.yaml loaded successfully |
| 2024-01-22 22:59:54,092 | INFO | common | yaml file: params.yaml loaded successfully |
| 2024-01-22 22:59:54,096 | INFO | common | yaml file: schema.yaml loaded successfully |
| 2024-01-22 22:59:54,098 | INFO | common | created directory at: artifacts |
| 2024-01-22 22:59:54,099 | INFO | common | created directory at: artifacts/data_validation |
