In [11]:
import os
# Move the working directory one level up so the relative paths used by later
# cells (e.g. params.yaml, config/config.yaml, artifacts/) resolve from there.
# NOTE(review): the target is relative to the *current* directory, so re-running
# this cell without restarting the kernel moves up one more level each time.
os.chdir("../")
# Display the resulting working directory as the cell output.
%pwd

'/Users/tapankheni/Data_Science/Data Science Projects/Credit_Card_Fault_Prediction'

In [12]:
from dataclasses import dataclass
from pathlib import Path
from typing import List

@dataclass
class DataValidationConfig:
    """Settings consumed by the data-validation stage.

    ``local_data_path`` and ``all_schema`` are parallel lists: the schema at
    index ``i`` is the one used for the data file at index ``i``.
    """

    # Directory that belongs to the data-validation stage.
    root_dir: Path
    # Paths of the CSV files whose columns are validated.
    local_data_path: List[str]
    # Path of the text file that receives the validation status line.
    STATUS_FILE: str
    # One column mapping per data file; its keys are the allowed column names.
    all_schema: List[dict]

In [13]:
from CreditCardFraudDetection.constants import (
    PARAMS_YAML_FILE_PATH, CONFIG_YAML_FILE_PATH, SCHEMA_YAML_FILE_PATH
)
from CreditCardFraudDetection.utils.common import read_yaml, create_directories

In [18]:
class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects."""

    def __init__(
        self,
        params_yaml_file_path = PARAMS_YAML_FILE_PATH,
        config_yaml_file_path = CONFIG_YAML_FILE_PATH,
        schema_yaml_file_path = SCHEMA_YAML_FILE_PATH,
    ):
        """Read the params, config and schema YAML files, then create the artifacts root.

        Each path defaults to the corresponding project constant.
        """
        # The files are read in the order params -> config -> schema.
        self.params, self.config, self.schema = map(
            read_yaml,
            (params_yaml_file_path, config_yaml_file_path, schema_yaml_file_path),
        )

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the ``DataValidationConfig`` from the ``data_validation`` config section.

        The stage's root directory is created as a side effect. The schema list
        is ``[COLUMNS_DATA1, COLUMNS_DATA2]`` taken from the schema file, in that order.
        """
        section = self.config.data_validation
        column_schemas = [self.schema.COLUMNS_DATA1, self.schema.COLUMNS_DATA2]

        stage_root = section.root_dir
        create_directories([stage_root])

        validation_config = DataValidationConfig(
            root_dir=Path(stage_root),
            local_data_path=section.local_data_path,
            STATUS_FILE=section.STATUS_FILE,
            all_schema=column_schemas,
        )
        return validation_config

In [15]:
from CreditCardFraudDetection import logger
import pandas as pd

In [19]:
class DataValidation:
    """Check that every column of each configured CSV file is declared in its schema.

    ``config.local_data_path`` and ``config.all_schema`` are paired by position:
    the i-th data file is validated against the keys of the i-th schema mapping.
    """

    def __init__(self, config: DataValidationConfig):
        """Store the configuration and sanity-check the file and schema lists.

        Raises:
            TypeError: if ``local_data_path`` is not a list of strings or
                ``all_schema`` is not a list of dictionaries.
            ValueError: if the two lists differ in length.
        """
        self.config = config

        data_paths = self.config.local_data_path
        schemas = self.config.all_schema

        # isinstance() does not accept subscripted generics such as List[str]
        # (it raises TypeError), so the container and its elements are checked
        # separately against the plain runtime types.
        # NOTE(review): assumes the YAML loader yields list/dict (sub)classes — confirm.
        if not (isinstance(data_paths, list) and all(isinstance(path, str) for path in data_paths)):
            raise TypeError("local_data_path should be a list of strings")
        if not (isinstance(schemas, list) and all(isinstance(schema, dict) for schema in schemas)):
            raise TypeError("all_schema should be a list of dictionaries")
        if len(data_paths) != len(schemas):
            raise ValueError("Mismatch between the number of data files and schema provided")

    def validate_all_columns(self) -> bool:
        """Validate the columns of every configured data file and record the result.

        Each CSV file is read with pandas and every column name is looked up in
        the keys of the schema paired with that file. Every unknown column is
        logged. The aggregate result is written once to ``config.STATUS_FILE``
        as ``Validation status: <bool>``.

        Returns:
            True only if all files contain nothing but schema-declared columns;
            False if any file has an unknown column or no files are configured.

        Raises:
            Exception: any error raised while reading the data or writing the
                status file is logged and re-raised unchanged.
        """
        logger.info("Validation process of all columns in the data started...")

        try:
            file_schema_pairs = list(zip(self.config.local_data_path, self.config.all_schema))

            # With nothing to validate the stage must not report success.
            validation_status = bool(file_schema_pairs)
            if not file_schema_pairs:
                logger.error("No data files configured for validation")

            for data_path, schema in file_schema_pairs:
                data = pd.read_csv(data_path)
                expected_columns = schema.keys()

                unexpected_columns = [col for col in data.columns if col not in expected_columns]
                for col in unexpected_columns:
                    logger.error(f"Column {col} not in schema for data stored at {data_path}")

                # A failure in any file makes the overall result False; a later
                # valid file must not flip it back to True.
                if unexpected_columns:
                    validation_status = False

            # Written once, after all files were checked, so the file always
            # reflects the aggregate status rather than the last file only.
            with open(self.config.STATUS_FILE, "w") as file:
                file.write(f"Validation status: {validation_status}")

            logger.info("Validation process of all columns in the data completed.")
            return validation_status

        except Exception as e:
            logger.error(f"Error in validating all columns: {e}")
            raise

In [20]:
# Run the data-validation stage end to end: load the configuration, build the
# validator and check the columns of every configured data file.
try:
    config_manager = ConfigurationManager()
    data_validation_config = config_manager.get_data_validation_config()
    data_validation = DataValidation(config = data_validation_config)
    # The boolean result is not captured here; validate_all_columns() also
    # writes it to the configured status file.
    data_validation.validate_all_columns()

except Exception as e:
    # Any failure in the steps above (configuration loading included, not only
    # column validation) is logged with this message and then re-raised.
    logger.error(f"Error in validating all columns: {e}")
    raise e

[2024-07-05 13:55:26,183: INFO: common: yaml file: params.yaml loaded successfully]
[2024-07-05 13:55:26,186: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-05 13:55:26,191: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-07-05 13:55:26,191: INFO: common: created directory at: artifacts]
[2024-07-05 13:55:26,192: INFO: common: created directory at: artifacts/data_validation]
