# Notebook experimentation for data validation

Data validation is to check whether the ingested data is valid.

In [1]:
import os

In [2]:
%pwd

'/Users/soogeunpark/Documents/text_summarizer_cicd/Text-Summarizer/research'

In [3]:
os.chdir("../")

In [4]:
os.getcwd()

'/Users/soogeunpark/Documents/text_summarizer_cicd/Text-Summarizer'

I first update the ```config.yaml```

Now I work on the ```config\\configuration.py```

In [5]:
from TextSummarizer.constants import *
# the asterisk imports everything in the directory

# I'm importing the file paths given in the constants file

In [14]:
from TextSummarizer.utils.common import read_yaml, create_directories

In [8]:

class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH
    ):
        """
        This class is used to manage the configuration of the project
        """

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_roots]) 
        # this refers to the artifacts_roots in the config.yaml
        # this creates the 'artifacts' directory
        # because self.config does the "read_yaml" function which uses ConfigBox, the artifacts_roots can be just accessed by using the dot notation

    def data_validation_config(self) -> DataValidationConfig:
        """
        This function returns the data validation config
        """
        config = self.config.data_validation
        # this refers to the data_validation in the config.yaml which is loaded above

        create_directories([config.root_dir])
        # this creates the directory under artifacts called 'data_validation'
        
        data_validation_config = DataValidationConfig(
            root_dir = config.root_dir,
            STATUS_FILE = config.STATUS_FILE,
            ALL_REQUIRED_FILES = config.ALL_REQUIRED_FILES
        )
        
        return data_validation_config


Define the entity: entity is the format that is used for providing the information for the configuration for data validation

In [7]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen = True)
class DataValidationConfig:
    root_dir: Path
    STATUS_FILE: str
    ALL_REQUIRED_FILES: list



Now working on the components:

In [10]:
import os
from TextSummarizer.logging import logger

In [19]:
class DataValidation:
    def __init__(self, config: DataValidationConfig):
        """
        This class is used to execute data validation
        """
        self.config = config

    # method that downloads the data from the url
    def validate_all_files_exist(self) -> bool:

        try: 
            all_files = os.listdir(os.path.join("artifacts", "data_ingestion", "samsum_dataset"))
            # this provides all of the files in the artifacts/data_ingestion/samsum_dataset,
            # in a list format

            for file in all_files:
                if file not in self.config.ALL_REQUIRED_FILES:
                    logger.info(f"Validation failed for file: {file}")
                else: 
                    logger.info(f"Validation passed for file: {file}")

            validation_status = all(file in all_files for file in self.config.ALL_REQUIRED_FILES)
            
            with open(self.config.STATUS_FILE, "w") as f:
                f.write(f"Validation status: {validation_status}.")

        except Exception as e:
            raise e

        return validation_status
        
                

In [20]:

try:
    config = ConfigurationManager()
    
    data_validation_config = config.data_validation_config()

    data_validation = DataValidation(config = data_validation_config)
    # this takes the data_validation_config, and uses it to create the DataValidation class

    # now we use the method defined in the dtaa_validation class
    
    data_validation.validate_all_files_exist()

except Exception as e:
    raise e

[2023-12-31 12:33:30,150: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-12-31 12:33:30,152: INFO: common: yaml file: params.yaml loaded successfully]
[2023-12-31 12:33:30,153: INFO: common: created directory at: artifacts]
[2023-12-31 12:33:30,154: INFO: common: created directory at: artifacts/data_validation]
[2023-12-31 12:33:30,154: INFO: 578262685: Validation failed for file: dataset_dict.json]
[2023-12-31 12:33:30,155: INFO: 578262685: Validation passed for file: test]
[2023-12-31 12:33:30,156: INFO: 578262685: Validation passed for file: train]
[2023-12-31 12:33:30,156: INFO: 578262685: Validation passed for file: validation]
[2023-12-31 12:33:30,157: INFO: 578262685: Validation failed for file: xxxx]


Now let's convert this into modular coding...