In [1]:
%pwd

'/workspaces/End-End-Text-summeriser/research'

In [2]:
# change working directory to root directory
# NOTE: this cell is not idempotent - every run moves one more level up from the
# current directory, so execute it exactly once per kernel session (the previous
# cell shows the kernel starting inside the `research` folder).
import os
os.chdir("../")

In [3]:
# update the config file for data validation 
# 3. define the entity

from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings for the data validation stage.

    Instances are produced by ``ConfigurationManager.get_data_validation_config``
    from the ``data_validation`` section of the loaded config.
    """

    # Directory of this stage; the configuration manager creates it before
    # building the entity.
    # NOTE(review): the manager passes the raw config value through without
    # wrapping it in Path - confirm whether callers rely on Path behaviour.
    root_dir: Path
    # File the validation component writes "Validation status: ..." into.
    STATUS_FILE: str
    # Names that the validation component compares against the directory
    # listing of the ingested dataset.
    ALL_REQUIRED_FILES: list

In [None]:
# 4. configuration manager in src config.

In [5]:
from textSummerizer.constants import *
from textSummerizer.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config entities."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read the config and params YAML files and create the artifacts root.

        Args:
            config_filepath: path of the main config YAML; defaults to the
                project constant ``CONFIG_FILE_PATH``.
            params_filepath: path of the params YAML; defaults to the project
                constant ``PARAMS_FILE_PATH``.
        """
        # Both files are parsed up front and kept on the instance.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Keys are reached with dot access on the object read_yaml returns
        # (presumably a ConfigBox, per the original author's note).
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the entity for the data validation stage.

        The stage's root directory is created as a side effect before the
        entity is returned.

        Returns:
            DataValidationConfig populated from the ``data_validation``
            section of the config file.
        """
        section = self.config.data_validation

        # Make sure the stage directory exists before anything writes to it.
        create_directories([section.root_dir])

        return DataValidationConfig(
            root_dir=section.root_dir,
            STATUS_FILE=section.STATUS_FILE,
            ALL_REQUIRED_FILES=section.ALL_REQUIRED_FILES,
        )

In [None]:
#5. components

In [7]:
import os
from textSummerizer.logging import logger

In [8]:
# Data validation component: it is configured with the DataValidationConfig object produced by ConfigurationManager.

class DataValiadtion:
    """Checks that the ingested dataset directory holds every required file.

    The (misspelled) class name is kept as-is because the pipeline cell
    instantiates the component under this name.
    """

    # Directory scanned when the caller does not pass ``data_dir`` explicitly.
    DEFAULT_DATA_DIR = os.path.join("artifacts", "data_ingestion", "samsum_dataset")

    # The annotation is quoted so that defining the class does not require the
    # config entity to be in scope yet.
    def __init__(self, config: "DataValidationConfig"):
        """Store the data validation config.

        Args:
            config: object exposing ``STATUS_FILE`` and ``ALL_REQUIRED_FILES``.
        """
        self.config = config

    def validate_all_files_exist(self, data_dir=None) -> bool:
        """Verify that every required file is present and record the result.

        The check passes only when each name in ``config.ALL_REQUIRED_FILES``
        appears in the directory listing. Extra entries in the directory do
        not fail the check. The outcome is written once to
        ``config.STATUS_FILE`` as ``Validation status: <True|False>``.

        Args:
            data_dir: directory to inspect; defaults to ``DEFAULT_DATA_DIR``.

        Returns:
            True when no required file is missing, otherwise False.

        Raises:
            OSError: if ``data_dir`` cannot be listed or the status file
                cannot be written.
        """
        if data_dir is None:
            data_dir = self.DEFAULT_DATA_DIR

        present = set(os.listdir(data_dir))

        # Iterate over the *required* names: walking the directory listing
        # instead can never detect a required file that is absent.
        missing = [
            name for name in self.config.ALL_REQUIRED_FILES
            if name not in present
        ]
        validation_status = not missing

        # Write the final verdict a single time so it cannot depend on the
        # arbitrary order in which os.listdir returns entries.
        with open(self.config.STATUS_FILE, 'w') as f:
            f.write(f"Validation status: {validation_status}")

        return validation_status

In [None]:
# 6. pipeline

In [9]:
try:
    # Step 1: load the YAML settings through the configuration manager.
    config = ConfigurationManager()
    # Step 2: derive the entity that describes the data validation stage.
    data_validation_config = config.get_data_validation_config()
    # Step 3: build the component from that entity.
    data_validation = DataValiadtion(config=data_validation_config)
    # Step 4: run the file check; its result is recorded in the status file.
    data_validation.validate_all_files_exist()
except Exception:
    # No recovery is attempted here; the error is surfaced unchanged.
    raise

[2024-03-19 00:26:12,868: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-03-19 00:26:12,871: INFO: common: yaml file: params.yaml loaded successfully]
[2024-03-19 00:26:12,872: INFO: common: created directory at: artifacts]
[2024-03-19 00:26:12,873: INFO: common: created directory at: artifacts/data_validation]


In [None]:
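# The data validation pipeline ran successfully. The result is written to the path configured as
# data_validation.STATUS_FILE in config/config.yaml (noted here as
# artifacts/data_ingestion/samsum_dataset/validation_status.txt; the run above created
# artifacts/data_validation, so confirm the exact location against the config).
# Next step: port the data validation component into the src folder.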