In [1]:
import os

In [2]:
%pwd

'c:\\Users\\Shawn\\Desktop\\TextSummarizer\\app\\research'

In [3]:
os.chdir("../../")

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    root_dir: Path
    STATUS_FILE: str
    ALL_REQUIRED_FILES: list

In [5]:
from src.constants import *
from src.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    def __init__(self,
                 config_filepath: Path = CONFIG_FILE_PATH,
                 params_filepath: Path = PARAMS_FILE_PATH):
        
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    
    def get_data_validation_config(self) -> DataValidationConfig:
        config = self.config.data_validation

        create_directories([config.root_dir])

        data_validation_config = DataValidationConfig(
            root_dir = config.root_dir,
            STATUS_FILE = config.STATUS_FILE,
            ALL_REQUIRED_FILES = config.ALL_REQUIRED_FILES
        )

        return data_validation_config

In [7]:
import os
from src.logging import logger

In [37]:
class DataValidation:
    def __init__(self, config: DataValidationConfig):
        self.config = config

    
    def validate_all_files_exist(self):
        try:
            validation_status = True
            all_files = os.listdir(os.path.join("app", "artifacts", "data_ingestion", "samsum_dataset"))
            missing_files = []
            invalid_files = []

            print(self.config)
            for required_file in self.config.ALL_REQUIRED_FILES:
                if required_file not in all_files:
                    validation_status = False
                    missing_files.append(required_file)
                    logger.error(f"{required_file} exists: Failed")
                else:
                    logger.info(f"{required_file} exists: Passed")
                    file_path = os.path.join("app", "artifacts", "data_ingestion", "samsum_dataset", required_file)
                    # Check if the required file is a directory
                    if os.path.isdir(file_path):
                        # Check for .arrow files within the directory
                        arrow_files = [f for f in os.listdir(file_path) if f.endswith('.arrow')]
                        if not arrow_files:
                            validation_status = False
                            invalid_files.append(f"{required_file} (no .arrow files found)")
                            logger.error(f".arrow files exists in {required_file}: Failed")
                        else:
                            logger.info(f".arrow files exists in {required_file}: Passed")
                            for arrow_file in arrow_files:
                                arrow_file_path = os.path.join(file_path, arrow_file)
                                # Check file size (example: file should not be empty)
                                if os.path.getsize(arrow_file_path) == 0:
                                    validation_status = False
                                    invalid_files.append(f"{arrow_file} (empty file)")
                                    logger.error(f"arrow file not empty: Failed")
                                else:
                                    logger.info(f"arrow file not empty: Passed")
                    else:
                        # Check file size (example: file should not be empty)
                        if os.path.getsize(file_path) == 0:
                            validation_status = False
                            invalid_files.append(f"{required_file} (empty file)")
                            

            # Write validation status to the status file
            with open(self.config.STATUS_FILE, "w") as f:
                if validation_status:
                    f.write("Validation Status: True\n")
                else:
                    f.write(f"Validation Status: {validation_status}\n")
                    if missing_files:
                        f.write(f"Missing Files: {', '.join(missing_files)}\n")
                    if invalid_files:
                        f.write(f"Invalid Files: {', '.join(invalid_files)}\n")

            return validation_status
        except Exception as e:
            raise e

In [38]:
try:
    config_manager = ConfigurationManager()
    data_validation_config = config_manager.get_data_validation_config()
    data_validation = DataValidation(config=data_validation_config)
    data_validation.validate_all_files_exist()
except Exception as e:
    raise e

[2024-12-24 18:42:30,584: INFO: common: Reading C:\Users\Shawn\Desktop\TextSummarizer\app\config\config.yaml file]
[2024-12-24 18:42:30,587: INFO: common: Reading C:\Users\Shawn\Desktop\TextSummarizer\app\params.yaml file]
[2024-12-24 18:42:30,589: INFO: common: Directory created: app/artifacts]
[2024-12-24 18:42:30,591: INFO: common: Directory created: app/artifacts/data_validation]
DataValidationConfig(root_dir='app/artifacts/data_validation', STATUS_FILE='app/artifacts/data_validation/status.txt', ALL_REQUIRED_FILES=BoxList(['train', 'test', 'validation']))
[2024-12-24 18:42:30,593: INFO: 554312349: train exists: Passed]
[2024-12-24 18:42:30,594: INFO: 554312349: .arrow files exists in train: Passed]
[2024-12-24 18:42:30,596: INFO: 554312349: arrow file not empty: Passed]
[2024-12-24 18:42:30,598: INFO: 554312349: test exists: Passed]
[2024-12-24 18:42:30,600: INFO: 554312349: .arrow files exists in test: Passed]
[2024-12-24 18:42:30,602: INFO: 554312349: arrow file not empty: Passe