In [7]:
import os

In [8]:
%pwd

'/Users/rahul/Documents/TextSum/research'

In [9]:
# Step one level up from the current working directory so that the relative
# paths used further down (e.g. "artifacts/...") resolve against the parent
# folder. NOTE: this is not idempotent - every re-run of this cell moves the
# working directory up by one more level.
os.chdir("../")

In [10]:
%pwd

'/Users/rahul/Documents/TextSum'

In [1]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable bundle of settings for the data-validation step.

    The dataclass is frozen, so assigning to a field after construction
    raises an error. The annotations are descriptive only; nothing is
    converted or checked at runtime.
    """

    # Directory associated with the validation step.
    root_dir: Path
    # Presumably the file that receives the validation outcome - confirm
    # against the project's config file.
    STATUS_FILE: str
    # Relative paths that the validation step requires to be present.
    ALL_REQUIRED_FILES: list

In [2]:
from TextSum.constants import *
from TextSum.utils.common import read_yaml, create_directories

In [3]:
class ConfigurationManager:
    """Load the project configuration and hand out per-stage config objects.

    Both YAML files are read once in the constructor; the loaded objects are
    kept on ``self.config`` and ``self.params``.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Read both YAML files and prepare the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML file.
            params_filepath: Path of the parameters YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # The artifacts root is passed to the project's create_directories
        # helper up front, before any stage-specific directory is requested.
        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the ``DataValidationConfig`` from the ``data_validation`` section.

        The section's ``root_dir`` is passed to ``create_directories`` before
        the config object is assembled.

        Returns:
            A ``DataValidationConfig`` carrying ``root_dir``, ``STATUS_FILE``
            and ``ALL_REQUIRED_FILES`` exactly as found in the loaded config.
        """
        section = self.config.data_validation

        create_directories([section.root_dir])

        return DataValidationConfig(
            root_dir=section.root_dir,
            STATUS_FILE=section.STATUS_FILE,
            ALL_REQUIRED_FILES=section.ALL_REQUIRED_FILES,
        )

In [4]:
import os
from TextSum.logging import logger

In [14]:
import os
import zipfile

class DataValidation:
    """Check that the ingested dataset contains every required file/directory.

    The outcome is written as a one-line message to a status file and is
    also returned as a boolean.

    Attributes:
        config: Object exposing ``ALL_REQUIRED_FILES`` (an iterable of
            relative path strings) and, optionally, ``STATUS_FILE``.
    """

    # Dataset directory that is scanned, relative to the working directory.
    DATA_DIR = os.path.join("artifacts", "data_ingestion", "edin_dataset")
    # Status file used when the config does not provide ``STATUS_FILE``.
    DEFAULT_STATUS_FILE = os.path.join("artifacts", "data_validation", "status.txt")

    def __init__(self, config):
        self.config = config

    def _status_file_path(self):
        """Return the configured status file, or the default when unset."""
        configured = getattr(self.config, "STATUS_FILE", None)
        return configured if configured else self.DEFAULT_STATUS_FILE

    @staticmethod
    def _write_status(status_file_path, message):
        """Write ``message`` to the status file, creating its folder if needed."""
        parent = os.path.dirname(status_file_path)
        if parent:
            # Without this, a missing folder makes the status write itself fail.
            os.makedirs(parent, exist_ok=True)
        with open(status_file_path, "w", encoding="utf-8") as f:
            f.write(message)

    @staticmethod
    def _collect_relative_paths(base_dir):
        """Return every entry below ``base_dir`` as a path relative to it.

        Directories carry a trailing ``/`` so they can be told apart from
        files; nested components are joined with the OS-native separator.
        """
        found = set()
        for root, dir_names, file_names in os.walk(base_dir):
            for dir_name in dir_names:
                found.add(os.path.relpath(os.path.join(root, dir_name), base_dir) + "/")
            for file_name in file_names:
                found.add(os.path.relpath(os.path.join(root, file_name), base_dir))
        return found

    def validate_all_files_and_dirs_exist(self) -> bool:
        """Verify that every required path exists inside the dataset directory.

        Writes one of three messages to the status file: ``Success``,
        ``Failed`` (listing the missing paths in config order) or ``Error``
        (with the exception text).

        Returns:
            ``True`` when nothing is missing; ``False`` when paths are missing
            or when any error occurred during validation.
        """
        # Fallback in case resolving the configured path itself fails.
        status_file_path = self.DEFAULT_STATUS_FILE
        try:
            status_file_path = self._status_file_path()

            data_dir = self.DATA_DIR
            if not os.path.isdir(data_dir):
                raise FileNotFoundError(f"Directory does not exist: {data_dir}")

            present = self._collect_relative_paths(data_dir)
            missing_paths = [
                path for path in self.config.ALL_REQUIRED_FILES if path not in present
            ]
            validation_status = not missing_paths

            if validation_status:
                message = "Validation Status: Success - All required files/directories are present.\n"
            else:
                message = f"Validation Status: Failed - Missing files/directories: {', '.join(missing_paths)}\n"
            self._write_status(status_file_path, message)

            return validation_status

        except Exception as e:
            # Best effort: record the error, but never let a failing status
            # write turn the documented ``False`` result into an exception.
            try:
                self._write_status(status_file_path, f"Validation Status: Error - {str(e)}\n")
            except OSError as write_error:
                print(f"Could not write status file {status_file_path}: {write_error}")

            print(f"An error occurred during validation: {str(e)}")
            return False

In [18]:
if __name__ == "__main__":
    # Driver for the validation stage: build the configuration, run the check
    # and report the outcome. Any exception is reported as a single line
    # instead of a traceback.
    try:
        # Load the configuration and derive the settings for this stage.
        config_manager = ConfigurationManager()
        data_validation_config = config_manager.get_data_validation_config()

        # Run the validation; it also records its outcome in the status file.
        data_validation = DataValidation(config=data_validation_config)
        validation_status = data_validation.validate_all_files_and_dirs_exist()

        # Report the outcome to the notebook output.
        print(
            "Validation successful. All required files are present."
            if validation_status
            else "Validation failed. Check the status.txt file for details."
        )
    except Exception as e:
        print(f"An error occurred: {str(e)}")

[2024-08-18 15:41:45,071:INFO:common:yaml file: config/config.yaml loaded succesfully]
[2024-08-18 15:41:45,073:INFO:common:yaml file: params.yaml loaded succesfully]
[2024-08-18 15:41:45,073:INFO:common:Created directory: artifacts]
[2024-08-18 15:41:45,074:INFO:common:Created directory: artifacts/data_validation]
Validation successful. All required files are present.
