In [2]:
import os
# Show the kernel's current working directory before it is changed.
%pwd

'/mnt/cb03386d-9344-47b1-82f9-868fbb64b4ae/python_projects/HIV_inhibitors_classification_and_generation/research'

In [3]:
os.chdir("../")
%pwd

'/mnt/cb03386d-9344-47b1-82f9-868fbb64b4ae/python_projects/HIV_inhibitors_classification_and_generation'

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings consumed by the data-validation stage.

    Attributes:
        root_dir: Directory of the validation stage; it is created when the
            configuration object is built.
        local_data_file: CSV file that is loaded and validated.
        STATUS_FILE: File to which the validation outcome is written.
        all_schema: Expected column names. The validator only turns this into
            a set and compares it with the CSV's columns, so any iterable of
            names works (NOTE(review): it is filled from the schema's COLUMNS
            entry, which may be a mapping rather than a list - confirm).
    """
    root_dir: Path
    local_data_file: Path
    STATUS_FILE: Path
    all_schema: list

In [5]:
from hivclass.constants import *
from hivclass.utils.main_utils import create_directories, read_yaml

class ConfigurationMananger:
    """Loads the project's YAML files and builds per-stage configuration objects."""

    def __init__(
        self,
        config_file_path = CONFIG_FILE_PATH,
        params_file_path = PARAMS_FILE_PATH,
        schema_file_path = SCHEMA_FILE_PATH
    ):
        """Read the three YAML files and make sure the artifacts root exists.

        Args:
            config_file_path: Location of the main configuration YAML.
            params_file_path: Location of the parameters YAML.
            schema_file_path: Location of the schema YAML.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)
        self.schema = read_yaml(schema_file_path)

        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the configuration for the data-validation stage.

        Creates the stage's root directory as a side effect.

        Returns:
            A DataValidationConfig whose path fields are ``Path`` objects and
            whose ``all_schema`` is the ``COLUMNS`` entry of the schema file.
        """
        stage_cfg = self.config.data_validation
        expected_columns = self.schema.COLUMNS

        create_directories([stage_cfg.root_dir])

        # DataValidationConfig declares these fields as Path; wrap the values
        # loaded from YAML so the object matches its own annotations
        # (Path() also accepts values that already are paths).
        return DataValidationConfig(
            root_dir=Path(stage_cfg.root_dir),
            local_data_file=Path(stage_cfg.local_data_file),
            STATUS_FILE=Path(stage_cfg.STATUS_FILE),
            all_schema=expected_columns
        )

In [6]:
import pandas as pd
from hivclass import logger

class DataValidation:
    """Checks the dataset CSV against the expected columns and records the outcome.

    The outcome of each run is written to ``config.STATUS_FILE`` as
    ``Validation status: <True|False>``.
    """

    def __init__(self, config: DataValidationConfig):
        """Keep the configuration that supplies the data file, status file and schema."""
        self.config = config

    def _write_status(self, validation_status: bool) -> None:
        """Overwrite the status file with the given validation outcome."""
        with open(self.config.STATUS_FILE, 'w') as f:
            f.write(f"Validation status: {validation_status}")

    def validate_dataset(self) -> bool:
        """Validate the CSV referenced by the configuration.

        The dataset passes when its set of column names equals the set of
        names in ``config.all_schema`` and no column contains missing values.

        Returns:
            True when every check passes, False otherwise.

        Raises:
            Exception: Any error raised while reading or checking the data is
                logged and re-raised. Before re-raising, the status file is
                overwritten with ``False`` (best effort) so that a result left
                over from an earlier successful run cannot be mistaken for
                the outcome of this run.
        """
        try:
            # Read dataset
            data_df = pd.read_csv(self.config.local_data_file)

            expected_cols = set(self.config.all_schema)
            actual_cols = set(data_df.columns)

            # Validate column names
            if actual_cols != expected_cols:
                validation_status = False
                logger.info("Columns in the dataset CSV file do not match the schema!")
                # Say which names differ; key=str keeps sorting safe even if
                # the labels are not all strings.
                logger.info(
                    f"Missing columns: {sorted(expected_cols - actual_cols, key=str)}; "
                    f"unexpected columns: {sorted(actual_cols - expected_cols, key=str)}"
                )
            else:
                # Check for missing values in any column (mask computed once)
                null_flags = data_df.isnull().any()
                if null_flags.any():
                    validation_status = False
                    missing_cols = data_df.columns[null_flags].tolist()
                    logger.info(f"The following columns contain missing values: {missing_cols}")
                else:
                    validation_status = True  # Passes all checks

            # Write final validation status once
            self._write_status(validation_status)

            return validation_status

        except Exception as e:
            logger.error(f"Dataset validation failed: {str(e)}")
            # Do not leave a stale status behind; a failure to write it must
            # not mask the original error.
            try:
                self._write_status(False)
            except OSError as status_err:
                logger.error(f"Could not update the validation status file: {status_err}")
            raise

In [8]:
# Run the data-validation stage end to end.
# No try/except wrapper: catching an exception only to re-raise it unchanged
# adds nothing, and leaving the call as the cell's last expression lets the
# notebook display the boolean validation result.
config = ConfigurationMananger()
data_validation_config = config.get_data_validation_config()
data_validation = DataValidation(config=data_validation_config)
data_validation.validate_dataset()

[2025-03-03 21:59:00,071: INFO: main_utils: created directory at: artifacts]
[2025-03-03 21:59:00,074: INFO: main_utils: created directory at: artifacts/data_validation]
