## Data Validation

In [2]:
%pwd

'c:\\Users\\user\\Desktop\\moniepoint\\notebooks'

In [3]:
from dataclasses import dataclass
from pathlib import Path
import os

In [4]:
# Step up one directory from the notebook's working directory (the %pwd output
# around this cell shows notebooks/ -> project root); presumably so the relative
# paths used by the config files resolve - confirm against config.yaml.
# NOTE: the path is relative, so re-running this cell moves up one more level.
os.chdir("../")

In [5]:
%pwd

'c:\\Users\\user\\Desktop\\moniepoint'

In [6]:
from src.anomaly_detection.utils.common import load_yaml, create_directories

In [8]:
from src.anomaly_detection.constant import *

C:\Users\user\Desktop\moniepoint


In [10]:
from typing import Optional

In [25]:
@dataclass(frozen=True)
class DataValidationConfig:
    """
    Configuration class for data validation settings.

    Attributes:
        root_dir (Path): Root directory for storing data validation artifacts.
        STATUS_FILE (str): Path to the file where the validation status will be recorded.
        data_path (Path): CSV file checked by the first validation pass.
        data_path2 (Path): CSV file checked by the later validation passes.
        all_schema (dict): Mapping of column name to that column's schema metadata
            (for example its 'type', 'description' and 'constraints').
        table_info (Optional[dict]): Full schema entry of the selected table, or
            None when it is not supplied.

    Notes:
        - The class is frozen, ensuring immutability after instantiation.
        - The annotations are documentation only; dataclasses do not enforce them
          at runtime, so values loaded from YAML are stored as given.
        - The schema dictionary plays a critical role in ensuring data integrity before further processing.
    """
    root_dir: Path
    STATUS_FILE: str
    data_path: Path
    data_path2: Path
    all_schema: dict
    # Receives the whole table mapping (not a string); optional, hence the default.
    table_info: Optional[dict] = None

```yaml
data_validation:
  root_dir: artifacts/data_validation
  data_path: artifacts/data_ingestion/anomaly_detection.csv
  STATUS_FILE: artifacts/data_validation/status.txt
```

In [12]:
# class ConfigurationManager:
#     def __init__(self, config_filepath=CONFIG_FILE_PATH, 
#                  params_filepath=PARAMS_FILE_PATH,
#                  schema_filepath = SCHEMA_FILE_PATH):
#         self.config = load_yaml(config_filepath)
#         self.params = load_yaml(params_filepath)
#         self.schema = load_yaml(schema_filepath)
        
#         create_directories([self.config.artifacts_root])
        
#     def get_data_validation_config(self) -> DataValidationConfig:
#         config = self.config.data_validation
#         schema = self.schema['tables'][0]['columns'][0]
        
#         create_directories([config.root_dir])
        
#         data_validation_config = DataValidationConfig(
#             root_dir = config.root_dir,
#             STATUS_FILE= config.STATUS_FILE,
#             data_path = config.data_path,
#             all_schema = schema
#         )
#         return data_validation_config

In [26]:
# Load the schema file and print the first column definition of the first table
# to inspect how a column entry is structured.
output = load_yaml(SCHEMA_FILE_PATH)
print(output['tables'][0]['columns'][0])

2025-08-15 01:20:00 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\schema.yaml loaded successfully.


{'name': 'raw_log', 'type': 'string', 'description': 'Full raw transaction log entry.', 'constraints': {'required': True, 'unique': False, 'allowed_patterns': ['^(?!.*MLAFORMED_LOG).+$'], 'min_length': 1}}


In [59]:
class ConfigurationManager:
    """Load the config, params and schema YAML files and build stage configs from them."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH):
        """
        Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = load_yaml(config_filepath)
        self.params = load_yaml(params_filepath)
        self.schema = load_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self, table_name='transaction_logs') -> DataValidationConfig:
        """
        Build the data validation config for one table of the schema.

        Args:
            table_name: Value of the 'name' key of the table to select from
                ``schema['tables']``.

        Returns:
            DataValidationConfig whose ``all_schema`` maps each column name of the
            table to its 'type', 'description' and 'constraints', and whose
            ``table_info`` is the full table entry.

        Raises:
            ValueError: If no table named ``table_name`` exists in the schema.
        """
        config = self.config.data_validation

        # First table whose name matches, or None when there is no such table.
        table_schema = next(
            (table for table in self.schema['tables'] if table['name'] == table_name),
            None,
        )
        if table_schema is None:
            raise ValueError(f"Table '{table_name}' not found in schema")

        # 'type' is mandatory; 'description' and 'constraints' fall back to empty values.
        columns_info = {
            column['name']: {
                'type': column['type'],
                'description': column.get('description', ''),
                'constraints': column.get('constraints', {}),
            }
            for column in table_schema['columns']
        }

        create_directories([config.root_dir])

        return DataValidationConfig(
            root_dir=config.root_dir,
            STATUS_FILE=config.STATUS_FILE,
            data_path=config.data_path,
            data_path2=config.data_path2,
            all_schema=columns_info,
            table_info=table_schema,
        )

In [28]:
import pandas as pd 

In [29]:
class DataValidation:
    """Check that the CSV at ``config.data_path`` contains only columns declared in the schema."""

    def __init__(self, config: "DataValidationConfig"):
        """Store the validation config (reads ``data_path``, ``all_schema``, ``STATUS_FILE``)."""
        self.config = config

    def validate_data(self):
        """
        Validate the column names of the CSV against the schema.

        The data passes only when every column of the CSV is a key of
        ``config.all_schema``. Schema columns that are absent from the CSV are
        not flagged by this check. The outcome is written once to
        ``config.STATUS_FILE`` as ``Validation status: <bool>``.

        Returns:
            bool: True when no unexpected column is present, otherwise False.
        """
        data = pd.read_csv(self.config.data_path)
        schema_columns = self.config.all_schema.keys()

        # Collect every offending column so one bad column anywhere fails the
        # check, rather than letting the last column decide the result.
        unexpected_columns = [col for col in data.columns if col not in schema_columns]
        validation_status = not unexpected_columns

        with open(self.config.STATUS_FILE, "w") as f:
            f.write(f"Validation status: {validation_status}")

        return validation_status

In [30]:
# Run column validation for the default table. Exceptions propagate unchanged,
# so no try/except wrapper that only re-raises is needed.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config()
data_validation = DataValidation(config=data_validation_config)
# Bound to a name so the cell does not echo the value and the outcome can be inspected.
validation_status = data_validation.validate_data()

2025-08-15 01:23:19 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\config\config.yaml loaded successfully.
2025-08-15 01:23:19 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\params.yaml loaded successfully.
2025-08-15 01:23:19 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\schema.yaml loaded successfully.
2025-08-15 01:23:19 - INFO - [common.py:88] - Created directory at: artifacts
2025-08-15 01:23:19 - INFO - [common.py:88] - Created directory at: artifacts/data_validation


In [31]:
class DataValidation:
    """Check that the CSV at ``config.data_path2`` contains only columns declared in the schema."""

    def __init__(self, config: "DataValidationConfig"):
        """Store the validation config (reads ``data_path2``, ``all_schema``, ``STATUS_FILE``)."""
        self.config = config

    def validate_data(self):
        """
        Validate the column names of the CSV against the schema.

        The data passes only when every column of the CSV is a key of
        ``config.all_schema``. Schema columns that are absent from the CSV are
        not flagged by this check. The outcome is written once to
        ``config.STATUS_FILE`` as ``Validation status: <bool>``.

        Returns:
            bool: True when no unexpected column is present, otherwise False.
        """
        data = pd.read_csv(self.config.data_path2)
        schema_columns = self.config.all_schema.keys()

        # Collect every offending column so one bad column anywhere fails the
        # check, rather than letting the last column decide the result.
        unexpected_columns = [col for col in data.columns if col not in schema_columns]
        validation_status = not unexpected_columns

        with open(self.config.STATUS_FILE, "w") as f:
            f.write(f"Validation status: {validation_status}")

        return validation_status

In [35]:
# Run validation for the processed table and show the schema it is checked
# against. Exceptions propagate unchanged, so no re-raising try/except is needed.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config(table_name="processed_transaction_logs")
print(data_validation_config.all_schema)
print(data_validation_config.all_schema.keys())
data_validation = DataValidation(config=data_validation_config)
# Bound to a name so the cell does not echo the value and the outcome can be inspected.
validation_status = data_validation.validate_data()

2025-08-15 01:27:14 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\config\config.yaml loaded successfully.
2025-08-15 01:27:14 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\params.yaml loaded successfully.
2025-08-15 01:27:14 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\schema.yaml loaded successfully.
2025-08-15 01:27:14 - INFO - [common.py:88] - Created directory at: artifacts
2025-08-15 01:27:14 - INFO - [common.py:88] - Created directory at: artifacts/data_validation


{'original_log': {'type': 'string', 'description': 'The original raw log entry before parsing.', 'constraints': Box({'required': True, 'min_length': 1})}, 'datetime': {'type': 'timestamp', 'description': 'Transaction timestamp extracted from original_log.', 'constraints': Box({'required': True})}, 'user_id': {'type': 'integer', 'description': 'User ID extracted from original_log.', 'constraints': Box({'required': True, 'min': 0})}, 'transaction_type': {'type': 'string', 'description': 'Normalized transaction action/type (e.g., withdrawal, deposit, transfer, top-up, payment).', 'constraints': Box({'required': True, 'min_length': 1})}, 'amount': {'type': 'float', 'description': 'Numeric transaction amount.', 'constraints': Box({'required': True, 'min': 0})}, 'currency': {'type': 'string', 'description': 'Currency symbol or code (e.g., £, $, €, or ISO-4217 like GBP, USD, EUR).', 'constraints': Box({'required': False, 'allowed_patterns': ['^[A-Z]{3}$|^[£$€]$']})}, 'location': {'type': 'str

In [39]:
# Run validation for the processed table and show the declared column types and
# names. Exceptions propagate unchanged, so no re-raising try/except is needed.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config(table_name="processed_transaction_logs")
print([col['type'] for col in data_validation_config.all_schema.values()])
print(data_validation_config.all_schema.keys())
data_validation = DataValidation(config=data_validation_config)
# Bound to a name so the cell does not echo the value and the outcome can be inspected.
validation_status = data_validation.validate_data()

2025-08-15 01:41:34 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\config\config.yaml loaded successfully.
2025-08-15 01:41:34 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\params.yaml loaded successfully.
2025-08-15 01:41:34 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\schema.yaml loaded successfully.
2025-08-15 01:41:34 - INFO - [common.py:88] - Created directory at: artifacts
2025-08-15 01:41:34 - INFO - [common.py:88] - Created directory at: artifacts/data_validation


['string', 'timestamp', 'integer', 'string', 'float', 'string', 'string', 'string']
dict_keys(['original_log', 'datetime', 'user_id', 'transaction_type', 'amount', 'currency', 'location', 'device'])


In [40]:
import logging

In [66]:
class DataValidation:
    """Validate column names and dtypes of the CSV at ``config.data_path2`` against the schema."""

    def __init__(self, config):
        """Store the validation config (reads ``data_path2``, ``all_schema``, ``STATUS_FILE``)."""
        self.config = config

    def validate_data(self):
        """
        Compare the CSV's columns and pandas dtypes with the schema.

        The 'datetime' column, when present, is parsed with ``pd.to_datetime``
        and an optional 'row_id' column is ignored. Validation passes only when
        the remaining column names equal the schema keys in the same order and
        the dtype strings equal the schema 'type' values in the same order. The
        outcome is written once to ``config.STATUS_FILE``.

        Returns:
            bool: True when both names and dtypes match, otherwise False.

        Raises:
            Exception: Any error raised while reading or checking the data is
                logged and re-raised unchanged.
        """
        try:
            data = pd.read_csv(self.config.data_path2)

            # Convert only when the column exists, so a missing 'datetime' is
            # reported as a column mismatch instead of crashing with KeyError.
            if 'datetime' in data.columns:
                data['datetime'] = pd.to_datetime(data['datetime'])

            # 'row_id' is not part of the schema; tolerate files that lack it.
            data = data.drop(columns=['row_id'], errors="ignore")

            all_columns = list(data.columns)
            dtypes_list_str = data.dtypes.astype(str).tolist()
            print(f"Data columns: {all_columns}")
            print(f"Data dtypes: {dtypes_list_str}")

            schema_columns = list(self.config.all_schema.keys())
            print(f"Schema columns: {schema_columns}")

            schema_dtypes = [col['type'] for col in self.config.all_schema.values()]
            print(f"Schema dtypes: {schema_dtypes}")

            # Exact match: same names, same order, same length.
            column_name_match = all_columns == schema_columns
            # Positional comparison of dtype strings against the schema types.
            dtype_match = dtypes_list_str == schema_dtypes

            validation_status = column_name_match and dtype_match

            if not column_name_match:
                logging.warning(f"Column mismatch:\nExpected: {schema_columns}\nFound: {all_columns}")
                print(f"Column mismatch - Expected: {schema_columns}, Found: {all_columns}")

            if not dtype_match:
                logging.warning(f"Dtype mismatch:\nExpected: {schema_dtypes}\nFound: {dtypes_list_str}")
                print(f"Dtype mismatch - Expected: {schema_dtypes}, Found: {dtypes_list_str}")

            if validation_status:
                print("All columns and dtypes match successfully.")
                logging.info("Data validation passed successfully")
            else:
                print("Data validation failed")
                logging.error("Data validation failed")

            # Write validation status once
            with open(self.config.STATUS_FILE, "w") as f:
                f.write(f"Validation status: {validation_status}")

            return validation_status

        except Exception as e:
            logging.error(f"Exception during data validation: {str(e)}")
            print(f"Exception occurred: {str(e)}")
            # Bare raise keeps the original traceback intact.
            raise

In [67]:
# Run name-and-dtype validation for the processed table. Exceptions propagate
# unchanged, so no try/except wrapper that only re-raises is needed.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config(table_name="processed_transaction_logs")
data_validation = DataValidation(config=data_validation_config)
# Bound to a name so the cell does not echo the value and the outcome can be inspected.
validation_status = data_validation.validate_data()

2025-08-15 02:04:48 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\config\config.yaml loaded successfully.
2025-08-15 02:04:48 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\params.yaml loaded successfully.
2025-08-15 02:04:48 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\schema.yaml loaded successfully.
2025-08-15 02:04:48 - INFO - [common.py:88] - Created directory at: artifacts
2025-08-15 02:04:48 - INFO - [common.py:88] - Created directory at: artifacts/data_validation
2025-08-15 02:04:48 - INFO - [958848551.py:38] - Data validation passed successfully


Data columns: ['original_log', 'datetime', 'user_id', 'transaction_type', 'amount', 'currency', 'location', 'device']
Data dtypes: ['object', 'datetime64[ns]', 'object', 'object', 'float64', 'object', 'object', 'object']
Schema columns: ['original_log', 'datetime', 'user_id', 'transaction_type', 'amount', 'currency', 'location', 'device']
Schema dtypes: ['object', 'datetime64[ns]', 'object', 'object', 'float64', 'object', 'object', 'object']
All columns and dtypes match successfully.
