## Data Quality Framework Implementation

**Description**: Implement a simple data quality measurement framework, based on ISO 8000 principles, that scores a dataset on four key quality dimensions: completeness, uniqueness, validity, and consistency.

In [2]:
# A simple data quality framework, implemented as runnable Python on top of pandas:
import pandas as pd
import numpy as np
import re

class DataQualityFramework:
    """Collect simple data quality scores for a pandas DataFrame.

    Each ``check_*`` method computes one quality dimension and stores the
    result in ``self.report``; ``generate_report`` prints whatever has been
    stored so far. Scores are fractions where 1.0 means every row (or value)
    passed.
    """

    def __init__(self, df):
        """Store a private copy of *df* and start with an empty report."""
        # Work on a copy so the checks never touch the caller's frame.
        self.df = df.copy()
        self.report = {}

    def check_completeness(self):
        """Record, per column, the fraction of values that are NOT missing.

        Stored under ``report['Completeness']`` as ``{column: fraction}``.
        """
        # isnull().mean() is the missing share; subtract from 1 for completeness.
        completeness = 1 - self.df.isnull().mean()
        self.report['Completeness'] = completeness.to_dict()

    def check_uniqueness(self, subset=None):
        """Record the fraction of rows that remain after dropping duplicates.

        Args:
            subset: Optional column label(s) used to identify duplicates.
                When falsy (``None`` or empty), all columns are compared.

        The result is stored under ``'Uniqueness (<subset>)'`` or
        ``'Uniqueness (All columns)'``. For a frame with no rows the ratio is
        undefined and NaN is stored instead of raising ``ZeroDivisionError``.
        """
        label = f'Uniqueness ({subset})' if subset else 'Uniqueness (All columns)'
        total_rows = self.df.shape[0]
        if total_rows == 0:
            # No rows: nothing to divide by, so report "undefined".
            self.report[label] = float('nan')
            return
        # A falsy subset means "compare all columns" (drop_duplicates default).
        distinct_rows = self.df.drop_duplicates(subset=subset or None).shape[0]
        self.report[label] = distinct_rows / total_rows

    def check_validity(self, column_validators):
        """Record, per column, the fraction of values accepted by a validator.

        Args:
            column_validators: dict ``{column_name: validator}`` where
                ``validator(value)`` returns a truthy/falsy result.

        Missing values are counted as valid (absence is the concern of
        ``check_completeness``). Columns not present in the frame get
        ``None``. Stored under ``report['Validity']``.
        """
        validity_results = {}
        for col, validator in column_validators.items():
            if col not in self.df.columns:
                validity_results[col] = None
                continue
            # Validators only ever see non-null values; nulls pass by default.
            valid_mask = self.df[col].apply(
                lambda x: validator(x) if pd.notnull(x) else True
            )
            validity_results[col] = valid_mask.mean()
        self.report['Validity'] = validity_results

    def check_consistency(self, consistency_checks):
        """Record the pass rate of each cross-field consistency rule.

        Args:
            consistency_checks: list of functions ``f(df)`` returning a
                boolean Series (True = row is consistent).

        Stored under ``report['Consistency']`` as a list of fractions, in the
        same order as the supplied checks.
        """
        self.report['Consistency'] = [
            check_fn(self.df).mean() for check_fn in consistency_checks
        ]

    def generate_report(self):
        """Print every stored result as a percentage, grouped by dimension."""
        print("=== Data Quality Report ===")
        for dimension, result in self.report.items():
            print(f"\n{dimension}:")
            if isinstance(result, dict):
                # Per-column results; None marks a column that was not found.
                for k, v in result.items():
                    print(f"  {k}: {v:.2%}" if v is not None else f"  {k}: N/A")
            elif isinstance(result, list):
                # Ordered results (consistency checks), numbered from 1.
                for i, v in enumerate(result):
                    print(f"  Check {i+1}: {v:.2%}")
            else:
                # Single scalar score (uniqueness).
                print(f"  {result:.2%}")

# Example usage:

# Demo dataset with deliberate quality problems: the ID 3 row appears twice,
# one Age is missing, and 'b@example' is not a well-formed email address.
data = dict(
    ID=[1, 2, 3, 3, 4],
    Age=[25, 30, 22, 22, None],
    Email=[
        'a@example.com',
        'b@example',
        'c@example.com',
        'c@example.com',
        'd@example.com',
    ],
    Salary=[50000, 60000, 55000, 55000, 52000],
)
df = pd.DataFrame(data=data)

# Define Validators
def valid_age(x):
    """Return True when *x* is a real number in the inclusive range 0..120.

    Accepts Python ``int``/``float`` and NumPy integer/floating scalars.
    Booleans are rejected even though ``bool`` is a subclass of ``int``,
    and NaN is rejected because it fails the range comparison.
    """
    # True/False would otherwise pass as the ages 1 and 0.
    if isinstance(x, bool):
        return False
    if not isinstance(x, (int, float, np.integer, np.floating)):
        return False
    return bool(0 <= x <= 120)

def valid_email(x):
    """Return True when *x* is a string shaped like ``local@domain.tld``.

    This is a light structural check (word characters, dots and hyphens
    around a single ``@``, followed by a dot and a word-character suffix),
    not full RFC 5322 validation. Non-string input is never valid.
    """
    if not isinstance(x, str):
        return False
    # fullmatch instead of match + '$': '$' also matches just before a
    # trailing newline, which would let "a@example.com\n" through.
    return re.fullmatch(r'[\w.-]+@[\w.-]+\.\w+', x) is not None

# Define Consistency Checks
def salary_positive(df):
    """Return a boolean Series that is True where ``Salary`` is >= 0.

    Note that zero counts as consistent: the rule is "not negative".
    """
    salaries = df['Salary']
    return salaries.ge(0)

def age_salary_relation(df):
    """Return a boolean Series that is True where a row satisfies the demo rule.

    Example rule: anyone older than 20 should earn at least 20000. A row is
    flagged inconsistent only when BOTH ``Age > 20`` and ``Salary < 20000``
    hold, so a NaN age or salary (which compares False) counts as consistent.
    """
    over_twenty = df['Age'].gt(20)
    under_threshold = df['Salary'].lt(20000)
    violates_rule = over_twenty & under_threshold
    return ~violates_rule

# Wrap the sample frame in the framework
dq = DataQualityFramework(df)

# Fill the report one quality dimension at a time; the report lists
# dimensions in the order the checks are run here.
dq.check_completeness()
dq.check_uniqueness(subset=['ID'])
dq.check_validity(
    column_validators={'Age': valid_age, 'Email': valid_email},
)
dq.check_consistency(
    consistency_checks=[salary_positive, age_salary_relation],
)

# Write the collected scores to stdout
dq.generate_report()



=== Data Quality Report ===

Completeness:
  ID: 100.00%
  Age: 80.00%
  Email: 100.00%
  Salary: 100.00%

Uniqueness (['ID']):
  80.00%

Validity:
  Age: 100.00%
  Email: 80.00%

Consistency:
  Check 1: 100.00%
  Check 2: 100.00%
