## Data Quality Framework Implementation

**Description**: Implement a simple data quality measurement framework using ISO 8000 principles to assess key dimensions in a dataset.

In [1]:
# A conceptual data quality framework, implemented as runnable Python:
import pandas as pd
import numpy as np

class DataQualityFramework:
    """Compute simple data quality scores for a pandas DataFrame.

    Every score is a percentage in the range 0-100 expressed as a builtin
    ``float`` (``nan`` when the frame has no rows).  Each assessment method
    stores its result in ``self.report`` under the name of the dimension and
    also returns it.
    """

    def __init__(self, df):
        """Store a private copy of *df* and start with an empty report.

        Raises:
            TypeError: if *df* is not a ``pandas.DataFrame``.
        """
        if not isinstance(df, pd.DataFrame):
            raise TypeError("Input data must be a pandas DataFrame")
        # Work on a copy so later changes to the caller's frame do not
        # affect the scores computed here.
        self.df = df.copy()
        self.report = {}

    def _score_columns(self, column_checks):
        """Return ``{column: % of rows whose value passes its check}``.

        Shared implementation of :meth:`validity` and :meth:`accuracy`.
        Missing values never pass and are still counted in the denominator.
        Columns that are not present in the frame map to ``None``.
        """
        total_rows = len(self.df)
        scores = {}
        for col, check in column_checks.items():
            if col not in self.df.columns:
                scores[col] = None
                continue
            if total_rows == 0:
                scores[col] = np.nan
                continue
            # bool() makes every row count as exactly 0 or 1, so checks that
            # return truthy non-bool objects (regex matches, strings,
            # numbers) neither break the summation nor inflate the score.
            passed = sum(
                1
                for value in self.df[col]
                if pd.notnull(value) and bool(check(value))
            )
            scores[col] = 100 * passed / total_rows
        return scores

    def completeness(self):
        """Assess completeness as percentage of non-missing values per column.

        Returns:
            dict mapping each column name to its completeness percentage.
        """
        completeness_scores = 100 * (1 - self.df.isnull().mean())
        self.report['completeness'] = completeness_scores.to_dict()
        return self.report['completeness']

    def uniqueness(self):
        """Assess uniqueness as % of unique rows in the dataset.

        Returns:
            Percentage of rows remaining after dropping exact duplicates,
            or ``nan`` for a frame without rows.
        """
        total_rows = len(self.df)
        unique_rows = len(self.df.drop_duplicates())
        uniqueness_score = (unique_rows / total_rows) * 100 if total_rows > 0 else np.nan
        self.report['uniqueness'] = uniqueness_score
        return uniqueness_score

    def validity(self, column_rules):
        """
        Assess validity based on user-defined rules.
        `column_rules` is a dict: {column_name: function(value) -> bool}

        The rule is only called for non-missing values; missing values count
        as invalid.  Any truthy rule result counts as valid.

        Returns:
            dict mapping each column name to its validity percentage
            (``None`` for columns that do not exist in the frame).
        """
        validity_report = self._score_columns(column_rules)
        self.report['validity'] = validity_report
        return validity_report

    def consistency(self, consistency_checks):
        """
        Assess consistency based on user-defined binary checks.
        `consistency_checks` is a dict: {name: function(df) -> bool mask of rows}

        Returns:
            dict mapping each check name to the percentage of rows for which
            the mask is true (``nan`` for a frame without rows).
        """
        total_rows = len(self.df)
        consistency_report = {}
        for check_name, check_func in consistency_checks.items():
            valid_mask = check_func(self.df)
            if total_rows > 0:
                # float() keeps the score type uniform with the other
                # dimensions instead of leaking a numpy scalar.
                consistency_score = float(100 * valid_mask.sum() / total_rows)
            else:
                consistency_score = np.nan
            consistency_report[check_name] = consistency_score
        self.report['consistency'] = consistency_report
        return consistency_report

    def accuracy(self, accuracy_checks):
        """
        Assess accuracy based on user-defined accuracy functions.
        `accuracy_checks` is a dict: {column_name: function(value) -> bool}

        The function is only called for non-missing values; missing values
        count as inaccurate.  Any truthy result counts as accurate.

        Returns:
            dict mapping each column name to its accuracy percentage
            (``None`` for columns that do not exist in the frame).
        """
        accuracy_report = self._score_columns(accuracy_checks)
        self.report['accuracy'] = accuracy_report
        return accuracy_report

    def generate_report(self):
        """Return the full report dictionary built by the assessment methods."""
        return self.report


# ====== Example Usage ======

if __name__ == "__main__":
    # Demo dataset: eight records with a few missing entries and one
    # malformed e-mail address ('e@example' has no dot in its domain).
    data = {
        'age': [25, 30, 22, None, 40, 35, 28, 30],
        'income': [50000, 60000, None, 45000, 72000, 68000, None, 60000],
        'gender': ['M', 'F', 'F', 'M', 'F', 'M', 'F', None],
        'email': ['a@example.com', 'b@example.com', None, 'd@example.com', 'e@example', 'f@example.com', 'g@example.com', 'h@example.com']
    }
    df = pd.DataFrame(data)
    dqf = DataQualityFramework(df)

    def _age_in_working_range(value):
        """True when the age lies between 18 and 65 inclusive."""
        return 18 <= value <= 65

    def _is_positive(value):
        """True for strictly positive amounts."""
        return value > 0

    def _looks_like_email(value):
        """True for strings that contain '@' and a dot after the last '@'."""
        if not isinstance(value, str):
            return False
        if '@' not in value:
            return False
        domain = value.rsplit('@', 1)[-1]
        return '.' in domain

    def _income_in_expected_band(value):
        """True when the income lies between 20000 and 100000 inclusive."""
        return 20000 <= value <= 100000

    # --- Completeness -----------------------------------------------------
    print("Completeness (non-missing % per column):")
    print(dqf.completeness())

    # --- Uniqueness -------------------------------------------------------
    print("\nUniqueness (% unique rows):")
    print(dqf.uniqueness())

    # --- Validity: one rule per column ------------------------------------
    validity_rules = {
        'age': _age_in_working_range,
        'income': _is_positive,
        'email': _looks_like_email,
    }
    print("\nValidity scores per column:")
    print(dqf.validity(validity_rules))

    # --- Consistency: a row passes when age is strictly above 18 and
    # income is present --------------------------------------------------
    consistency_checks = {
        'age_income_consistency': lambda frame: (frame['age'] > 18) & (frame['income'].notnull()),
    }
    print("\nConsistency scores:")
    print(dqf.consistency(consistency_checks))

    # --- Accuracy: plain range checks standing in for domain-specific ones
    accuracy_checks = {
        'age': _age_in_working_range,
        'income': _income_in_expected_band,
    }
    print("\nAccuracy scores:")
    print(dqf.accuracy(accuracy_checks))

    # --- Everything collected so far --------------------------------------
    print("\nFull data quality report:")
    print(dqf.generate_report())


Completeness (non-missing % per column):
{'age': 87.5, 'income': 75.0, 'gender': 87.5, 'email': 87.5}

Uniqueness (% unique rows):
100.0

Validity scores per column:
{'age': 87.5, 'income': 75.0, 'email': 75.0}

Consistency scores:
{'age_income_consistency': 62.5}

Accuracy scores:
{'age': 87.5, 'income': 75.0}

Full data quality report:
{'completeness': {'age': 87.5, 'income': 75.0, 'gender': 87.5, 'email': 87.5}, 'uniqueness': 100.0, 'validity': {'age': 87.5, 'income': 75.0, 'email': 75.0}, 'consistency': {'age_income_consistency': 62.5}, 'accuracy': {'age': 87.5, 'income': 75.0}}
