## Data Quality Framework Implementation

**Description**: Implement a simple data quality measurement framework, based on ISO 8000 principles, that scores a dataset on five quality dimensions: completeness, accuracy, uniqueness, validity, and consistency.

In [3]:
import pandas as pd
import numpy as np
import re

# Sample dataset: six records with deliberately planted quality problems
# (a repeated ID, missing Name/Email/Age values, and one malformed email).
data = dict(
    ID=[1, 2, 3, 4, 5, 5],
    Name=['Alice', 'Bob', 'Charlie', 'David', None, 'Eve'],
    Email=[
        'alice@example.com',
        'bob@example.com',
        'invalid_email',
        'david@example.com',
        'eve@example.com',
        None,
    ],
    Age=[25, 30, 35, None, 22, 29],
)

df = pd.DataFrame(data)

# === ISO 8000-Based Data Quality Checks ===

# 1. Completeness
def check_completeness(df):
    """Return the percentage of non-null values in each column.

    Args:
        df: DataFrame to assess.

    Returns:
        A Series indexed by column name holding the share of non-null
        cells in that column, as a percentage rounded to two decimals.
        A DataFrame with no rows yields 0.0 for every column.
    """
    row_count = len(df)
    if row_count == 0:
        # Avoid 0/0 -> NaN; with no data to score, report 0 for each column.
        return pd.Series(0.0, index=df.columns)
    return (100 * df.notnull().sum() / row_count).round(2)

# 2. Accuracy (simulate by comparing to reference values)
def check_accuracy(df, column, reference_values):
    """Return the percentage of non-null values found in a reference set.

    Args:
        df: DataFrame to assess.
        column: Name of the column to check.
        reference_values: List-like of values regarded as correct.

    Returns:
        Matching non-null values as a percentage of all non-null values,
        rounded to two decimals, or 0 when the column has no non-null
        values.
    """
    # Drop nulls before matching: isin() treats NaN/None as a match when the
    # reference list contains one, which would count missing cells as hits
    # and could push the score past 100%.
    present = df[column].dropna()
    total = len(present)
    if total == 0:
        return 0
    matches = present.isin(reference_values).sum()
    return round(100 * matches / total, 2)

# 3. Uniqueness (e.g., for ID column)
def check_uniqueness(df, column):
    """Return distinct values in *column* as a percentage of its non-null values.

    Args:
        df: DataFrame to assess.
        column: Name of the column to check.

    Returns:
        The percentage rounded to two decimals, or 0 when the column holds
        no non-null values.
    """
    series = df[column]
    non_null_count = series.notna().sum()
    if not non_null_count > 0:
        return 0
    distinct_count = series.nunique()
    return round(100 * distinct_count / non_null_count, 2)

# 4. Validity (e.g., email format)
def is_valid_email(email):
    """Return True if *email* has the shape ``local@domain.tld``.

    This is a lightweight format check, not full RFC 5322 validation.

    Args:
        email: Value to check. Anything that is not a string (for example
            None or NaN standing in for a missing value) is invalid.

    Returns:
        True when the whole value consists of exactly one "@" separating two
        non-empty parts, the part after it contains a dot, and no whitespace
        appears anywhere; otherwise False.
    """
    if not isinstance(email, str):
        return False
    # fullmatch anchors both ends; re.match alone would accept any string that
    # merely starts with a valid-looking address, e.g. "a@b.com@evil".
    return re.fullmatch(r"[^@\s]+@[^@\s]+\.[^@\s]+", email) is not None

def check_validity(df, column, validation_func):
    """Return the percentage of non-null values in *column* that pass a validator.

    Args:
        df: DataFrame to assess.
        column: Name of the column to check.
        validation_func: Callable taking one value and returning a truthy
            result when that value is valid.

    Returns:
        Valid values as a percentage of the non-null values, rounded to two
        decimals, or 0 when the column has no non-null values.
    """
    # Validate only the values that are counted in the denominator. Running
    # the validator on null cells as well lets a validator that accepts
    # missing values produce a score above 100%.
    present = df[column].dropna()
    total = len(present)
    if total == 0:
        return 0
    valid = sum(1 for value in present if validation_func(value))
    return round(100 * valid / total, 2)

# 5. Consistency (e.g., share of rows that form a distinct, fully populated ID/Name combination)
def check_consistency(df, columns):
    """Return distinct complete combinations of *columns* as a percentage of rows.

    Rows with a null in any of *columns* are left out when collecting
    combinations, and repeated combinations count once. The denominator is
    the total number of rows in *df*, so both incomplete and duplicated rows
    lower the score.

    Args:
        df: DataFrame to assess.
        columns: List of column names that form the combination.

    Returns:
        The percentage rounded to two decimals, or 0 when *df* has no rows.
    """
    total_rows = len(df)
    if total_rows == 0:
        # Guard the division: an empty frame would raise ZeroDivisionError.
        return 0
    distinct_rows = df[columns].dropna().drop_duplicates()
    return round(100 * len(distinct_rows) / total_rows, 2)

# === Report Results ===

# Banner first, then the per-column completeness table.
print("\n=== ISO 8000 Data Quality Report ===")
print("\nCompleteness (%):")
completeness_by_column = check_completeness(df)
print(completeness_by_column)

# Single-figure scores, one line per dimension. Each score is computed just
# before its line is printed.
id_uniqueness = check_uniqueness(df, 'ID')
print(f"\nUniqueness of 'ID' column: {id_uniqueness}%")

email_validity = check_validity(df, 'Email', is_valid_email)
print(f"Validity of 'Email' column: {email_validity}%")

id_name_consistency = check_consistency(df, ['ID', 'Name'])
print(f"Consistency of ['ID', 'Name']: {id_name_consistency}%")

known_good_emails = [
    'alice@example.com',
    'bob@example.com',
    'david@example.com',
    'eve@example.com',
]
email_accuracy = check_accuracy(df, 'Email', known_good_emails)
print(f"Accuracy of 'Email' vs. known good list: {email_accuracy}%")



=== ISO 8000 Data Quality Report ===

Completeness (%):
ID       100.00
Name      83.33
Email     83.33
Age       83.33
dtype: float64

Uniqueness of 'ID' column: 83.33%
Validity of 'Email' column: 80.0%
Consistency of ['ID', 'Name']: 83.33%
Accuracy of 'Email' vs. known good list: 80.0%
