## Data Quality Framework Implementation

**Description**: Implement a simple data quality measurement framework using ISO 8000 principles to assess key dimensions in a dataset.

In [2]:
import pandas as pd
import numpy as np

# Sample dataset (this is your 'dataset')
dataset = pd.DataFrame({
    "Name": ["Alice", "Bob", "Charlie", None],
    "Age": [25, 200, None, 40],
    "Gender": ["F", "M", "X", "F"],
    "Join_Date": ["2021-01-01", "not_a_date", "2020-05-20", "2022-03-15"]
})

# Data Element Definition (DED)
DED = {
    "Name": {"type": "string", "required": True},
    "Age": {"type": "integer", "min": 0, "max": 120, "required": False},
    "Gender": {"type": "string", "allowed_values": ["M", "F"], "required": True},
    "Join_Date": {"type": "datetime", "required": True}
}

# Helper functions
def non_null_count(series):
    return series.notnull().sum()

def total_count(df):
    return len(df)

def type_check_score(series, expected_type):
    if expected_type == "integer":
        return 100 * pd.to_numeric(series, errors='coerce').dropna().apply(float.is_integer).mean()
    elif expected_type == "string":
        return 100 * series.dropna().apply(lambda x: isinstance(x, str)).mean()
    elif expected_type == "datetime":
        return 100 * pd.to_datetime(series, errors='coerce').notnull().mean()
    else:
        return 0

# Data Evaluation (DE)
def evaluate_data_quality(df, DED):
    results = {}

    for column, rules in DED.items():
        col_result = {}

        # Completeness
        completeness_score = (non_null_count(df[column]) / total_count(df)) * 100
        col_result["completeness (%)"] = round(completeness_score, 2)

        # Validity
        if "allowed_values" in rules:
            valid_entries = df[column].isin(rules["allowed_values"]).sum()
            validity_score = (valid_entries / total_count(df)) * 100
            col_result["validity (%)"] = round(validity_score, 2)

        # Consistency (Data type)
        consistency_score = type_check_score(df[column], rules["type"])
        col_result["consistency (%)"] = round(consistency_score, 2)

        # Accuracy and Timeliness placeholders
        col_result["accuracy"] = "N/A"
        col_result["timeliness"] = "N/A"

        results[column] = col_result

    return pd.DataFrame(results).T  # Transpose for better readability

# Step 4: Generate Data Quality Report
data_quality_report = evaluate_data_quality(dataset, DED)
print("\nüîç Data Quality Report:\n")
print(data_quality_report)


üîç Data Quality Report:

          completeness (%) consistency (%) accuracy timeliness validity (%)
Name                  75.0           100.0      N/A        N/A          NaN
Age                   75.0           100.0      N/A        N/A          NaN
Gender               100.0           100.0      N/A        N/A         75.0
Join_Date            100.0            75.0      N/A        N/A          NaN
