In [None]:
# Data Quality Metrics & Scoring Examples

# Task 1:
# Assign scores to a customer dataset based on completeness, uniqueness, and consistency.
# Analyze the overall data quality score and identify areas for improvement.





# Task 2:
# Evaluate a dataset for an online shop using metrics such as accuracy, timeliness, and
# integrity. Calculate the data quality score and provide improvement suggestions.





# Task 3:
# Perform a data quality assessment on a financial dataset, scoring it based on validity,
# precision, and accessibility. Review the results and propose corrective actions.





In [1]:
import pandas as pd
import numpy as np

# ---------------------------
# Helper Functions for Scoring
# ---------------------------

def completeness_score(df, columns=None):
    """
    Calculate completeness score as the proportion of non-missing values.

    df: DataFrame to evaluate.
    columns: optional column label or list-like of labels (list, tuple,
        pandas Index, NumPy array) to score. None or an empty collection
        means "all columns of df".

    Returns (per_column, overall): per_column maps each scored column to its
    share of non-null values; overall is the unweighted mean of those shares,
    or NaN when there is no column to score.
    """
    if columns is None:
        cols = df.columns
    else:
        # A bare string is one column label, not an iterable of characters.
        cols = [columns] if isinstance(columns, str) else list(columns)
        # Truth-testing a pandas Index / NumPy array raises ValueError, so the
        # "nothing selected" fallback is decided on the materialised list.
        if not cols:
            cols = df.columns
    completeness = {col: df[col].notnull().mean() for col in cols}
    if not completeness:
        # np.mean([]) would emit a RuntimeWarning; return NaN explicitly.
        return completeness, np.nan
    overall = np.mean(list(completeness.values()))
    return completeness, overall

def uniqueness_score(df, subset=None):
    """
    Calculate uniqueness score as proportion of unique rows or unique values in columns.

    df: DataFrame to evaluate.
    subset: optional column label or list-like of labels. When given, the
        score is the share of rows that remain after dropping duplicates on
        those columns. When None or empty, each column is scored separately
        as (number of distinct non-null values) / (number of rows).

    Returns (scores, overall). With a subset, scores has the single key
    "unique_<labels joined by '_'>" and overall equals that value. Without a
    subset, overall is the unweighted mean of the per-column ratios.
    A DataFrame with zero rows yields NaN scores instead of dividing by zero.
    """
    total_rows = df.shape[0]

    if isinstance(subset, str):
        # A bare string is a single column label; joining it directly would
        # interleave underscores between its characters.
        subset = [subset]
    # len() instead of truthiness: bool() of a pandas Index raises ValueError.
    has_subset = subset is not None and len(subset) > 0

    if has_subset:
        if total_rows == 0:
            unique_ratio = np.nan
        else:
            unique_ratio = df.drop_duplicates(subset=subset).shape[0] / total_rows
        # str() each label so non-string column labels can be joined too.
        label = '_'.join(str(name) for name in subset)
        return {f"unique_{label}": unique_ratio}, unique_ratio

    if total_rows == 0:
        uniqueness = {col: np.nan for col in df.columns}
    else:
        uniqueness = {col: df[col].nunique() / total_rows for col in df.columns}
    if not uniqueness:
        # No columns at all: avoid the RuntimeWarning from np.mean([]).
        return uniqueness, np.nan
    overall = np.mean(list(uniqueness.values()))
    return uniqueness, overall

def consistency_score(df, rules):
    """
    Score how well columns follow caller-supplied consistency rules.

    rules: dict mapping a column name to a function that receives that column
    (a Series) and returns a boolean Series marking the conforming values.

    Returns (per_column, overall): the mean of each rule's result per column,
    and the unweighted mean of those per-column values.
    """
    per_column = {
        name: check(df[name]).mean()
        for name, check in rules.items()
    }
    column_scores = list(per_column.values())
    return per_column, np.mean(column_scores)

def accuracy_score(df, valid_values_dict):
    """
    Measure accuracy as the share of values that belong to an allowed set.

    valid_values_dict: dict mapping a column name to the set/list of values
    considered valid for that column.

    Returns (per_column, overall): the fraction of rows whose value is in the
    allowed collection for each column, and the unweighted mean of those
    fractions.
    """
    per_column = {
        name: df[name].isin(allowed).mean()
        for name, allowed in valid_values_dict.items()
    }
    column_scores = list(per_column.values())
    return per_column, np.mean(column_scores)

def timeliness_score(df, date_col, reference_date):
    """
    Measure timeliness as the share of records dated on or before reference_date.

    The values of date_col are parsed with pd.to_datetime(errors='coerce'),
    so entries that cannot be parsed become NaT before the comparison.
    """
    parsed_dates = pd.to_datetime(df[date_col], errors='coerce')
    on_or_before = parsed_dates <= reference_date
    return on_or_before.mean()

def integrity_score(df, foreign_key_col, valid_keys):
    """
    Measure referential integrity of a foreign-key column.

    Returns the fraction of foreign_key_col values that are present in
    valid_keys.
    """
    key_is_known = df[foreign_key_col].isin(valid_keys)
    return key_is_known.mean()

def validity_score(df, validation_funcs):
    """
    Score columns against caller-supplied validation functions.

    validation_funcs: dict mapping a column name to a function that takes the
    column (a Series) and returns a boolean Series of valid entries.

    Returns (per_column, overall): the mean of each validator's result, and
    the unweighted mean across the validated columns.
    """
    validity = {}
    for column in validation_funcs:
        validator = validation_funcs[column]
        passed = validator(df[column])
        validity[column] = passed.mean()
    column_scores = list(validity.values())
    return validity, np.mean(column_scores)

def precision_score(df, col, decimal_places):
    """
    Calculate precision as proportion of values with specified decimal places.

    df: DataFrame to evaluate.
    col: name of the column to check.
    decimal_places: number of digits expected after the decimal point.

    Numeric values (int / float) cannot carry trailing zeros: the float
    100.00 is stored as 100.0 and 150.50 as 150.5. Counting digits in their
    string form therefore rejects correctly rounded amounts. For numbers the
    check is instead "rounding to decimal_places does not change the value",
    i.e. the value needs no more than decimal_places fractional digits.
    Non-finite floats never pass.

    Other values (typically strings) keep their formatting, so the text after
    the last '.' of str(value) must be exactly decimal_places characters long;
    a value without '.' passes only when decimal_places == 0.

    Missing values always count as failing. Returns the mean of the
    per-value results.
    """
    def check_precision(x):
        if pd.isnull(x):
            return False
        is_number = isinstance(x, (int, float, np.integer, np.floating))
        # bool is a subclass of int; keep it on the text path below.
        if is_number and not isinstance(x, bool):
            if isinstance(x, (int, np.integer)):
                # Integers have no fractional digits at all.
                return decimal_places >= 0
            value = float(x)
            if not np.isfinite(value):
                return False
            return bool(round(value, decimal_places) == value)
        s = str(x)
        if '.' in s:
            return len(s.split('.')[-1]) == decimal_places
        return decimal_places == 0
    precision = df[col].apply(check_precision).mean()
    return precision

def accessibility_score(df, accessibility_info):
    """
    Placeholder accessibility metric: the share of accessible records.

    accessibility_info: a Series of True/False flags, or a dict whose values
    are such flags. The df argument is accepted but not used.

    Returns the mean of the flags, or None when accessibility_info is neither
    a Series nor a dict.
    """
    if isinstance(accessibility_info, pd.Series):
        return accessibility_info.mean()
    if not isinstance(accessibility_info, dict):
        return None
    flags = list(accessibility_info.values())
    return np.mean(flags)

# ---------------------------
# Task 1: Customer Dataset Quality Scoring
# ---------------------------

print("=== Task 1: Customer Dataset Quality Scoring ===")
# Small demo dataset: the last row repeats customer 5, and there is one
# missing email and one missing phone number.
customer_data = pd.DataFrame({
    'customer_id': [1, 2, 3, 4, 5, 5],
    'email': ['a@example.com', 'b@example.com', None, 'd@example.com', 'e@example.com', 'e@example.com'],
    'phone': ['1234567890', '0987654321', '1231231234', None, '5555555555', '5555555555']
})

# Completeness: proportion of non-null values, per column and averaged
comp_scores, comp_overall = completeness_score(customer_data)
print("Completeness per column:", comp_scores)
print("Overall completeness score:", comp_overall)

# Uniqueness: share of rows left after de-duplicating on customer_id
# (email is not part of the subset, so it is not checked here)
uniq_scores, uniq_overall = uniqueness_score(customer_data, subset=['customer_id'])
print("Uniqueness score (customer_id):", uniq_scores)
print("Overall uniqueness score:", uniq_overall)

# Consistency: phone numbers should be 10 digits
# NOTE(review): the recorded run printed 1.0 for 'phone' even though one
# phone is missing, so the missing entry did not lower this score.
consistency_rules = {
    'phone': lambda s: s.str.fullmatch(r'\d{10}')
}
cons_scores, cons_overall = consistency_score(customer_data, consistency_rules)
print("Consistency per column:", cons_scores)
print("Overall consistency score:", cons_overall)

# Aggregate overall score (simple average, all three dimensions weighted equally)
overall_score = np.mean([comp_overall, uniq_overall, cons_overall])
print("Overall data quality score:", overall_score)
# The recommendation below is a fixed message; it is not derived from the scores.
print("Areas for improvement: Check missing emails and phone number formats.\n")

# ---------------------------
# Task 2: Online Shop Dataset Quality Evaluation
# ---------------------------

print("=== Task 2: Online Shop Dataset Quality Evaluation ===")
# Small demo dataset: four orders, the last one has no order_date.
shop_data = pd.DataFrame({
    'order_id': [101, 102, 103, 104],
    'order_date': ['2025-01-10', '2025-01-15', '2025-01-20', None],
    'product_id': ['P001', 'P002', 'P002', 'P003'],
    'quantity': [1, 2, 2, 3]
})

# Accuracy: product_id in valid set
valid_products = {'P001', 'P002', 'P003', 'P004'}
acc_scores, acc_overall = accuracy_score(shop_data, {'product_id': valid_products})
print("Accuracy per column:", acc_scores)
print("Overall accuracy score:", acc_overall)

# Timeliness: order_date <= today
# "today" is a fixed timestamp so the demo output is reproducible.
today = pd.Timestamp('2025-01-25')
time_score = timeliness_score(shop_data, 'order_date', today)
print("Timeliness score:", time_score)

# Integrity: product_id foreign key integrity
# NOTE(review): this checks product_id against the same valid_products set
# as the accuracy step above, so both metrics report the same value here.
integrity_score_val = integrity_score(shop_data, 'product_id', valid_products)
print("Integrity score:", integrity_score_val)

# Aggregate overall score (simple average of the three metrics)
overall_shop_score = np.mean([acc_overall, time_score, integrity_score_val])
print("Overall data quality score:", overall_shop_score)
# The suggestion below is a fixed message; it is not derived from the scores.
print("Suggestions: Fill missing order dates, verify product IDs.\n")

# ---------------------------
# Task 3: Financial Dataset Quality Assessment
# ---------------------------

print("=== Task 3: Financial Dataset Quality Assessment ===")
# Small demo dataset: one missing amount, one unparseable date and one
# record flagged as not accessible.
financial_data = pd.DataFrame({
    'transaction_id': [1001, 1002, 1003, 1004],
    'amount': [100.00, 200.123, 150.50, None],
    'transaction_date': ['2025-01-01', '2025-01-05', 'invalid-date', '2025-01-10'],
    'accessible': [True, True, False, True]
})

# Validity: amount should be positive, transaction_date valid date
validity_funcs = {
    'amount': lambda s: s > 0,
    # errors='coerce' turns unparseable dates into NaT, which notnull() rejects
    'transaction_date': lambda s: pd.to_datetime(s, errors='coerce').notnull()
}
valid_scores, valid_overall = validity_score(financial_data, validity_funcs)
print("Validity per column:", valid_scores)
print("Overall validity score:", valid_overall)

# Precision: amount with 2 decimal places
# NOTE(review): 'amount' holds floats, so literals such as 100.00 and 150.50
# are stored without their trailing zeros.
prec_score = precision_score(financial_data, 'amount', 2)
print("Precision score (amount):", prec_score)

# Accessibility: proportion of accessible records
# (acc_score here is the accessibility score, not an accuracy score)
acc_score = accessibility_score(financial_data, financial_data['accessible'])
print("Accessibility score:", acc_score)

# Aggregate overall score (simple average of the three metrics)
overall_fin_score = np.mean([valid_overall, prec_score, acc_score])
print("Overall financial data quality score:", overall_fin_score)
# The corrective actions below are a fixed message; they are not derived from the scores.
print("Corrective actions: Fix invalid dates, handle missing amounts, ensure consistent decimal precision.\n")


=== Task 1: Customer Dataset Quality Scoring ===
Completeness per column: {'customer_id': np.float64(1.0), 'email': np.float64(0.8333333333333334), 'phone': np.float64(0.8333333333333334)}
Overall completeness score: 0.888888888888889
Uniqueness score (customer_id): {'unique_customer_id': 0.8333333333333334}
Overall uniqueness score: 0.8333333333333334
Consistency per column: {'phone': np.float64(1.0)}
Overall consistency score: 1.0
Overall data quality score: 0.9074074074074074
Areas for improvement: Check missing emails and phone number formats.

=== Task 2: Online Shop Dataset Quality Evaluation ===
Accuracy per column: {'product_id': np.float64(1.0)}
Overall accuracy score: 1.0
Timeliness score: 0.75
Integrity score: 1.0
Overall data quality score: 0.9166666666666666
Suggestions: Fill missing order dates, verify product IDs.

=== Task 3: Financial Dataset Quality Assessment ===
Validity per column: {'amount': np.float64(0.75), 'transaction_date': np.float64(0.75)}
Overall validity 