In [1]:
# Data Quality Metrics & Scoring Examples

# Task 1:
# Assign scores to a customer dataset based on completeness, uniqueness, and consistency.
# Analyze the overall data quality score and identify areas for improvement.





# Task 2:
# Evaluate a dataset for an online shop using metrics such as accuracy, timeliness, and
# integrity. Calculate the data quality score and provide improvement suggestions.





# Task 3:
# Perform a data quality assessment on a financial dataset, scoring it based on validity,
# precision, and accessibility. Review the results and propose corrective actions.





In [2]:
import pandas as pd
import numpy as np
import datetime as dt

# ------------------------
# Sample Datasets
# ------------------------

# Customer dataset for Task 1
data_customers = {
    'CustomerID': [1, 2, 2, 4, 5],
    'Email': ['a@example.com', 'b@example.com', 'b@example.com', None, 'e@example.com'],
    'Phone': ['1234567890', '1234567890', '1234567890', '0987654321', None],
    'Country': ['US', 'US', 'US', 'US', 'USA']  # inconsistency: 'US' vs 'USA'
}
df_customers = pd.DataFrame(data_customers)

# Online shop dataset for Task 2
data_shop = {
    'OrderID': [101, 102, 103, 104],
    'OrderDate': ['2024-04-01', '2024-04-10', '2024-02-30', ''],  # One invalid date, one missing
    'DeliveryDate': ['2024-04-05', '2024-04-12', '2024-03-05', '2024-04-15'],
    'OrderAmount': [250.5, 300, 0, 150]
}
df_shop = pd.DataFrame(data_shop)

# Financial dataset for Task 3
data_financial = {
    'TransactionID': [201, 202, 203, 204],
    'Amount': [1000.00, 500.12345, None, 1200.0],  # One missing, one with extra precision
    'Currency': ['USD', 'usd', 'USD', 'EUR'],       # Inconsistent case
    'Accessible': [True, True, False, True]          # Accessibility flag
}
df_financial = pd.DataFrame(data_financial)

# -------------------------------------
# Task 1: Customer Dataset Data Quality
# -------------------------------------

print("\n--- Task 1: Customer Dataset Data Quality ---")

# Completeness: fraction of non-missing values in key columns
completeness_email = df_customers['Email'].notna().mean()
completeness_phone = df_customers['Phone'].notna().mean()
completeness_score = (completeness_email + completeness_phone) / 2

# Uniqueness: fraction of unique CustomerIDs and unique Emails
uniqueness_customerid = df_customers['CustomerID'].is_unique
uniqueness_email = df_customers['Email'].dropna().is_unique
uniqueness_score = np.mean([uniqueness_customerid, uniqueness_email])

# Consistency: measure of consistent country codes (count mode fraction)
country_mode_frac = df_customers['Country'].mode().size / df_customers['Country'].nunique()
consistency_score = country_mode_frac

overall_quality = np.mean([completeness_score, uniqueness_score, consistency_score])

print(f"Completeness Score: {completeness_score:.2f}")
print(f"Uniqueness Score: {uniqueness_score:.2f}")
print(f"Consistency Score: {consistency_score:.2f}")
print(f"Overall Data Quality Score: {overall_quality:.2f}")

print("Improvement suggestions:")
if completeness_score < 1:
    print("- Fill missing emails and phone numbers.")
if uniqueness_score < 1:
    print("- Remove duplicate customer records.")
if consistency_score < 1:
    print("- Standardize country codes.")

# -------------------------------------
# Task 2: Online Shop Dataset Data Quality
# -------------------------------------

print("\n--- Task 2: Online Shop Dataset Data Quality ---")

# Accuracy: validate dates using pd.to_datetime with errors='coerce'
valid_order_dates = pd.to_datetime(df_shop['OrderDate'], errors='coerce').notna().mean()
valid_delivery_dates = pd.to_datetime(df_shop['DeliveryDate'], errors='coerce').notna().mean()
accuracy_score = (valid_order_dates + valid_delivery_dates) / 2

# Timeliness: orders delivered within 7 days (assuming both dates valid)
df_shop['OrderDate_dt'] = pd.to_datetime(df_shop['OrderDate'], errors='coerce')
df_shop['DeliveryDate_dt'] = pd.to_datetime(df_shop['DeliveryDate'], errors='coerce')
df_shop['DeliveryTime'] = (df_shop['DeliveryDate_dt'] - df_shop['OrderDate_dt']).dt.days
timely_deliveries = df_shop['DeliveryTime'].le(7).mean()

# Integrity: no zero or negative amounts
integrity_score = (df_shop['OrderAmount'] > 0).mean()

overall_quality = np.mean([accuracy_score, timeliness, integrity_score])

print(f"Accuracy Score: {accuracy_score:.2f}")
print(f"Timeliness Score: {timely_deliveries:.2f}")
print(f"Integrity Score: {integrity_score:.2f}")
print(f"Overall Data Quality Score: {overall_quality:.2f}")

print("Improvement suggestions:")
if accuracy_score < 1:
    print("- Fix invalid or missing dates.")
if timely_deliveries < 1:
    print("- Improve delivery times.")
if integrity_score < 1:
    print("- Ensure order amounts are positive and valid.")

# -------------------------------------
# Task 3: Financial Dataset Data Quality
# -------------------------------------

print("\n--- Task 3: Financial Dataset Data Quality ---")

# Validity: no missing Amounts, Currency standardized (uppercase)
validity_amount = df_financial['Amount'].notna().mean()
currency_standardized = (df_financial['Currency'].str.upper() == df_financial['Currency']).mean()

validity_score = (validity_amount + currency_standardized) / 2

# Precision: count amounts with no more than 2 decimals
def check_precision(x):
    if pd.isna(x):
        return False
    return round(x, 2) == x

precision_score = df_financial['Amount'].apply(check_precision).mean()

# Accessibility: fraction of records marked accessible
accessibility_score = df_financial['Accessible'].mean()

overall_quality = np.mean([validity_score, precision_score, accessibility_score])

print(f"Validity Score: {validity_score:.2f}")
print(f"Precision Score: {precision_score:.2f}")
print(f"Accessibility Score: {accessibility_score:.2f}")
print(f"Overall Data Quality Score: {overall_quality:.2f}")

print("Improvement suggestions:")
if validity_score < 1:
    print("- Fill missing amounts and standardize currency codes.")
if precision_score < 1:
    print("- Round amounts to 2 decimal places.")
if accessibility_score < 1:
    print("- Improve data accessibility for all transactions.")



--- Task 1: Customer Dataset Data Quality ---
Completeness Score: 0.80
Uniqueness Score: 0.00
Consistency Score: 0.50
Overall Data Quality Score: 0.43
Improvement suggestions:
- Fill missing emails and phone numbers.
- Remove duplicate customer records.
- Standardize country codes.

--- Task 2: Online Shop Dataset Data Quality ---


NameError: name 'timeliness' is not defined