In [1]:
# Data Quality Metrics & Scoring Examples

# Task 1:
# Assign scores to a customer dataset based on completeness, uniqueness, and consistency.
# Analyze the overall data quality score and identify areas for improvement.





# Task 2:
# Evaluate a dataset for an online shop using metrics such as accuracy, timeliness, and
# integrity. Calculate the data quality score and provide improvement suggestions.





# Task 3:
# Perform a data quality assessment on a financial dataset, scoring it based on validity,
# precision, and accessibility. Review the results and propose corrective actions.


import pandas as pd
import numpy as np

# --------------------------
# Task 1: Customer Dataset Quality Scoring
# --------------------------
print("\n--- Task 1: Customer Data Quality Scoring ---")

# Toy customer table: contains a repeated CustomerID (104), one blank
# e-mail string, and one row whose ID and e-mail are both missing.
customer_records = {
    'CustomerID': [101, 102, 103, 104, 104, np.nan],
    'Email': ['a@x.com', 'b@x.com', 'c@x.com', '', 'd@x.com', np.nan],
    'Phone': ['123', '456', '789', '123', '456', '789'],
}
df_customer = pd.DataFrame(customer_records)

# Completeness: share of non-null cells per column, averaged over columns.
# Blank strings are not null, so they still count as "present" here.
fill_rate_per_column = df_customer.notna().mean()
completeness = fill_rate_per_column.mean()

# Uniqueness: distinct IDs divided by the number of non-null IDs.
customer_ids = df_customer['CustomerID']
present_id_count = len(customer_ids.dropna())
uniqueness = customer_ids.nunique() / present_id_count

# Consistency: share of e-mail entries that are not blank after stripping
# whitespace. A missing (NaN) e-mail compares unequal to '' and therefore
# counts as consistent; only blank strings are penalized.
stripped_emails = df_customer['Email'].str.strip()
consistency = stripped_emails.ne('').mean()

# Overall score: plain average of the three metrics, rounded to 2 decimals.
metric_total = completeness + uniqueness + consistency
customer_score = round(metric_total / 3, 2)
print(
    f"Completeness: {completeness:.2f}, Uniqueness: {uniqueness:.2f}, Consistency: {consistency:.2f}"
)
print(f"Overall Customer Data Quality Score: {customer_score:.2f}")

# --------------------------
# Task 2: Online Shop Dataset Quality Scoring
# --------------------------
print("\n--- Task 2: Online Shop Data Quality Scoring ---")

# Toy order table: the third order has an empty delivery date and no price.
shop_orders = {
    'OrderID': [1, 2, 3, 4],
    'Product': ['Phone', 'Laptop', 'Phone', 'Tablet'],
    'DeliveryDate': ['2024-01-05', '2023-12-15', '', '2024-01-10'],
    'Price': [500.0, 900.0, np.nan, 300.0],
}
df_shop = pd.DataFrame(shop_orders)

# Accuracy (proxy): fraction of orders that have a price recorded.
price_present = df_shop['Price'].notna()
accuracy = price_present.mean()

# Timeliness (proxy): fraction of delivery dates shaped like YYYY-MM-DD.
# This checks the textual format only, not whether the date is a real one.
delivery_dates = df_shop['DeliveryDate']
date_well_formed = delivery_dates.str.match(r'^\d{4}-\d{2}-\d{2}$')
timeliness = date_well_formed.mean()

# Integrity: fraction of rows whose primary key (OrderID) is not null.
order_id_present = df_shop['OrderID'].notna()
integrity = order_id_present.mean()

# Overall score: plain average of the three metrics, rounded to 2 decimals.
shop_metric_total = accuracy + timeliness + integrity
shop_score = round(shop_metric_total / 3, 2)
print(
    f"Accuracy: {accuracy:.2f}, Timeliness: {timeliness:.2f}, Integrity: {integrity:.2f}"
)
print(f"Overall Online Shop Data Quality Score: {shop_score:.2f}")

# --------------------------
# Task 3: Financial Dataset Quality Scoring
# --------------------------
print("\n--- Task 3: Financial Data Quality Scoring ---")


def _is_whole_number(amount):
    """Return True when *amount* has no fractional part."""
    return float(amount).is_integer()


# Toy transaction table: one amount is missing and currency codes mix case.
finance_transactions = {
    'TransactionID': [201, 202, 203, 204],
    'Amount': [1000.0, 1050.75, None, 980.55],
    'Currency': ['USD', 'usd', 'USD', 'usd'],
}
df_finance = pd.DataFrame(finance_transactions)

# Validity: fraction of rows where both TransactionID and Amount are present.
required_fields_present = df_finance[['TransactionID', 'Amount']].notna().all(axis=1)
validity = required_fields_present.mean()

# Precision: `precision` holds the fraction of non-null amounts that are
# whole numbers; the reported score is its complement, so amounts carrying
# a fractional part raise the score.
known_amounts = df_finance['Amount'].dropna()
precision = known_amounts.apply(_is_whole_number).mean()
precision_score = 1 - precision

# Accessibility: no access problems are simulated, so it is fixed at 100%.
accessibility = 1.0

# Overall score: plain average of the three metrics, rounded to 2 decimals.
finance_metric_total = validity + precision_score + accessibility
finance_score = round(finance_metric_total / 3, 2)
print(
    f"Validity: {validity:.2f}, Precision: {precision_score:.2f}, Accessibility: {accessibility:.2f}"
)
print(f"Overall Financial Data Quality Score: {finance_score:.2f}")



--- Task 1: Customer Data Quality Scoring ---
Completeness: 0.89, Uniqueness: 0.80, Consistency: 0.83
Overall Customer Data Quality Score: 0.84

--- Task 2: Online Shop Data Quality Scoring ---
Accuracy: 0.75, Timeliness: 0.75, Integrity: 1.00
Overall Online Shop Data Quality Score: 0.83

--- Task 3: Financial Data Quality Scoring ---
Validity: 0.75, Precision: 0.67, Accessibility: 1.00
Overall Financial Data Quality Score: 0.81
