In [3]:
# Data Quality Metrics & Scoring Examples

# Task 1:
# Assign scores to a customer dataset based on completeness, uniqueness, and consistency.
# Analyze the overall data quality score and identify areas for improvement.





# Task 2:
# Evaluate a dataset for an online shop using metrics such as accuracy, timeliness, and
# integrity. Calculate the data quality score and provide improvement suggestions.





# Task 3:
# Perform a data quality assessment on a financial dataset, scoring it based on validity,
# precision, and accessibility. Review the results and propose corrective actions.





In [4]:
import pandas as pd
import numpy as np
import datetime as dt

# -------------------------------------
# 1. Manually create CSV files
# -------------------------------------

# Customer dataset for Task 1
data_customers = {
    'CustomerID': [1, 2, 2, 4, 5],
    'Email': ['a@example.com', 'b@example.com', 'b@example.com', None, 'e@example.com'],
    'Phone': ['1234567890', '1234567890', '1234567890', '0987654321', None],
    'Country': ['US', 'US', 'US', 'US', 'USA']  # inconsistency: 'US' vs 'USA'
}
df_customers = pd.DataFrame(data_customers)
df_customers.to_csv('customers.csv', index=False)
print("Created customers.csv")

# Online shop dataset for Task 2
data_shop = {
    'OrderID': [101, 102, 103, 104],
    'OrderDate': ['2024-04-01', '2024-04-10', '2024-02-30', ''],  # One invalid date, one missing
    'DeliveryDate': ['2024-04-05', '2024-04-12', '2024-03-05', '2024-04-15'],
    'OrderAmount': [250.5, 300, 0, 150]
}
df_shop = pd.DataFrame(data_shop)
df_shop.to_csv('shop_orders.csv', index=False)
print("Created shop_orders.csv")

# Financial dataset for Task 3
data_financial = {
    'TransactionID': [201, 202, 203, 204],
    'Amount': [1000.00, 500.12345, None, 1200.0],  # One missing, one with extra precision
    'Currency': ['USD', 'usd', 'USD', 'EUR'],       # Inconsistent case
    'Accessible': [True, True, False, True]          # Accessibility flag
}
df_financial = pd.DataFrame(data_financial)
df_financial.to_csv('financials.csv', index=False)
print("Created financials.csv")


# -------------------------------------
# 2. Load CSVs and perform Data Quality Assessments
# -------------------------------------

# Task 1: Customer Dataset Data Quality
df = pd.read_csv('customers.csv')
print("\n--- Task 1: Customer Dataset Data Quality ---")

# Completeness
compl_email = df['Email'].notna().mean()
compl_phone = df['Phone'].notna().mean()
completeness_score = (compl_email + compl_phone) / 2

# Uniqueness
uniq_custid = df['CustomerID'].is_unique
uniq_email = df['Email'].dropna().is_unique
uniqueness_score = np.mean([uniq_custid, uniq_email])

# Consistency (country codes)
mode_country = df['Country'].mode()[0]
consistency_score = (df['Country'] == mode_country).mean()

overall_quality_1 = np.mean([completeness_score, uniqueness_score, consistency_score])

print(f"Completeness Score: {completeness_score:.2f}")
print(f"Uniqueness Score: {uniqueness_score:.2f}")
print(f"Consistency Score: {consistency_score:.2f}")
print(f"Overall Data Quality Score: {overall_quality_1:.2f}")
print("Improvement suggestions:")
if completeness_score < 1:
    print("- Fill missing emails and phone numbers.")
if uniqueness_score < 1:
    print("- Remove duplicate customer records.")
if consistency_score < 1:
    print("- Standardize country codes.")


# Task 2: Online Shop Dataset Data Quality
df = pd.read_csv('shop_orders.csv')
print("\n--- Task 2: Online Shop Dataset Data Quality ---")

# Accuracy (valid dates)
valid_order = pd.to_datetime(df['OrderDate'], errors='coerce').notna().mean()
valid_delivery = pd.to_datetime(df['DeliveryDate'], errors='coerce').notna().mean()
accuracy_score = (valid_order + valid_delivery) / 2

# Timeliness (delivery within 7 days)
df['OrderDate_dt'] = pd.to_datetime(df['OrderDate'], errors='coerce')
df['DeliveryDate_dt'] = pd.to_datetime(df['DeliveryDate'], errors='coerce')
df['DeliveryTime'] = (df['DeliveryDate_dt'] - df['OrderDate_dt']).dt.days
timeliness_score = df['DeliveryTime'].le(7).mean()

# Integrity (positive amounts)
integrity_score = (df['OrderAmount'] > 0).mean()

overall_quality_2 = np.mean([accuracy_score, timeliness_score, integrity_score])

print(f"Accuracy Score: {accuracy_score:.2f}")
print(f"Timeliness Score: {timeliness_score:.2f}")
print(f"Integrity Score: {integrity_score:.2f}")
print(f"Overall Data Quality Score: {overall_quality_2:.2f}")
print("Improvement suggestions:")
if accuracy_score < 1:
    print("- Fix invalid or missing dates.")
if timeliness_score < 1:
    print("- Improve delivery times.")
if integrity_score < 1:
    print("- Ensure order amounts are positive and valid.")


# Task 3: Financial Dataset Data Quality
df = pd.read_csv('financials.csv')
print("\n--- Task 3: Financial Dataset Data Quality ---")

# Validity (no missing Amounts, currency uppercase)
valid_amount = df['Amount'].notna().mean()
currency_upper = (df['Currency'] == df['Currency'].str.upper()).mean()
validity_score = (valid_amount + currency_upper) / 2

# Precision (<=2 decimal places)
def has_two_decimals(x):
    if pd.isna(x):
        return False
    return float(f"{x:.2f}") == x

precision_score = df['Amount'].apply(has_two_decimals).mean()

# Accessibility
accessibility_score = df['Accessible'].mean()

overall_quality_3 = np.mean([validity_score, precision_score, accessibility_score])

print(f"Validity Score: {validity_score:.2f}")
print(f"Precision Score: {precision_score:.2f}")
print(f"Accessibility Score: {accessibility_score:.2f}")
print(f"Overall Data Quality Score: {overall_quality_3:.2f}")
print("Improvement suggestions:")
if validity_score < 1:
    print("- Fill missing amounts and standardize currency codes.")
if precision_score < 1:
    print("- Round amounts to 2 decimal places.")
if accessibility_score < 1:
    print("- Improve data accessibility for all transactions.")


Created customers.csv
Created shop_orders.csv
Created financials.csv

--- Task 1: Customer Dataset Data Quality ---
Completeness Score: 0.80
Uniqueness Score: 0.00
Consistency Score: 0.80
Overall Data Quality Score: 0.53
Improvement suggestions:
- Fill missing emails and phone numbers.
- Remove duplicate customer records.
- Standardize country codes.

--- Task 2: Online Shop Dataset Data Quality ---
Accuracy Score: 0.75
Timeliness Score: 0.50
Integrity Score: 0.75
Overall Data Quality Score: 0.67
Improvement suggestions:
- Fix invalid or missing dates.
- Improve delivery times.
- Ensure order amounts are positive and valid.

--- Task 3: Financial Dataset Data Quality ---
Validity Score: 0.75
Precision Score: 0.50
Accessibility Score: 0.75
Overall Data Quality Score: 0.67
Improvement suggestions:
- Fill missing amounts and standardize currency codes.
- Round amounts to 2 decimal places.
- Improve data accessibility for all transactions.
