In [None]:
import math
from decimal import Decimal

import numpy as np
import pandas as pd
#
def calculate_completeness(df):
    """Return the share of non-missing values per column, as a percentage.

    Args:
        df: DataFrame to inspect.

    Returns:
        A dict mapping each column label to
        ``non_null_count / len(df) * 100``, in column order.
    """
    row_total = len(df)
    return {
        label: (df[label].count() / row_total) * 100
        for label in df.columns
    }

def calculate_uniqueness(df):
    """Return the share of distinct values per column, as a percentage.

    Distinct values are counted with ``Series.nunique()`` (which ignores
    missing entries by default) and divided by the total row count.

    Args:
        df: DataFrame to inspect.

    Returns:
        A dict mapping each column label to
        ``distinct_count / len(df) * 100``, in column order.
    """
    row_total = len(df)
    return {
        label: (df[label].nunique() / row_total) * 100
        for label in df.columns
    }

def calculate_consistency(df, column_pairs):
    """
    Score how often two columns hold the same value, for each requested pair.

    Consistency here means element-wise equality between the two columns.

    Args:
        df: DataFrame to inspect.
        column_pairs: Iterable of ``(col1, col2)`` label pairs.

    Returns:
        A dict keyed by ``"col1-col2"``. Each value is the percentage of rows
        where the two columns are equal, or the string
        ``"Columns not found"`` when either label is absent from ``df``.
    """
    results = {}
    row_total = len(df)
    for left, right in column_pairs:
        pair_key = f'{left}-{right}'
        # Guard clause: report missing columns and move on to the next pair.
        if left not in df.columns or right not in df.columns:
            results[pair_key] = "Columns not found"
            continue
        matching_rows = (df[left] == df[right]).sum()
        results[pair_key] = (matching_rows / row_total) * 100
    return results

def calculate_accuracy(df, actual_col, expected_col):
    """Return the percentage of rows where the actual value equals the expected one.

    Args:
        df: DataFrame holding both columns.
        actual_col: Label of the column with observed values.
        expected_col: Label of the column with reference values.

    Returns:
        The percentage of rows with element-wise equal values, or the string
        ``"One or both columns not found"`` when either label is missing.
    """
    known_columns = df.columns
    if actual_col not in known_columns or expected_col not in known_columns:
        return "One or both columns not found"

    matching_rows = (df[actual_col] == df[expected_col]).sum()
    return (matching_rows / len(df)) * 100

def is_valid_date(date_str, format_str):
    """Check whether a string is a valid date according to the given format.

    Args:
        date_str: The text to parse.
        format_str: A ``strftime``-style format passed to ``pd.to_datetime``.

    Returns:
        True when ``date_str`` parses to an actual date under ``format_str``;
        False when parsing raises ``ValueError`` or yields a missing value.
    """
    try:
        parsed = pd.to_datetime(date_str, format=format_str, errors='raise')
    except ValueError:
        return False
    # pandas does not raise for null-like input ("nan", "NaT", "", None);
    # it returns NaT/None instead. A missing value is not a valid date.
    return not np.any(pd.isna(parsed))

def calculate_validity(df, column, validation_criteria):
    """
    Calculate the percentage of values in a column that satisfy a criterion.

    Args:
        df: DataFrame to inspect.
        column: Label of the column to validate.
        validation_criteria: One of
            * a list of allowed values (membership test),
            * a type (``isinstance`` test),
            * a string containing "date" followed by a colon and a
              ``strftime`` format, e.g. ``"date: %Y-%m-%d"``; each value is
              converted with ``str()`` and checked with ``is_valid_date``.
            Any other criterion matches nothing, giving a score of 0.

    Returns:
        The percentage of valid rows, ``0.0`` for an empty DataFrame, or the
        string ``"Column not found"`` when ``column`` is absent.

    Raises:
        ValueError: If a date criterion does not contain a format after a
            colon.
    """
    if column not in df.columns:
        return "Column not found"

    # Resolve the criterion once instead of re-dispatching for every row.
    if isinstance(validation_criteria, list):
        def is_valid(value):
            return value in validation_criteria
    elif isinstance(validation_criteria, type):
        def is_valid(value):
            return isinstance(value, validation_criteria)
    elif isinstance(validation_criteria, str) and "date" in validation_criteria.lower():
        # Split on the first colon only so formats such as "%H:%M:%S" survive.
        _, separator, date_format = validation_criteria.partition(":")
        date_format = date_format.strip()
        if not separator or not date_format:
            raise ValueError(
                "Date criteria must look like 'date: <format>', "
                f"got {validation_criteria!r}"
            )

        def is_valid(value):
            return is_valid_date(str(value), date_format)
    else:
        # Unrecognized criteria match nothing.
        # Add more validation types as needed
        def is_valid(value):
            return False

    total_count = len(df)
    if total_count == 0:
        # No rows means no valid rows; also avoids dividing by zero.
        return 0.0

    valid_count = sum(1 for value in df[column] if is_valid(value))
    return (valid_count / total_count) * 100

def _count_decimal_places(value):
    """Return how many fractional digits a finite float needs to be written exactly."""
    # Decimal parses scientific notation ("1e-07") correctly, and normalize()
    # drops trailing zeros so that 50.0 counts as zero decimal places.
    exponent = Decimal(repr(value)).normalize().as_tuple().exponent
    return max(0, -exponent)


def calculate_precision(df, column, expected_decimal_places):
    """Calculate the precision score for a numeric column.

    A value is precise when it is an integer, or a finite float whose shortest
    exact decimal representation uses at most ``expected_decimal_places``
    fractional digits. NaN and infinite values are never counted as precise.

    Args:
        df: DataFrame to inspect.
        column: Label of the numeric column to check.
        expected_decimal_places: Maximum number of fractional digits allowed.

    Returns:
        The percentage of precise rows, ``0.0`` for an empty DataFrame, or the
        string ``"Column not found or is not numeric"`` when the column is
        missing or not of a numeric dtype.
    """
    if column not in df.columns or not pd.api.types.is_numeric_dtype(df[column]):
        return "Column not found or is not numeric"

    total_count = len(df)
    if total_count == 0:
        # No rows means no precise rows; also avoids dividing by zero.
        return 0.0

    precise_count = 0
    for value in df[column]:
        if isinstance(value, int):
            precise_count += 1  # Integers are considered precise
        elif isinstance(value, float) and math.isfinite(value):
            if _count_decimal_places(value) <= expected_decimal_places:
                precise_count += 1
    return (precise_count / total_count) * 100

# --- Task 1: Customer Dataset ---
print("\n--- Task 1: Customer Dataset ---")
# Sample data: Name and Phone each have one missing entry, and one phone
# number and one city are repeated.
customer_data = {
    'CustomerID': [1, 2, 3, 4, 5],
    'Name': ['Alice', 'Bob', None, 'David', 'Eve'],
    'Email': [
        'alice@example.com',
        'bob@example.com',
        'charlie@example.com',
        'david@example.com',
        'eve@example.com',
    ],
    'Phone': ['123-456-7890', '987-654-3210', '123-456-7890', None, '555-123-4567'],
    'City': ['Bengaluru', 'Mumbai', 'Delhi', 'Bengaluru', 'Chennai'],
}
customer_df = pd.DataFrame(data=customer_data)

completeness_scores_customer = calculate_completeness(customer_df)
uniqueness_scores_customer = calculate_uniqueness(customer_df)
# Name and CustomerID hold unrelated values, so this pair serves as an
# example of inconsistent columns.
consistency_scores_customer = calculate_consistency(
    customer_df,
    [('Name', 'CustomerID')],
)

# Each heading is followed by its score dict on the next line.
print("\nCompleteness Scores:", completeness_scores_customer, sep="\n")
print("\nUniqueness Scores:", uniqueness_scores_customer, sep="\n")
print("\nConsistency Scores (Name vs. CustomerID):", consistency_scores_customer, sep="\n")

# Overall Data Quality Score: unweighted mean of every numeric score
# (could be weighted). Non-numeric consistency results are skipped.
overall_quality_customer = np.mean([
    *completeness_scores_customer.values(),
    *uniqueness_scores_customer.values(),
    *(
        value
        for value in consistency_scores_customer.values()
        if isinstance(value, (int, float))
    ),
])
print(f"\nOverall Data Quality Score: {overall_quality_customer:.2f}%")
print("\nAreas for Improvement: Columns with lower completeness (e.g., Name, Phone), and investigating the inconsistency between Name and CustomerID.")

# --- Task 2: Online Shop Dataset ---
print("\n--- Task 2: Online Shop Dataset ---")
# Sample data: the Keyboard row has a ListedPrice that differs from its
# ActualPrice (potential inaccuracy).
shop_data = {
    'OrderID': [101, 102, 103, 104, 105],
    'Product': ['Laptop', 'Mouse', 'Keyboard', 'Monitor', 'Webcam'],
    'ActualPrice': [1200.00, 25.50, 75.00, 300.00, 50.00],
    'ListedPrice': [1200.00, 25.50, 70.00, 300.00, 50.00],
    'OrderDate': ['2023-10-26', '2023-10-27', '2023-10-28', '2023-10-26', '2023-10-29'],
    'DeliveryDate': ['2023-10-28', '2023-10-29', '2023-10-30', '2023-10-28', '2023-10-31'],
}
shop_df = pd.DataFrame(data=shop_data)

accuracy_price = calculate_accuracy(shop_df, 'ActualPrice', 'ListedPrice')

# Timeliness: share of orders whose DeliveryDate is on or after the OrderDate.
timeliness_correct = (
    pd.to_datetime(shop_df['DeliveryDate'])
    .ge(pd.to_datetime(shop_df['OrderDate']))
    .mean()
    * 100
)

# Integrity: assumed to mean no duplicate OrderIDs.
integrity_orderid = calculate_uniqueness(shop_df[['OrderID']])['OrderID']

print(f"\nAccuracy (ActualPrice vs. ListedPrice): {accuracy_price:.2f}%")
print(f"Timeliness (Delivery after Order): {timeliness_correct:.2f}%")
print(f"Integrity (Uniqueness of OrderID): {integrity_orderid:.2f}%")

# A non-numeric accuracy result (error string) contributes 0 to the mean.
overall_quality_shop = np.mean([
    0 if not isinstance(accuracy_price, (int, float)) else accuracy_price,
    timeliness_correct,
    integrity_orderid,
])
print(f"\nOverall Data Quality Score: {overall_quality_shop:.2f}%")
print("\nImprovement Suggestions: Investigate the discrepancy in 'ListedPrice' for 'Keyboard'. Ensure 'DeliveryDate' is always on or after 'OrderDate'. Maintain uniqueness of 'OrderID'.")

# --- Task 3: Financial Dataset ---
print("\n--- Task 3: Financial Dataset ---")
# Sample data: the last Amount has three decimal places (potential precision
# issue); IsValid is a boolean flag used for the validity example.
financial_data = {
    'TransactionID': [1, 2, 3, 4, 5],
    'Amount': [100.25, 50.00, 200.50, 75.00, 120.755],
    'TransactionDate': ['2023-11-01', '2023-11-01', '2023-11-02', '2023-11-03', '2023-11-03'],
    'AccountType': ['Savings', 'Checking', 'Savings', 'Investment', 'Checking'],
    'IsValid': [True, True, False, True, True],
}

financial_df = pd.DataFrame(data=financial_data)

validity_isvalid = calculate_validity(financial_df, 'IsValid', [True, False])
# Amounts are expected to carry at most 2 decimal places.
precision_amount = calculate_precision(financial_df, 'Amount', 2)

# Accessibility is subjective here: the data counts as fully accessible
# because it can be loaded into a DataFrame.
accessibility_score = 100

print(f"\nValidity (IsValid): {validity_isvalid:.2f}%")
print(f"Precision (Amount - 2 decimal places): {precision_amount:.2f}%")
print(f"Accessibility: {accessibility_score:.2f}%")

# Non-numeric results (error strings) contribute 0 to the mean.
overall_quality_financial = np.mean([
    0 if not isinstance(validity_isvalid, (int, float)) else validity_isvalid,
    0 if not isinstance(precision_amount, (int, float)) else precision_amount,
    accessibility_score,
])
print(f"\nOverall Data Quality Score: {overall_quality_financial:.2f}%")
print("\nCorrective Actions: Investigate and correct the invalid 'IsValid' entries. Standardize the 'Amount' column to two decimal places. Ensure the data remains in an easily accessible format.")


--- Task 1: Customer Dataset ---

Completeness Scores:
{'CustomerID': 100.0, 'Name': 80.0, 'Email': 100.0, 'Phone': 80.0, 'City': 100.0}

Uniqueness Scores:
{'CustomerID': 100.0, 'Name': 80.0, 'Email': 100.0, 'Phone': 60.0, 'City': 80.0}

Consistency Scores (Name vs. CustomerID):
{'Name-CustomerID': 0.0}

Overall Data Quality Score: 80.00%

Areas for Improvement: Columns with lower completeness (e.g., Name, Phone), and investigating the inconsistency between Name and CustomerID.

--- Task 2: Online Shop Dataset ---

Accuracy (ActualPrice vs. ListedPrice): 80.00%
Timeliness (Delivery after Order): 100.00%
Integrity (Uniqueness of OrderID): 100.00%

Overall Data Quality Score: 93.33%

Improvement Suggestions: Investigate the discrepancy in 'ListedPrice' for 'Keyboard'. Ensure 'DeliveryDate' is always on or after 'OrderDate'. Maintain uniqueness of 'OrderID'.

--- Task 3: Financial Dataset ---

Validity (IsValid): 100.00%
Precision (Amount - 2 decimal places): 80.00%
Accessibility: 100.0