## Build a Data Quality Dashboard

**Description**: Create a simple dashboard that displays data quality metrics using a library like `dash` or `streamlit`.

**Steps:**
1. Install Streamlit: pip install streamlit
2. Create a Python script dashboard.py.
3. Run the dashboard: streamlit run dashboard.py

In [1]:
import pandas as pd
import re

# --------- Validation Helper ---------
def validate_dataframe(df: pd.DataFrame) -> bool:
    """
    Check if input is a valid non-empty DataFrame with at least one non-missing cell.
    """
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input must be a pandas DataFrame.")
    if df.empty or df.isnull().all().all():
        return False
    return True

# --------- Completeness Metric ---------
def calculate_completeness(df: pd.DataFrame) -> float:
    """
    Calculate completeness as proportion of non-missing values.
    """
    if not validate_dataframe(df):
        return 0.0
    total_cells = df.size
    non_missing = df.notnull().sum().sum()
    return round(non_missing / total_cells, 2)

# --------- Accuracy Metric (Email Validation) ---------
def calculate_accuracy(df: pd.DataFrame, email_col='email') -> float:
    """
    Calculate accuracy as proportion of valid email addresses.
    """
    if not validate_dataframe(df):
        return 0.0
    if email_col not in df.columns:
        return 0.0

    email_pattern = r'^[\w\.-]+@[\w\.-]+\.\w+$'
    valid_emails = df[email_col].dropna().apply(lambda x: bool(re.match(email_pattern, str(x))))
    return round(valid_emails.sum() / len(valid_emails), 2) if len(valid_emails) > 0 else 0.0

# --------- Consistency Metric (Grade Validation) ---------
def calculate_consistency(df: pd.DataFrame, grade_col='grade') -> float:
    """
    Calculate consistency as proportion of valid grades.
    Accepts letter grades A-F or numeric 0-100 (you can modify as needed).
    """
    if not validate_dataframe(df):
        return 0.0
    if grade_col not in df.columns:
        return 0.0

    valid_letter_grades = {'A', 'B', 'C', 'D', 'E', 'F'}
    grades = df[grade_col].dropna()

    def is_valid_grade(x):
        if isinstance(x, str):
            return x.upper() in valid_letter_grades
        elif isinstance(x, (int, float)):
            return 0 <= x <= 100
        return False

    consistent = grades.apply(is_valid_grade)
    return round(consistent.sum() / len(consistent), 2) if len(consistent) > 0 else 0.0

# --------- Combined DQI Score ---------
def calculate_dqi(df: pd.DataFrame) -> dict:
    """
    Calculate a combined DQI score as average of completeness, accuracy, and consistency.
    """
    completeness = calculate_completeness(df)
    accuracy = calculate_accuracy(df)
    consistency = calculate_consistency(df)
    dqi_score = round((completeness + accuracy + consistency) / 3, 2)
    return {
        "completeness": completeness,
        "accuracy": accuracy,
        "consistency": consistency,
        "dqi_score": dqi_score
    }

# --------- Sample Dataset Generator ---------
def generate_sample_dataset(filename='data_quality_sample.csv'):
    data = {
        'id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'name': ['John Doe', 'Jane Smith', 'Bob Johnson', 'Alice White', 'Charlie Brown',
                 'David Lee', 'Eve Adams', 'Ann Marie', 'Tom O\'Neil', None],
        'age': [20, 22, None, 25, 21, 19, 23, None, 30, 19],
        'grade': ['A', 'B', 'Z', 88, 95, 80, 101, 'F', 85, None],  # Mixed letters and numeric with invalid 'Z' and 101
        'email': ['johndoe@example.com', 'janesmith@example.com', 'bobjohnson@ex.com',
                  'alicewhite@example.com', 'charliebrown@example', 'invalid-email',
                  'eveadams@example.com', 'annmarie@example.com', 'tomoneil@example.com', None]
    }
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    print(f"✅ Sample dataset saved as '{filename}'")

# --------- Main Script ---------
if __name__ == "__main__":
    # Generate sample dataset for demonstration
    generate_sample_dataset()

    # Load dataset
    df = pd.read_csv('data_quality_sample.csv')

    # Calculate DQI metrics
    scores = calculate_dqi(df)
    print("\nData Quality Scores:")
    print(scores)


✅ Sample dataset saved as 'data_quality_sample.csv'

Data Quality Scores:
{'completeness': 0.9, 'accuracy': 0.78, 'consistency': 0.33, 'dqi_score': 0.67}
