## Build a Data Quality Dashboard

**Description**: Create a simple dashboard that displays data quality metrics using a library like `dash` or `streamlit`.

**Steps:**
1. Install Streamlit: `pip install streamlit`
2. Create a Python script named `dashboard.py`.
3. Run the dashboard: `streamlit run dashboard.py`

In [2]:
# Write your code from here
import streamlit as st
import pandas as pd

def calculate_dqi(df):
    """
    Compute the Data Quality Index (DQI) of a DataFrame.

    The DQI is the fraction of cells in the frame that are not null.

    Args:
        df (pd.DataFrame): The DataFrame to score.

    Returns:
        float: Share of non-null cells, between 0 and 1 (inclusive).
               An empty DataFrame scores 0.
    """
    cell_count = df.size
    if not df.empty:
        # Sum nulls per column first, then across columns.
        null_count = df.isnull().sum().sum()
        return (cell_count - null_count) / cell_count
    # Nothing to measure; also avoids dividing by zero.
    return 0.0

def calculate_completeness(df):
    """
    Measure how complete a DataFrame is.

    Completeness is the number of non-null cells divided by the total
    number of cells.

    Args:
        df (pd.DataFrame): The DataFrame to measure.

    Returns:
        float: Completeness, a value between 0 and 1. An empty DataFrame
            yields 0.0.
    """
    # The empty check comes first so the division never sees a zero size.
    return 0.0 if df.empty else df.notnull().sum().sum() / df.size

def calculate_duplicates(df):
    """
    Count the duplicate rows in a DataFrame.

    A row counts as a duplicate when an identical row appears earlier in
    the frame (the first occurrence is not counted).

    Args:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        int: Number of duplicate rows; 0 for an empty DataFrame.
    """
    if df.empty:
        return 0
    # Series.sum() yields a numpy integer; convert so callers always get the
    # plain int the docstring promises (e.g. for JSON serialization).
    return int(df.duplicated().sum())

def calculate_validity(df, valid_values_dict):
    """
    Score, per column, the share of entries that belong to an allowed set.

    Args:
        df (pd.DataFrame): The input DataFrame.
        valid_values_dict (dict): Maps column names to the list of values
            considered valid for that column.

    Returns:
        dict: Maps each column name in valid_values_dict to its validity
            (between 0 and 1). A column named in valid_values_dict but
            absent from df scores 0.0. The dict is empty when df is empty
            or valid_values_dict is empty.
    """
    scores = {}
    if df.empty or not valid_values_dict:
        return scores

    for name in valid_values_dict:
        if name not in df.columns:
            # A requested column that does not exist counts as fully invalid.
            scores[name] = 0.0
            continue

        series = df[name]
        matching = series[series.isin(valid_values_dict[name])]
        n_total = series.size
        scores[name] = matching.size / n_total if n_total else 0.0
    return scores
def calculate_data_quality_metrics(df, valid_values_dict=None):
    """
    Calculates several data quality metrics for a given DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame.
        valid_values_dict (dict, optional): Dictionary of valid values for
            columns. Defaults to None, which means no validity checks.

    Returns:
        dict: A dictionary containing the calculated metrics:
            - 'DQI': Data Quality Index
            - 'Completeness': Completeness of the data
            - 'Duplicates': Number of duplicate rows
            - 'Validity': Dictionary of validity scores for specified columns
    """
    # Use None as the default instead of a literal {}: a mutable default is
    # created once and shared by every call that omits the argument.
    if valid_values_dict is None:
        valid_values_dict = {}

    return {
        'DQI': calculate_dqi(df),
        'Completeness': calculate_completeness(df),
        'Duplicates': calculate_duplicates(df),
        'Validity': calculate_validity(df, valid_values_dict),
    }

def display_data_quality_dashboard(df, filename, valid_values_dict=None):
    """
    Displays a data quality dashboard using Streamlit.

    Renders a title, the DQI, completeness and duplicate count, the
    per-column validity scores, and a preview of the first rows of df.

    Args:
        df (pd.DataFrame): The input DataFrame.
        filename (str): The name of the file being analyzed.
        valid_values_dict (dict, optional): Dictionary of valid values for
            columns. Defaults to None, which means no validity checks.
    """
    # Use None as the default instead of a literal {}: a mutable default is
    # created once and shared by every call that omits the argument.
    if valid_values_dict is None:
        valid_values_dict = {}

    st.title(f"Data Quality Dashboard for {filename}")
    metrics = calculate_data_quality_metrics(df, valid_values_dict)

    st.header("Data Quality Metrics")
    st.write(f"Data Quality Index (DQI): {metrics['DQI']:.2f}")
    st.write(f"Completeness: {metrics['Completeness']:.2f}")
    st.write(f"Number of Duplicates: {metrics['Duplicates']}")

    st.subheader("Validity of Columns")
    if metrics['Validity']:
        for col, validity in metrics['Validity'].items():
            st.write(f"  - {col}: {validity:.2f}")
    else:
        st.write("  - No validity checks defined.")

    # Display DataFrame
    st.header("Data Preview")
    st.dataframe(df.head())

def main():
    """
    Main function to run the Streamlit application.

    Shows a CSV uploader in the sidebar. Once a file is uploaded it is read
    into a DataFrame, a "valid values" multiselect is offered for each
    eligible column, and the data quality dashboard is rendered. Any
    exception raised while reading or analyzing the file is reported with
    st.error instead of crashing the app.
    """
    max_options = 50  # Limit the size of the pre-selected dropdown values.

    st.sidebar.title("Data Quality Analysis")
    uploaded_file = st.sidebar.file_uploader("Upload a CSV file", type=["csv"])

    if uploaded_file is None:
        st.info("Please upload a CSV file to analyze.")
        return

    try:
        df = pd.read_csv(uploaded_file)
        filename = uploaded_file.name  # Get the filename for the title
        valid_values = {}
        for col in df.columns:
            unique_vals = df[col].unique()
            if len(unique_vals) <= max_options:
                default_vals = list(unique_vals)
            elif df[col].dtype == 'object':
                # Too many distinct values: suggest the most frequent ones.
                default_vals = df[col].value_counts().head(max_options).index.tolist()
            else:
                # High-cardinality non-object column: no validity check is
                # defined. Leave it out of the dict; an empty list of valid
                # values would be scored as 0.00 validity.
                continue
            valid_values[col] = st.sidebar.multiselect(
                f"Valid values for column '{col}'",
                unique_vals,
                default=default_vals,
            )

        display_data_quality_dashboard(df, filename, valid_values)
    except Exception as e:
        # Top-level UI boundary: surface the problem to the user.
        st.error(f"Error: {e}")
# Start the app only when this file is executed as the main module, not when
# it is imported.
if __name__ == "__main__":
    main()



2025-05-23 02:27:11.528 
  command:

    streamlit run /home/vscode/.local/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]
