## Build a Data Quality Dashboard

**Description**: Create a simple dashboard that displays data quality metrics using a library like `dash` or `streamlit`.

**Steps:**
1. Install Streamlit: `pip install streamlit`
2. Create a Python script named `dashboard.py` containing the code below.
3. Run the dashboard from a terminal (not from inside a notebook cell): `streamlit run dashboard.py`

In [2]:
%pip install streamlit
import streamlit as st
import pandas as pd
import numpy as np
import re

# --- Data Quality Dashboard (Streamlit script) ---
# Renders, for an uploaded CSV: a preview, missing-value counts/percentages,
# duplicate-row count, column dtypes, summary statistics, unique-value counts,
# optional Email/Age validations, and a bar chart of missing percentages.
st.set_page_config(page_title="Data Quality Dashboard", layout="wide")
st.title("📊 Data Quality Dashboard")

uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])

if uploaded_file is not None:
    # An empty, malformed, or non-UTF-8 upload should produce a readable
    # message in the page rather than an unhandled traceback.
    try:
        df = pd.read_csv(uploaded_file)
    except (
        pd.errors.EmptyDataError,
        pd.errors.ParserError,
        UnicodeDecodeError,
    ) as exc:
        st.error(f"Could not read the uploaded file as CSV: {exc}")
        st.stop()

    st.success("File uploaded successfully!")

    st.subheader("🔍 Dataset Preview")
    st.dataframe(df.head())

    # --- Data Quality Checks ---

    # 1. Missing Values
    st.subheader("🟡 Missing Values")
    row_count = len(df)
    missing_counts = df.isnull().sum()
    if row_count:
        missing_percentage = (missing_counts / row_count) * 100
    else:
        # Header-only CSV: 0 / 0 would yield NaN for every column, so report
        # the (all-zero) counts as 0.0 % instead.
        missing_percentage = missing_counts.astype(float)
    missing_df = pd.DataFrame({
        'Missing Count': missing_counts,
        'Missing Percentage (%)': missing_percentage,
    })
    st.dataframe(missing_df)

    # 2. Duplicates
    st.subheader("🔁 Duplicate Records")
    duplicate_rows = df.duplicated().sum()
    st.write(f"Total duplicate rows: {duplicate_rows}")

    # 3. Data Types
    st.subheader("🔤 Column Data Types")
    # Name the index and the values explicitly instead of relying on the
    # default 'index' / 0 labels produced by reset_index().
    dtype_table = (
        df.dtypes.astype(str)
        .rename_axis('Column')
        .reset_index(name='Data Type')
    )
    st.dataframe(dtype_table)

    # 4. Basic Statistics
    st.subheader("📈 Summary Statistics")
    st.write(df.describe(include='all'))

    # 5. Unique Values
    st.subheader("🔗 Unique Values by Column")
    unique_table = (
        df.nunique()
        .rename_axis('Column')
        .reset_index(name='Unique Count')
    )
    st.dataframe(unique_table)

    # 6. Optional Validations (Email & Age)
    # Each check runs only when a column with that exact name is present.
    st.subheader("✅ Optional Validations")

    if 'Email' in df.columns:
        email_pattern = r"^[\w\.-]+@[\w\.-]+\.\w+$"
        # astype(str) turns missing values into the text "nan", which does not
        # match the pattern, so blank emails are counted as invalid too.
        email_ok = df['Email'].astype(str).str.match(email_pattern)
        invalid_emails = df[~email_ok]
        st.write(f"Invalid emails found: {len(invalid_emails)}")
        st.dataframe(invalid_emails)

    if 'Age' in df.columns:
        # Coerce first: comparing an object/string column with integers raises
        # TypeError, which happens as soon as one cell holds text like "N/A".
        age_values = pd.to_numeric(df['Age'], errors='coerce')
        # Present-but-unparseable entries are invalid; genuinely missing ages
        # are left to the Missing Values section above.
        non_numeric = age_values.isna() & df['Age'].notna()
        out_of_range = (age_values < 0) | (age_values > 120)
        invalid_ages = df[non_numeric | out_of_range]
        st.write(f"Invalid ages found: {len(invalid_ages)}")
        st.dataframe(invalid_ages)

    # --- Visuals ---

    st.subheader("📊 Visualizations")

    st.markdown("**Missing Values (%) by Column**")
    st.bar_chart(missing_percentage)

else:
    st.info("👈 Upload a CSV file to get started!")

Defaulting to user installation because normal site-packages is not writeable
Collecting streamlit
  Downloading streamlit-1.45.1-py3-none-any.whl.metadata (8.9 kB)
Collecting cachetools<6,>=4.0 (from streamlit)
  Downloading cachetools-5.5.2-py3-none-any.whl.metadata (5.4 kB)
Collecting packaging<25,>=20 (from streamlit)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Collecting protobuf<7,>=3.20 (from streamlit)
  Downloading protobuf-6.31.0-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Collecting pyarrow>=7.0 (from streamlit)
  Downloading pyarrow-20.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting tenacity<10,>=8.1.0 (from streamlit)
  Downloading tenacity-9.1.2-py3-none-any.whl.metadata (1.2 kB)
Collecting toml<2,>=0.10.1 (from streamlit)
  Downloading toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
Collec

2025-05-21 17:28:44.910 
  command:

    streamlit run /home/vscode/.local/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]
