## Data Quality Dashboard in Python

**Description**: Create a basic dashboard using a Python library (e.g., Plotly Dash) to visualize data quality metrics for a given dataset.

In [1]:
# Write your code from here
!pip install dash pandas


Defaulting to user installation because normal site-packages is not writeable
Collecting dash
  Downloading dash-3.0.4-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting importlib-metadata
  Downloading importlib_metadata-8.7.0-py3-none-any.whl (27 kB)
Collecting Werkzeug<3.1
  Downloading werkzeug-3.0.6-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m228.0/228.0 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
Collecting retrying
  Downloading retrying-1.3.4-py3-none-any.whl (11 kB)
Collecting Flask<3.1,>=1.0.4
  Downloading flask-3.0.3-py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
Collecting itsdangerous>=2.1.2
  Downloading itsdangerous-2.2.0-py3-none-any.whl (16 kB)
Collecting click>=8.1.3
  Downloading click-8.2.1-py3-none-

In [3]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import pandas as pd
import plotly.express as px

# Sample dataset: Replace this with loading your own dataset
df = pd.DataFrame({
    'Age': [25, 30, None, 45, 22],
    'Salary': [50000, 60000, 55000, None, 48000],
    'Department': ['HR', 'IT', 'IT', 'Finance', None]
})

# Calculate missing values per column
missing_data = df.isnull().sum().reset_index()
missing_data.columns = ['Column', 'Missing Values']

# Data types per column. The dtype objects are converted to their string
# names here because this frame is rendered cell-by-cell into Dash HTML
# components, whose children must be JSON-serializable; a raw dtype object
# is not, while its name ('float64', 'object', ...) is.
data_types = df.dtypes.astype(str).reset_index()
data_types.columns = ['Column', 'Data Type']

# Summary statistics for numeric columns, rounded to 3 decimals so the
# rendered table stays readable (e.g. std is shown as 10.214, not 10.2143686...)
summary_stats = df.describe().round(3).reset_index()

def _table_style(width):
    """Return the shared bordered, centered table style at the given CSS width."""
    return {'width': width, 'margin': 'auto', 'border': '1px solid black', 'borderCollapse': 'collapse'}


def _frame_to_html_table(frame, table_style):
    """Render every column and row of `frame` as an html.Table with a header row."""
    header_row = html.Tr([html.Th(label) for label in frame.columns])
    body_rows = [
        html.Tr([html.Td(frame.iloc[pos][label]) for label in frame.columns])
        for pos in range(len(frame))
    ]
    return html.Table(
        [html.Thead([header_row]), html.Tbody(body_rows)],
        style=table_style,
    )


# Initialize Dash app
app = dash.Dash(__name__)

# Bar chart of the per-column missing-value counts, labelled with the counts.
missing_values_figure = px.bar(
    missing_data,
    x='Column',
    y='Missing Values',
    title="Missing Values per Column",
    text='Missing Values',
)

# Page layout: title, missing-values chart, then the two metric tables.
app.layout = html.Div([
    html.H1("Data Quality Dashboard", style={'textAlign': 'center'}),

    html.H2("Missing Values by Column"),
    dcc.Graph(id='missing-values-bar', figure=missing_values_figure),

    html.H2("Data Types"),
    _frame_to_html_table(data_types, _table_style('50%')),

    html.H2("Summary Statistics (Numerical Columns)"),
    _frame_to_html_table(summary_stats, _table_style('80%')),
])

if __name__ == '__main__':
    app.run(debug=True)


In [4]:
import pandas as pd
import dash
from dash import dcc, html
import plotly.graph_objs as go
import logging

# Setup basic logging: emit INFO and above, formatted as "LEVEL:message"
LOG_FORMAT = '%(levelname)s:%(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

# === Create synthetic dataset ===
def create_sample_data():
    """Build the small demo DataFrame used by the dashboard.

    The frame has 7 rows and 4 columns (Age, Income, Gender, Purchased);
    every column contains at least one None so the missing-value metrics
    have something to report. The resulting shape is logged at INFO level.

    Returns:
        pandas.DataFrame: the synthetic dataset.
    """
    ages = [25, 30, None, 22, 40, 35, None]
    incomes = [50000, 60000, 58000, None, 72000, None, 52000]
    genders = ['Male', 'Female', 'Female', 'Male', None, 'Male', 'Female']
    purchased = ['Yes', 'No', 'Yes', 'No', 'Yes', 'No', None]

    df = pd.DataFrame({
        'Age': ages,
        'Income': incomes,
        'Gender': genders,
        'Purchased': purchased,
    })
    logging.info(f"Sample data created with shape {df.shape}")
    return df

# === Utility functions ===

def validate_columns(df, required_cols):
    """Check that `df` has every column named in `required_cols`.

    Args:
        df: object exposing a `columns` attribute that supports `in`.
        required_cols: iterable of column names that must be present.

    Raises:
        ValueError: if any names are absent; the message lists them in the
            order they appear in `required_cols`.
    """
    missing = []
    for name in required_cols:
        if name in df.columns:
            continue
        missing.append(name)

    if not missing:
        return
    raise ValueError(f"Missing required columns: {missing}")

def compute_missing_data(df):
    """Count the null entries in each column of `df`.

    Returns:
        The result of summing `df.isnull()` column-wise: one count per
        column, indexed by column name.
    """
    null_mask = df.isnull()
    return null_mask.sum()

def compute_data_types(df):
    """Look up the dtype of each column of `df`.

    Returns:
        `df.dtypes` unchanged: one dtype object per column, indexed by
        column name.
    """
    column_dtypes = df.dtypes
    return column_dtypes

def compute_summary_stats(df):
    """Produce descriptive statistics for `df`.

    Delegates to `df.describe()` with its default arguments and returns
    the resulting frame (statistics as rows, described columns as columns).
    """
    stats = df.describe()
    return stats

def create_missing_data_bar(missing_series):
    """Build a Plotly bar trace from per-column missing-value counts.

    Args:
        missing_series: object whose `.index` supplies the x-axis labels
            and whose `.values` supplies the bar heights.

    Returns:
        A `go.Bar` trace coloured 'indianred'.
    """
    column_labels = missing_series.index
    missing_counts = missing_series.values
    return go.Bar(x=column_labels, y=missing_counts, marker_color='indianred')

def create_data_types_table(data_types):
    """Render column dtypes as a two-column HTML table.

    Args:
        data_types: mapping-like object whose `.items()` yields
            (column name, dtype) pairs.

    Returns:
        An `html.Table` with a "Column" / "Data Type" header and one body
        row per pair; each dtype is shown via `str()`.
    """
    table_head = html.Thead(html.Tr([html.Th("Column"), html.Th("Data Type")]))
    table_body = html.Tbody([
        html.Tr([html.Td(col), html.Td(str(dtype))])
        for col, dtype in data_types.items()
    ])
    return html.Table([table_head, table_body])

def create_summary_stats_table(summary_df):
    """Render a summary-statistics frame as an HTML table.

    Args:
        summary_df: frame whose index labels name the statistics and whose
            columns name the described variables.

    Returns:
        An `html.Table` whose header is "Statistic" followed by the column
        names, and whose body has one row per index label with every value
        rounded to 3 decimal places.
    """
    heading_cells = [html.Th("Statistic")]
    heading_cells.extend(html.Th(col) for col in summary_df.columns)

    body_rows = []
    for stat_name, stat_values in summary_df.iterrows():
        cells = [html.Td(stat_name)]
        cells.extend(html.Td(round(val, 3)) for val in stat_values)
        body_rows.append(html.Tr(cells))

    return html.Table([
        html.Thead(html.Tr(heading_cells)),
        html.Tbody(body_rows),
    ])

# === Dash App Setup ===

app = dash.Dash(__name__)
app.title = "Data Quality Dashboard"

# Bound before the try so the name exists whichever branch runs; it stays
# None when loading succeeds.
error_message = None
try:
    df = create_sample_data()
    # validate_columns(df, ['Age', 'Income', 'Gender', 'Purchased']) # Uncomment if you want column validation
except Exception as e:
    # Keep the app alive and surface the failure on the page, but also log
    # the traceback so the cause is not lost.
    logging.exception("Failed to load data for the dashboard")
    df = None
    error_message = str(e)

# Build the two mutually exclusive sections explicitly: an error banner when
# loading failed, otherwise the metric panels computed from the frame.
if df is None:
    error_children = [html.H3(f"Error loading data: {error_message}")]
    dashboard_content = None
else:
    error_children = [""]
    missing_figure = {
        'data': [create_missing_data_bar(compute_missing_data(df))],
        'layout': go.Layout(
            title='Missing Values per Column',
            yaxis={'title': 'Count of Missing Values'},
            xaxis={'title': 'Columns'}
        )
    }
    dashboard_content = html.Div(id='dashboard-content', children=[
        html.H2("Missing Data Overview"),
        dcc.Graph(id='missing-data-plot', figure=missing_figure),

        html.H2("Data Types"),
        create_data_types_table(compute_data_types(df)),

        html.H2("Summary Statistics"),
        create_summary_stats_table(compute_summary_stats(df))
    ])

app.layout = html.Div(
    [
        html.H1("Data Quality Dashboard", style={'textAlign': 'center'}),

        html.Div(id='error-div', children=error_children,
                 style={'color': 'red', 'textAlign': 'center'}),

        dashboard_content,
    ],
    style={'width': '80%', 'margin': 'auto', 'fontFamily': 'Arial'},
)

if __name__ == '__main__':
    app.run(debug=True)


INFO:Sample data created with shape (7, 4)
