# Open Physicians Dataset Workbook

This workbook provides a lightweight walkthrough for loading, inspecting, and summarizing the curated physicians dataset.


In [None]:
from pathlib import Path

import pandas as pd


def resolve_data_path(filename: str) -> Path:
    """Locate ``filename`` in a ``data/cleaned`` directory or under Kaggle input.

    Search order:

    1. ``<dir>/data/cleaned/<filename>`` for the current working directory and
       then each of its parents, nearest first.
    2. If ``/kaggle/input`` is a directory, a recursive search beneath it.
       When several files match, the first one in sorted path order is
       returned so the result does not depend on filesystem iteration order.

    Only regular files are accepted; a directory that happens to share the
    dataset's name is skipped.

    Args:
        filename: Bare file name of the dataset, e.g. ``"physicians_clean.csv"``.

    Returns:
        Path to the first matching file.

    Raises:
        FileNotFoundError: If the file is not found in any searched location.
            The message lists every location that was checked.
    """
    cwd = Path.cwd()
    candidates = [
        base / "data" / "cleaned" / filename for base in (cwd, *cwd.parents)
    ]

    for candidate in candidates:
        # is_file() rather than exists(): a same-named directory is not a dataset.
        if candidate.is_file():
            return candidate

    kaggle_root = Path("/kaggle/input")
    kaggle_searched = kaggle_root.is_dir()
    if kaggle_searched:
        # rglob order is filesystem-dependent; sort so the pick is reproducible.
        matches = sorted(
            match for match in kaggle_root.rglob(filename) if match.is_file()
        )
        if matches:
            return matches[0]

    searched = [str(path) for path in candidates]
    if kaggle_searched:
        searched.append(f"{kaggle_root} (recursively via rglob)")
    raise FileNotFoundError(
        "Could not find the dataset file. Searched: " + ", ".join(searched)
    )


# Locate the cleaned physicians CSV via resolve_data_path and load it into a
# DataFrame that the remaining cells reuse as `df`.
DATA_PATH = resolve_data_path("physicians_clean.csv")
df = pd.read_csv(DATA_PATH)
# Trailing expression: the notebook renders the first five rows as a preview.
df.head()


## Snapshot

Use the cells below to understand the dataset footprint before diving deeper.

In [None]:
# (number of rows, number of columns) of the loaded dataset.
df.shape


In [None]:
# Print a per-column summary: dtype and non-null count, plus memory usage.
df.info()


## Coverage checks

These checks report the share of missing values in each column and the distribution of `license_status` values, including missing entries.

In [None]:
# Fraction of missing values in each column (mean of the boolean NA mask),
# sorted so the columns with the most gaps come first.
missing_rate = df.isna().mean().sort_values(ascending=False)
missing_rate


In [None]:
# Tally rows per license status, counting missing values as their own bucket.
# With no rows there is nothing to tally, so print a hint instead.
if not df.empty:
    display(df["license_status"].value_counts(dropna=False))
else:
    print("The dataset is currently empty. Run the ingestion pipeline to populate it.")


## Next steps

- Filter by `location_code` or `specialty_code` for targeted profiling.
- Join against mapping tables in `mappings/` for human-readable labels.
- Export subsets for downstream modeling or QA workflows.