# Open Physicians Dataset Workbook

This workbook provides a lightweight walkthrough for loading, inspecting, and summarizing the curated physicians dataset.


In [None]:
from pathlib import Path

import pandas as pd


def resolve_data_path(filename: str) -> Path:
    """Resolve the path to a data file, checking multiple locations.

    Search order:
    1. ``data/cleaned/<filename>`` in the current working directory, then in
       each of its parent directories (nearest first).
    2. The Kaggle input directory ``/kaggle/input`` (only if it exists),
       looking at ``/kaggle/input/*/*/<filename>`` first and, when that
       yields nothing, at ``/kaggle/input/*/<filename>``.

    Only regular files are accepted; a directory that happens to carry the
    requested name is skipped so callers never receive an unreadable path.

    Args:
        filename: Base name of the data file, e.g. ``"physicians_clean.csv"``.

    Returns:
        The first matching path in the search order above.

    Raises:
        FileNotFoundError: If no file is found, or if the Kaggle search
            finds more than one file with that name (ambiguous match).
    """
    cwd = Path.cwd()
    candidates = [
        base / "data" / "cleaned" / filename for base in (cwd, *cwd.parents)
    ]

    for candidate in candidates:
        # is_file() rather than exists(): a directory with the same name
        # cannot be loaded as a dataset and must not end the search.
        if candidate.is_file():
            return candidate

    # Kaggle fallback: fixed-depth patterns only, never a recursive scan.
    kaggle_root = Path("/kaggle/input")
    kaggle_available = kaggle_root.exists()
    if kaggle_available:
        kaggle_matches = []
        try:
            # Deeper layout first, then files directly inside a dataset
            # directory; stop at the first pattern that produces a match.
            for pattern in (f"*/*/{filename}", f"*/{filename}"):
                # Sorted so the ambiguity message below is deterministic.
                kaggle_matches = sorted(
                    path for path in kaggle_root.glob(pattern) if path.is_file()
                )
                if kaggle_matches:
                    break
        except OSError:
            # Best effort: an unreadable Kaggle tree is treated as "no match"
            # (PermissionError is a subclass of OSError).
            pass

        if len(kaggle_matches) == 1:
            return kaggle_matches[0]
        if len(kaggle_matches) > 1:
            raise FileNotFoundError(
                f"Multiple files named {filename!r} found under {kaggle_root}. "
                f"Please specify a more specific path. Matches: "
                + ", ".join(str(path) for path in kaggle_matches)
            )

    searched = [str(path) for path in candidates]
    if kaggle_available:
        searched.append(f"{kaggle_root} (limited depth search)")
    raise FileNotFoundError(
        "Could not find the dataset file. Searched: " + ", ".join(searched)
    )


# Locate the cleaned CSV with resolve_data_path, then load it into a DataFrame.
DATA_PATH = resolve_data_path("physicians_clean.csv")
df = pd.read_csv(DATA_PATH)
# Trailing expression: the notebook renders the first rows as the cell output.
df.head()


## Snapshot

Use the cells below to understand the dataset footprint before diving deeper.

In [None]:
# (rows, columns) of the loaded frame, shown as the cell output.
df.shape


In [None]:
# Print the per-column summary (dtypes, non-null counts, memory usage).
df.info()


## Coverage checks

These checks highlight missing values and coverage for key fields.

In [None]:
# Fraction of missing values per column, worst-covered columns first.
missing_rate = (
    df.isna()  # boolean mask: True where a value is missing
    .mean()  # column-wise mean of the mask = share of missing values
    .sort_values(ascending=False)
)
missing_rate


In [None]:
# Show the license-status breakdown only when there are rows to count;
# otherwise point the reader at the ingestion pipeline.
if not df.empty:
    # dropna=False keeps missing statuses as their own row in the counts.
    display(df["license_status"].value_counts(dropna=False))
else:
    print("The dataset is currently empty. Run the ingestion pipeline to populate it.")


## Next steps

- Filter by `location_code` or `specialty_code` for targeted profiling.
- Join against mapping tables in `mappings/` for human-readable labels.
- Export subsets for downstream modeling or QA workflows.