# Synthetic HCP Profile Creation

In this python script, I will be generating synthetic HCP profiles by linking it to synthetic patients data created previously from synthea software. Purpose of creating this type of dataset was to utilize for exploratory data analysis and in various potential projects in near future.

#### Required libraries for linking hcp with patients data

In [30]:

import os
import requests
import zipfile
import pandas as pd
from io import BytesIO
import random

#### Extracting downloaded csv file from NPI

In [None]:
zip_filename = "C:/Users/bhand/Desktop/Data Science - My Collection/Deep Learning Project - 1/Data/NPPES_Data_Dissemination_August_2025.zip"
output_dir = "npi_data"

with zipfile.ZipFile(zip_filename, "r") as z:
    z.extractall(output_dir)

print("Files extracted to:", output_dir)

#### Configurations

In [20]:
# Extracting only the required columns from the dataset
usecols = [
    "NPI",
    "Entity Type Code",
    "Provider Last Name (Legal Name)",
    "Provider First Name",
    "Provider Credential Text",
    "Provider Business Mailing Address City Name",
    "Provider Business Mailing Address State Name",
    "Provider Business Mailing Address Postal Code",
    "Healthcare Provider Taxonomy Code_1"
]

chunksize = 50000
max_hcps = 20000

#### Step 1 : Process NPI in chunks

In [24]:
npi_csv = "C:/Users/bhand/Desktop/Data Science - My Collection/Deep Learning Project - 1/Data/Data for Recommender System/npi_data/npidata_pfile_20050523-20250810.csv"
hcp_master_list = []
total_hcps = 0

print("Processing NPI dataset in chunks...")
for chunk in pd.read_csv(npi_csv, chunksize = chunksize, usecols = usecols, dtype = str, low_memory = True):
    # filter : individuals only
    chunk = chunk[chunk["Entity Type Code"] == "1"]

    # rename columns
    chunk = chunk.rename(columns={
        "NPI" : "hcp_id",
        "Provider Last Name (Legal Name)": "last_name",
        "Provider First Name": "first_name",
        "Provider Credential Text":"credential",
        "Provider Business Mailing Address City Name": "city",
        "Provider Business Mailing Address State Name": "state",
        "Provider Business Mailing Address Postal Code": "zip",
        "Healthcare Provider Taxonomy Code_1": "taxonomy_code"
    })

    hcp_master_list.append(chunk)
    total_hcps += len(chunk)

    # stop once enough HCPs collected
    if total_hcps >= max_hcps:
        break

# combine processed chunks
hcp_master = pd.concat(hcp_master_list, ignore_index = True)

Processing NPI dataset in chunks...


#### Step 2 : Taxonomy mapping -> speciality

In [27]:
hcp_output_csv = "hcp_master.csv"

taxonomy_map = {
    "207R00000X": "Internal Medicine",
    "207Q00000X": "Family Medicine",
    "207RE0101X": "Endocrinology",
    "207RC0000X": "Cardiology",
    "207RX0202X": "Oncology",
    "208D00000X": "General Practice"
}

# speciality
hcp_master["speciality"] = hcp_master["taxonomy_code"].map(taxonomy_map).fillna("Other")

## saving hdcp_master csv file

hcp_master.to_csv(hcp_output_csv, index = False)
print(f"HCP Master saved: {hcp_output_csv}, row{len(hcp_master)}")

HCP Master saved: hcp_master.csv, row37729


#### Step 3 : Linking with Synthea Patients

In [33]:
# Join with synthea patients

patients_csv = "C:/mysql_files/patients_clean.csv"
patients_output_csv = "patients_with_hcp.csv"

print("Loading Synthea Patients...")
patients = pd.read_csv(patients_csv, low_memory = False)

# assign specialities randomwly (placeholder logic)

specialties = hcp_master["speciality"].unique().tolist()
patients["assigned_specialty"] = patients["patient_id"].apply(lambda x:random.choice(specialties))

# link each patient to a random HCP of that specialty

hcp_lookup = {}
for spec in specialties:
    subset = hcp_master[hcp_master["speciality"] == spec]["hcp_id"].tolist()
    if subset:
        hcp_lookup[spec] = subset

assigned_hcps = []
for spec in patients["assigned_specialty"]:
    if spec in hcp_lookup:
        assigned_hcps.append(random.choice(hcp_lookup[spec]))
    else:
        assigned_hcps.append(None)

patients["hcp_id"] = assigned_hcps


Loading Synthea Patients...
Patients linked with HCPs saved: patients_with_hcp.csv, rows: 1739


In [None]:
patients.to_csv(patients_output_csv, index = False)
print(f"Patients linked with HCPs saved: {patients_output_csv}, rows: {len(patients)}")
