In [1]:
from faker import Faker
import pandas as pd
import random

In [2]:
# Two independent random sources are used below: Faker's generator (names,
# dates) and the stdlib `random` module (ids, choices). Both are seeded with
# the same value so the draws themselves are repeatable.
# NOTE(review): dates are requested relative to 'now'/'today' further down, so
# date columns will presumably still shift with the run date — confirm.
SEED = 42

fake = Faker()
Faker.seed(SEED)
random.seed(SEED)

In [3]:
# Reference table of medical specialties.
# Row layout: (specialty_id, specialty_name, specialty_code).
specialties = [
    (1, 'Cardiology', 'CARD'),
    (2, 'Internal Medicine', 'IM'),
    (3, 'Emergency', 'ER'),
    (4, 'Neurology', 'NEUR'),
    (5, 'Pediatrics', 'PED'),
    (6, 'Oncology', 'ONC'),
    (7, 'Orthopedics', 'ORTH'),
    (8, 'Dermatology', 'DERM'),
    (9, 'Psychiatry', 'PSY'),
    (10, 'Radiology', 'RAD'),
    (11, 'Gastroenterology', 'GAST'),
    (12, 'Pulmonology', 'PULM'),
    (13, 'Endocrinology', 'ENDO'),
    (14, 'Nephrology', 'NEPH'),
    (15, 'Ophthalmology', 'OPH'),
    (16, 'Rheumatology', 'RHEU'),
    (17, 'Urology', 'URO'),
    (18, 'ENT', 'ENT'),
    (19, 'Anesthesiology', 'ANES'),
    (20, 'Pathology', 'PATH'),
]

# Reference table of hospital departments.
# Row layout: (department_id, department_name, floor, capacity).
departments = [
    (1, 'Cardiology Unit', 3, 20),
    (2, 'Internal Medicine', 2, 30),
    (3, 'Emergency', 1, 45),
    (4, 'Neurology', 4, 25),
    (5, 'Pediatrics Ward', 2, 30),
    (6, 'Oncology', 5, 20),
    (7, 'Orthopedics', 3, 25),
    (8, 'Dermatology', 2, 15),
    (9, 'Psychiatry', 3, 20),
    (10, 'Radiology', 1, 30),
    (11, 'Gastroenterology', 4, 20),
    (12, 'Pulmonology', 4, 20),
    (13, 'Endocrinology', 3, 15),
    (14, 'Nephrology', 4, 15),
    (15, 'Ophthalmology', 2, 15),
    (16, 'Rheumatology', 3, 15),
    (17, 'Urology', 4, 20),
    (18, 'ENT', 2, 15),
    (19, 'Anesthesiology', 1, 10),
    (20, 'Pathology', 1, 10),
]

# Write both reference tables to CSV in the working directory, without the
# pandas index column.
specialties_df = pd.DataFrame(
    specialties,
    columns=['specialty_id', 'specialty_name', 'specialty_code'],
)
specialties_df.to_csv('specialties.csv', index=False)

departments_df = pd.DataFrame(
    departments,
    columns=['department_id', 'department_name', 'floor', 'capacity'],
)
departments_df.to_csv('departments.csv', index=False)


In [4]:
num_patients = 10000

# Hand-written seed patients take ids 1-3; everything after is synthetic.
# NOTE(review): the seed MRNs are three digits wide ('MRN001') while generated
# MRNs below are zero-padded to six ('MRN000004') — confirm this is intended.
patients = [
    (1, 'John', 'Doe', '1955-03-15', 'M', 'MRN001'),
    (2, 'Jane', 'Smith', '1962-07-22', 'F', 'MRN002'),
    (3, 'Robert', 'Johnson', '1948-11-08', 'M', 'MRN003'),
]

# Continue numbering right after the seed rows up to num_patients inclusive.
for patient_id in range(len(patients) + 1, num_patients + 1):
    first_name = fake.first_name()
    last_name = fake.last_name()
    birth_date = fake.date_of_birth(minimum_age=0, maximum_age=90).isoformat()
    gender = random.choice(['M', 'F'])
    patients.append(
        (patient_id, first_name, last_name, birth_date, gender, f"MRN{patient_id:06d}")
    )

patient_columns = ['patient_id', 'first_name', 'last_name', 'date_of_birth', 'gender', 'mrn']
pd.DataFrame(patients, columns=patient_columns).to_csv('patients.csv', index=False)


In [5]:
num_providers = 1000

# Hand-written seed providers take ids 1-3.
providers = [
    (1, 'James', 'Chen', 'MD', 1, 1),
    (2, 'Sarah', 'Williams', 'MD', 2, 2),
    (3, 'Michael', 'Rodriguez', 'MD', 3, 3),
]

# Remaining providers get a fake name, the fixed 'MD' credential, and a
# specialty and department id each drawn uniformly from 1..20.
for provider_id in range(len(providers) + 1, num_providers + 1):
    first_name = fake.first_name()
    last_name = fake.last_name()
    specialty_id = random.randint(1, 20)
    department_id = random.randint(1, 20)
    providers.append((provider_id, first_name, last_name, 'MD', specialty_id, department_id))

provider_columns = ['provider_id', 'first_name', 'last_name', 'credential', 'specialty_id', 'department_id']
pd.DataFrame(providers, columns=provider_columns).to_csv('providers.csv', index=False)


In [6]:
num_encounters = 10000

# Row layout for every encounter tuple; also used as the CSV header.
encounter_columns = [
    'encounter_id', 'patient_id', 'provider_id', 'encounter_type',
    'encounter_date', 'discharge_date', 'department_id',
]

# Seed encounters.
# Fix: these rows previously had only six fields (no provider_id), so they did
# not line up with the seven column names used for the CSV. Each row now
# carries a provider_id in third position.
# NOTE(review): the provider ids chosen here (1, 1, 2, 3) pair each seed
# encounter with the seed provider whose department_id matches the
# encounter's department_id — confirm these are the intended providers.
encounters = [
    (1, 1, 1, 'Outpatient', '2024-05-10 10:00:00', '2024-05-10 11:30:00', 1),
    (2, 1, 1, 'Inpatient', '2024-06-02 14:00:00', '2024-06-06 09:00:00', 1),
    (3, 2, 2, 'Outpatient', '2024-05-15 09:00:00', '2024-05-15 10:15:00', 2),
    (4, 3, 3, 'ER', '2024-06-12 23:45:00', '2024-06-13 06:30:00', 3),
]

# Synthetic encounters continue numbering after the seed rows.
for i in range(len(encounters), num_encounters):
    patient_id = random.randint(1, num_patients)
    provider_id = random.randint(1, num_providers)
    encounter_type = random.choice(['Outpatient', 'Inpatient', 'ER'])
    start_date = fake.date_time_between(start_date='-2y', end_date='now')
    # Discharge is 0-5 days plus 1-12 hours after the start, so it is always
    # strictly later than the encounter start.
    discharge_date = start_date + pd.Timedelta(days=random.randint(0, 5), hours=random.randint(1, 12))
    department_id = random.randint(1, 20)
    encounters.append((
        i + 1,
        patient_id,
        provider_id,
        encounter_type,
        start_date.isoformat(sep=' '),
        discharge_date.isoformat(sep=' '),
        department_id,
    ))

# Guard against the seed/generated rows drifting out of step with the header again.
assert all(len(row) == len(encounter_columns) for row in encounters), \
    "every encounter row must have one value per column"

pd.DataFrame(encounters, columns=encounter_columns).to_csv('encounters.csv', index=False)


In [7]:
# Diagnoses (500): three hand-written seed rows followed by placeholder rows
# whose code is 'D' plus the zero-padded id and whose description is
# 'Diagnosis <id>'. Row layout: (diagnosis_id, icd10_code, icd10_description).
diagnoses = [
    (1, 'I10', 'Hypertension'),
    (2, 'E11.9', 'Type 2 Diabetes'),
    (3, 'I50.9', 'Heart Failure'),
]
diagnoses.extend(
    (diag_id, f"D{diag_id:04d}", f"Diagnosis {diag_id}")
    for diag_id in range(4, 501)
)

diagnosis_columns = ['diagnosis_id', 'icd10_code', 'icd10_description']
pd.DataFrame(diagnoses, columns=diagnosis_columns).to_csv('diagnoses.csv', index=False)

# Procedures (500): three hand-written seed rows followed by placeholder rows
# whose code is 'P' plus the zero-padded id and whose description is
# 'Procedure <id>'. Row layout: (procedure_id, cpt_code, cpt_description).
procedures = [
    (1, '99213', 'Office Visit'),
    (2, '93000', 'EKG'),
    (3, '71020', 'Chest X-ray'),
]
procedures.extend(
    (proc_id, f"P{proc_id:04d}", f"Procedure {proc_id}")
    for proc_id in range(4, 501)
)

procedure_columns = ['procedure_id', 'cpt_code', 'cpt_description']
pd.DataFrame(procedures, columns=procedure_columns).to_csv('procedures.csv', index=False)


In [8]:
# Encounter Diagnoses (20k rows)
# Each row links a uniformly random encounter to a uniformly random diagnosis
# with a sequence number in 1..3. The three draws happen in tuple order, left
# to right, for each row.
# NOTE(review): draws are independent, so one encounter can receive the same
# sequence number or the same diagnosis more than once.
enc_diag = [
    (
        link_id,
        random.randint(1, num_encounters),  # encounter_id
        random.randint(1, 500),             # diagnosis_id
        random.randint(1, 3),               # diagnosis_sequence
    )
    for link_id in range(1, 20001)
]

enc_diag_columns = ['encounter_diagnosis_id', 'encounter_id', 'diagnosis_id', 'diagnosis_sequence']
pd.DataFrame(enc_diag, columns=enc_diag_columns).to_csv('encounter_diagnoses.csv', index=False)

# Encounter Procedures (20k rows)
# Each row links a uniformly random encounter to a uniformly random procedure.
# NOTE(review): procedure_date is drawn on its own from the '-2y'..'today'
# window and is not tied to the linked encounter's dates.
enc_proc = []
for link_id in range(1, 20001):
    linked_encounter = random.randint(1, num_encounters)
    linked_procedure = random.randint(1, 500)
    performed_on = fake.date_between(start_date='-2y', end_date='today')
    enc_proc.append((link_id, linked_encounter, linked_procedure, performed_on))

enc_proc_columns = ['encounter_procedure_id', 'encounter_id', 'procedure_id', 'procedure_date']
pd.DataFrame(enc_proc, columns=enc_proc_columns).to_csv('encounter_procedures.csv', index=False)

# Billing (10k)
# One billing row per encounter id, with billing_id equal to encounter_id.
# NOTE(review): claim_date is drawn on its own from the '-2y'..'today' window
# and is not tied to the encounter's dates.
billing = []
for billing_id in range(1, num_encounters + 1):
    claim_amount = round(random.uniform(100, 10000), 2)
    # The allowed amount is 70%-95% of the claim, rounded to cents.
    allowed_ratio = random.uniform(0.7, 0.95)
    allowed_amount = round(claim_amount * allowed_ratio, 2)
    claim_date = fake.date_between(start_date='-2y', end_date='today')
    claim_status = random.choice(['Paid', 'Pending', 'Denied'])
    billing.append((billing_id, billing_id, claim_amount, allowed_amount, claim_date, claim_status))

billing_columns = ['billing_id', 'encounter_id', 'claim_amount', 'allowed_amount', 'claim_date', 'claim_status']
pd.DataFrame(billing, columns=billing_columns).to_csv('billing.csv', index=False)
