In [29]:
import hashlib
import random
import re
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from faker import Faker

In [30]:
# Seed both random sources used by the generator (Faker and the stdlib
# `random` module) so that a fresh "Restart & Run All" reproduces the dataset.
SEED=42
Faker.seed(SEED)
random.seed(SEED)
fake=Faker()

In [None]:
def generate_medical_dataset(n):
    """Generate a synthetic medical-encounter dataset with ``n`` rows.

    Each record combines fake personal data (name, SSN, contact details,
    payment card) with a simulated hospital episode: diagnosis, admission and
    release dates, optional death, optional transfer, optional follow-up and
    billing figures. Values are drawn from the module-level ``fake`` Faker
    instance and the stdlib ``random`` module.

    Parameters
    ----------
    n : int
        Number of records to generate.

    Returns
    -------
    pandas.DataFrame
        One row per generated record.
    """
    data=[]
    hospitals=["General Hospital","City Medical Center","University Hospital","Community Health","Regional Medical","ST. Mary's Hospital"]
    # (diagnosis name, ICD-10 code, category)
    conditions=[
        ("Hypertension","I10","Cardiovascular"),
        ("Diabetes Mellitus","E11","Endocrine"),
        ("Asthma","J45","Pulmonary"),
        ("Chronic Obstructive Pulmonary Disease","J44","Respiratory"),
        ("Heart Failure","I50","Cardiovascular"),
        ("Pneumonia","J18","Respiratory"),
        ("Chronic Kidney Disease","N18","Genitourinary"),
        ("Depression","F32","Mental Health"),
        ("Anxiety Disorder","F41","Mental Health"),
        ("Osteoarthritis","M19","Musculoskeletal")
        ]

    causes_of_death=[
        "Cardiac Arrest",
        "Respiratory Failure",
        "Sepsis",
        "Multiple Organ Failure",
        "Cerebral Hemorrhage",
        "Pulmonary Embolism",
        "Liver Failure",
        "Renal Failure",
        ]

    # Loop-invariant, so built once instead of on every iteration.
    insurance_providers=["ABSA Insurance",
                         "Hollard Insurance",
                         "Momentum Insurance",
                         "MiWay Insurance",
                         "OUTsurance",
                         "Santam Insurance",
                         "Standard Insurance",
                         "Old Mutual Insurance",
                         "Liberty Insurance"]

    today=datetime.now().date()

    for _ in range(n):
        # sensitive personal information
        first_name=fake.first_name()
        last_name=fake.last_name()
        dob=fake.date_of_birth(minimum_age=0,maximum_age=100)
        # Calendar-based age: subtract one year if the birthday has not yet
        # occurred this year (days//365 drifts because of leap years).
        age=today.year-dob.year-((today.month,today.day)<(dob.month,dob.day))
        ssn=fake.ssn()

        # sensitive financial information
        credit_card=fake.credit_card_number()
        credit_card_expiry=fake.credit_card_expire()
        credit_card_cvv=fake.credit_card_security_code()

        # contact information
        phone_number=fake.phone_number()
        email=fake.email()
        address=fake.address().replace("\n",", ")

        # medical episode
        condition,icd_10,category=random.choice(conditions)
        admission_date=fake.date_between(start_date='-2y',end_date='today')
        # Chosen up front so the transfer target below can exclude it.
        hospital_name=random.choice(hospitals)

        # determine if patient died (10% of records)
        died=random.choices([True,False],weights=[0.1,0.9])[0]
        if died:
            stay_days=random.randint(1,30)
            release_date=admission_date+timedelta(days=stay_days)
            cause_of_death=random.choice(causes_of_death)
            # the episode ends on the day of death
            death_date=release_date
        else:
            stay_days=random.randint(1,14)
            release_date=admission_date+timedelta(days=stay_days)
            cause_of_death=None
            death_date=None

        # 20% of records are transferred to a *different* hospital
        transferred=random.choices([True,False],weights=[0.2,0.8])[0]
        if transferred:
            other_hospitals=[h for h in hospitals if h!=hospital_name]
            transfer_hospitals=random.choice(other_hospitals)
            if stay_days>7:
                # keep the transfer at least 3 days away from both ends of the stay
                transfer_date=admission_date+timedelta(days=random.randint(3,stay_days-3))
            else:
                transfer_date=admission_date
        else:
            transfer_hospitals=None
            transfer_date=None

        # doctor appointments
        if not died and random.random()>0.3:        # 70% of survivors get a follow-up appointment
            follow_up_date=release_date+timedelta(days=random.randint(7,90))
        else:
            follow_up_date=None

        # insurance information: drawn once so the notes and the
        # `insurance_provider` column always agree
        insurance_provider=random.choice(insurance_providers)
        insurance_id=fake.random_int(1000000000,9999999999)

        # medical notes (mix of sensitive and clinical data)
        medical_notes=f"Patient presented with {condition}. "\
                    f"Contact: {phone_number}. "\
                    f"Email: {email}. "\
                    f"Address: {address}. "\
                    f"Insurance: {insurance_provider} . "\
                    f"SSN: {ssn}. "\
                    f"Clinical findings: {fake.text(max_nb_chars=400)}. "

        # payments information
        total_cost=round(random.uniform(5000,50000),2)
        insurance_coverage=round(total_cost*random.uniform(0.5,0.9),2)
        out_of_pocket=round(total_cost-insurance_coverage,2)

        # create record
        record={
            # highly sensitive personal info
            'patient_first_name':first_name,
            'patient_last_name':last_name,
            'patient_dob':dob,
            'patient_ssn':ssn,
            'patient_address':address,
            'patient_phone':phone_number,
            'patient_email':email,

            # sensitive financial information
            'patient_credit_card':credit_card,
            'patient_credit_card_expiry':credit_card_expiry,
            'patient_credit_card_cvv':credit_card_cvv,
            'insurance_provider':insurance_provider,
            'insurance_id':insurance_id,

            # clinical information
            'age':age,
            'admission_date':admission_date,
            'release_date':release_date,
            'primary_diagnosis':condition,
            'icd_10_code':icd_10,
            'diagnosis_category':category,
            'attending_physician':fake.name(),

            # sensitive outcomes
            'died':died,
            'cause_of_death':cause_of_death,
            'death_date':death_date,

            # hospital information
            'hospital_name':hospital_name,
            'transferred':transferred,
            'transfer_hospitals':transfer_hospitals,
            'transfer_date':transfer_date,

            # follow-up care
            'follow_up_appointment':follow_up_date,
            'follow_up_doctor':fake.name() if follow_up_date else None,

            # financial information
            'total_cost':total_cost,
            'insurance_coverage':insurance_coverage,
            'out_of_pocket':out_of_pocket,

            # medical notes (contains mix of sensitive information)
            'medical_notes':medical_notes,

            # Non-sensitive identifiers.
            # NOTE(review): these are drawn independently per row, so duplicate
            # IDs are possible for large n — confirm whether uniqueness matters.
            'patient_id':f"PAT{random.randint(100000,999999)}",
            'encounter_id':f"ENC{random.randint(1000000,9999999)}",
        }

        data.append(record)

    return pd.DataFrame(data)

Generate Dataset

In [48]:
# Build the synthetic dataset (100,000 records) used by the cells below.
print("Generating medical dataset...")
df_medical = generate_medical_dataset(n=100000)

Generating medical dataset...


In [None]:
# Show a short preview only: printing thousands of rows floods the notebook
# output and buries the summary lines that follow.
print("\nSample of Generated Data:")
print(df_medical[['patient_first_name',
                  'patient_last_name',
                  'primary_diagnosis',
                  'admission_date',
                  'died',
                  'patient_ssn']].head(10))
print(f"\nDataset shape: {df_medical.shape}")
print(f"Number of deaths: {df_medical['died'].sum()}")

SECURITY

In [None]:
def anonymize_medical_data(df_medical):
    """Return a de-identified copy of the medical dataset.

    The input frame is not modified. In the returned copy:

    * first and last names are replaced by the first 8 hex characters of
      their SHA-256 digest;
    * SSN and credit card number are masked down to their last four characters;
    * address, phone, e-mail, insurance id, date of birth, card expiry and
      CVV are overwritten with fixed placeholders;
    * in ``medical_notes`` the value of every ``Contact:``, ``Email:``,
      ``Address:``, ``Insurance:`` and ``SSN:`` field is replaced by
      ``REDACTED`` while the rest of the note (including the clinical
      findings) is kept.

    Parameters
    ----------
    df_medical : pandas.DataFrame
        Frame containing the columns referenced above.

    Returns
    -------
    pandas.DataFrame
        The de-identified copy.
    """
    df_anon=df_medical.copy()

    # hash sensitive identifiers
    # NOTE: an unsalted, truncated hash is pseudonymisation, not strong
    # anonymisation — common names can be recovered with a dictionary attack.
    def pseudonymize(value):
        return hashlib.sha256(str(value).encode()).hexdigest()[:8]

    for name_column in ('patient_first_name','patient_last_name'):
        df_anon[name_column]=df_anon[name_column].apply(pseudonymize)

    # remove or mask highly sensitive data
    df_anon['patient_ssn']='XXX-XX-'+df_anon['patient_ssn'].str[-4:]
    df_anon['patient_address']='REDACTED'
    df_anon['patient_phone']='REDACTED'
    df_anon['insurance_id']='REDACTED'
    df_anon['patient_email']='REDACTED'

    # mask credit card information
    df_anon['patient_credit_card']='XXXX-XXXX-XXXX-'+df_anon['patient_credit_card'].str[-4:]
    df_anon['patient_credit_card_expiry']='XX/XX'
    df_anon['patient_credit_card_cvv']='XXX'

    # remove specific dates but keep age
    df_anon['patient_dob']='REDACTED'

    # clean medical notes of PII
    # A PII field runs from its label up to the next known label (or the end
    # of the note). Matching label-to-label, rather than splitting on ". ",
    # keeps values that themselves contain periods (e.g. "Apt. 4") intact.
    pii_labels='Contact|Email|Address|Insurance|SSN'
    pii_field=re.compile(
        rf'\b({pii_labels}):.*?(?=\s*(?:(?:{pii_labels}|Clinical findings):|$))',
        flags=re.DOTALL,
    )

    def clean_notes(medical_notes):
        # Leave missing / non-text notes untouched instead of raising.
        if not isinstance(medical_notes,str):
            return medical_notes
        return pii_field.sub(r'\1: REDACTED.',medical_notes)

    df_anon['medical_notes']=df_anon['medical_notes'].apply(clean_notes)

    return df_anon

In [81]:
# Derive the shareable, de-identified frame from the raw dataset.
print("\nCreating anonymized version...")
df_anon = anonymize_medical_data(df_medical=df_medical)


Creating anonymized version...


In [None]:
# Columns shown in the preview of the anonymized frame.
anon_preview_columns = [
    'patient_first_name',
    'patient_last_name',
    'primary_diagnosis',
    'admission_date',
    'died',
    'patient_credit_card',
]

print("\nAnonymizing medical dataset...")
anon_preview = df_anon[anon_preview_columns].head(10)
print(anon_preview)

In [None]:
print("\n===MEDICAL DATA ANALYSIS")

# Per-diagnosis summary: death total and record count, plus mean/min/max of
# age and of total cost, rounded to two decimals.
diagnosis_aggregations = {
    'died': ['sum', 'count'],
    'age': ['mean', 'min', 'max'],
    'total_cost': ['mean', 'min', 'max'],
}
mortality_by_diagnosis = (
    df_medical
    .groupby('primary_diagnosis')
    .agg(diagnosis_aggregations)
    .round(2)
)

print("\nMortality by Diagnosis:")
print(mortality_by_diagnosis)

SAVING DATA

In [None]:
# Raw dataset (still contains every sensitive column) — for engineers only.
df_medical.to_csv(
    "private_medical_data.csv",
    index=False,
)

In [82]:
# Anonymized dataset — the version intended for public release.
df_anon.to_csv(
    "public_medical_data_.csv",
    index=False,
)