In [1]:
import pandas as pd
import random
import uuid
from datetime import datetime, timedelta
# Replace with your actual file name
df = pd.read_csv('healthcare_dataset.csv')

# Show the first few rows
df.head()

# === 2. Constants & Setup ===
print("Setting up constants and province mappings...")
random.seed(42)
provinces = ['AB', 'BC', 'MB', 'NB', 'NL', 'NT', 'NS', 'NU', 'ON', 'PE', 'QC', 'SK', 'YT']

area_codes = {
    'AB': ['403', '587', '780', '825'], 'BC': ['236', '250', '604', '778'],
    'MB': ['204', '431'], 'NB': ['506'], 'NL': ['709'], 'NT': ['867'],
    'NS': ['902', '782'], 'NU': ['867'], 'ON': ['226', '289', '343', '416', '437', '519', '613', '647', '705', '807', '905'],
    'PE': ['902'], 'QC': ['418', '438', '450', '514', '579', '581', '819', '873'],
    'SK': ['306', '639'], 'YT': ['867']
}

province_cities = {
    'AB': ['Calgary', 'Edmonton', 'Red Deer'], 'BC': ['Vancouver', 'Victoria', 'Kelowna'],
    'MB': ['Winnipeg', 'Brandon'], 'NB': ['Fredericton', 'Moncton'], 'NL': ["St. John's", 'Corner Brook'],
    'NT': ['Yellowknife'], 'NS': ['Halifax', 'Sydney'], 'NU': ['Iqaluit'],
    'ON': ['Toronto', 'Ottawa', 'Hamilton'], 'PE': ['Charlottetown'],
    'QC': ['Montreal', 'Quebec City', 'Sherbrooke'], 'SK': ['Regina', 'Saskatoon'], 'YT': ['Whitehorse']
}

province_clinics = {
    'AB': ['Alberta Health Clinic', 'Calgary Wellness Centre'],
    'BC': ['Vancouver Family Clinic', 'Kelowna Care Centre'],
    'MB': ['Winnipeg Medical Hub'], 'NB': ['Moncton HealthCare'],
    'NL': ["St. John's Clinic"], 'NT': ['Yellowknife Medical'],
    'NS': ['Halifax Diagnostic Centre'], 'NU': ['Iqaluit Wellness Clinic'],
    'ON': ['Toronto Medical Group', 'Ottawa Family Health'],
    'PE': ['Charlottetown Health Clinic'],
    'QC': ['Montreal Specialist Clinic', 'Quebec City Medical'],
    'SK': ['Regina Care Clinic'], 'YT': ['Whitehorse Health Services']
}

ctas_symptoms_mapping = {
    'Cancer': {
        1: 'Severe pain, weight loss, fatigue, bleeding',
        2: 'Moderate pain, fatigue, unexplained weight loss',
        3: 'Mild pain, fatigue, general malaise',
        4: 'Minimal pain, occasional fatigue',
        5: 'Mild cold, localized pain, no systemic symptoms, regular follow up'
    },
    'Obesity': {
        1: 'Severe difficulty breathing, chest pain',
        2: 'Moderate shortness of breath, tiredness',
        3: 'Mild fatigue, shortness of breath after exertion',
        4: 'Occasional fatigue, no other symptoms',
        5: 'No significant symptoms - Regualar follow up'
    },
    'Diabetes': {
        1: 'Severe hyperglycemia, confusion, coma',
        2: 'Moderate hyperglycemia, dehydration, dizziness',
        3: 'Mild fatigue, frequent urination, thirst',
        4: 'Occasional dizziness, mild thirst',
        5: 'No noticeable symptoms'
    },
    'Asthma': {
        1: 'Severe shortness of breath, wheezing, chest tightness',
        2: 'Moderate shortness of breath, wheezing, fatigue',
        3: 'Mild wheezing, shortness of breath on exertion',
        4: 'Occasional wheezing, slight shortness of breath',
        5: 'No noticeable symptoms'
    },
    'Hypertension': {
        1: 'Severe chest pain, severe headache, confusion',
        2: 'Moderate headache, dizziness, blurred vision',
        3: 'Mild headache, dizziness, fatigue',
        4: 'Occasional dizziness, no headache',
        5: 'No noticeable symptoms'
    },
    'Arthritis': {
        1: 'Severe joint pain, swelling, redness',
        2: 'Moderate joint pain, stiffness, mild swelling',
        3: 'Mild joint pain, occasional stiffness',
        4: 'Occasional joint discomfort',
        5: 'Minimal pain, no swelling or stiffness'
    }
}

# Define allowed CTAS levels based on Admission Type and Test Result
ctas_level_map = {
    'Emergency': {
        'Normal': [4, 5],
        'Abnormal': [1, 2],
        'Inconclusive': [1, 2]
    },
    'Urgent': {
        'Normal': [5],
        'Abnormal': [2, 3],
        'Inconclusive': [2, 3]
    },
    'Elective': {
        'Normal': [5],
        'Abnormal': [3],
        'Inconclusive': [4, 5]
    }
}

department_mapping = {
    ('Cancer', 2): ['Oncology', 'Gynecology', 'Neurology'],
    ('Cancer', 3): ['Oncology', 'Gastroenterology'],
    ('Cancer', 4): ['Oncology', 'General Medicine'],
    ('Cancer', 5): ['Oncology', 'Family Medicine'],
    ('Obesity', 2): ['Internal Medicine', 'Cardiology'],
    ('Obesity', 3): ['Internal Medicine', 'Respiratory'],
    ('Obesity', 4): ['Internal Medicine'],
    ('Obesity', 5): ['Family Medicine', 'Physiotherapy'],
    ('Diabetes', 2): ['Endocrinology', 'Neurology'],
    ('Diabetes', 3): ['Endocrinology', 'General Medicine'],
    ('Diabetes', 4): ['Endocrinology', 'Geriatrics'],
    ('Diabetes', 5): ['Endocrinology', 'Nursing', 'Home Care'],
    ('Asthma', 1): ['Emergency', 'Pulmonology'],
    ('Asthma', 2): ['Pulmonology', 'ENT'],
    ('Asthma', 3): ['Pulmonology'],
    ('Asthma', 4): ['Family Medicine', 'ENT'],
    ('Asthma', 5): ['Family Medicine'],
    ('Hypertension', 2): ['Cardiology', 'Emergency'],
    ('Hypertension', 3): ['Cardiology'],
    ('Hypertension', 4): ['Family Medicine', 'Internal Medicine'],
    ('Arthritis', 3): ['Rheumatology', 'Orthopedics'],
    ('Arthritis', 4): ['Orthopedics', 'General Medicine'],
    ('Arthritis', 5): ['Rheumatology', 'Family Medicine']
}

# Map CTAS Level to Urgency
ctas_to_urgency = {
    1: 'Urgent',
    2: 'Urgent',
    3: 'Routine / Urgent',
    4: 'Routine',
    5: 'Routine'
}

# === 3. Generators ===
print("Defining data generation functions...")
def generate_phn(province):
    if province in ['AB', 'MB', 'NB', 'NL', 'NT', 'NS', 'NU', 'PE', 'SK', 'YT']:
        return str(random.randint(100000000, 999999999))
    elif province == 'BC':
        return str(random.randint(9000000000, 9999999999))
    elif province == 'ON':
        return f"{random.randint(1000000000, 9999999999)}-{random.choice(['AA', 'AB', 'AC', 'AD', 'AE'])}"
    elif province == 'QC':
        return f"{random.choice(['LENO', 'DURO', 'SMIT', 'BORD'])}{random.randint(100000000, 999999999)}"

def generate_phone_number(province):
    code = random.choice(area_codes[province])
    return f"({code}) {random.randint(100,999)}-{random.randint(1000,9999)}"

def generate_postal_code():
    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    return f"{random.choice(letters)}{random.randint(1,9)}{random.choice(letters)} {random.randint(1,9)}{random.choice(letters)}{random.randint(1,9)}"

def generate_address(province):
    city = random.choice(province_cities[province])
    street = f"{random.randint(100, 9999)} {random.choice(['Main St', 'Maple Ave', 'Oak St', 'Elm Rd', 'Pine Cres'])}"
    return f"{street}, {city}, {province}, {generate_postal_code()}"

def build_physician_directory():
    return {
        province: [
            {
                "name": f"Dr. {random.choice(['Smith', 'Lee', 'Patel', 'Wong', 'Martin'])}",
                "address": generate_address(province),
                "phone": generate_phone_number(province)
            } for _ in range(3)
        ] for province in provinces
    }

def extract_physician_info(province):
    physicians = build_physician_directory()
    selected = random.choice(physicians[province])
    return pd.Series([selected['name'], selected['phone'], selected['address']])

def generate_referral_between(adm_date, dis_date):
    if pd.isna(adm_date) or pd.isna(dis_date) or adm_date >= dis_date:
        return adm_date
    return adm_date + timedelta(days=random.randint(0, (dis_date - adm_date).days))

# Function to extract and mix symptoms
def assign_symptom(row):
    admission_type = row['Admission Type']
    test_result = row['Test Results']
    condition = row['Medical Condition']

    allowed_ctas_levels = ctas_level_map.get(admission_type, {}).get(test_result, [5])
    chosen_ctas = random.choice(allowed_ctas_levels)

    # Get base symptoms
    base_symptoms = ctas_symptoms_mapping[condition][chosen_ctas].split(', ')

    # Optionally add symptoms from an adjacent CTAS level
    adjacent_ctas = random.choice([lvl for lvl in range(1, 6) if lvl != chosen_ctas])
    extra_symptoms = ctas_symptoms_mapping[condition][adjacent_ctas].split(', ')

    # Mix symptoms (select 1 to 3)
    all_possible = list(set(base_symptoms + extra_symptoms))
    selected_symptoms = random.sample(all_possible, k=min(len(all_possible), random.randint(1, 3)))

    return chosen_ctas, ', '.join(selected_symptoms)

def generate_referral_reason(row):
    # Example logic for generating a referral reason
    condition = row['Medical Condition']
    symptoms = row['Symptoms']
    department = row['Referred Department']
    reason = f"Patient with {condition} showing symptoms of {symptoms} requires referral to a {department}"
    return reason

def get_department(row):
    key = (row['Medical Condition'], row['CTAS Level'])
    departments = department_mapping.get(key, ['General Medicine'])
    return random.choice(departments)

# Define how urgency is assigned, with randomness for level 3
def get_urgency(ctas_level):
    if ctas_level == 3:
        return random.choice(['Urgent', 'Routine'])
    elif ctas_level in [1, 2]:
        return 'Urgent'
    else:  # 4 or 5
        return 'Routine'

# Assign churn: 1 if urgent, 0 if routine
def assign_churn_from_ctas(ctas_level):
    urgency = get_urgency(ctas_level)
    return 1 if urgency == 'Urgent' else 0

# === 4. Apply Generators to DataFrame ===
print("Assigning province, PHN, phone, and address...")
df['Province'] = random.choices(provinces, k=len(df))
df['PHN'] = df['Province'].apply(generate_phn)
df['Patient Phone'] = df['Province'].apply(generate_phone_number)
df['Patient Address'] = df['Province'].apply(generate_address)

print("Assigning referred physicians...")
df[['Referred Physician', 'Physician Phone', 'Physician Address']] = df['Province'].apply(extract_physician_info)

# Dates
print("Parsing and generating dates...")
df['Date of Admission'] = pd.to_datetime(df['Date of Admission'], errors='coerce')
df['Discharge Date'] = pd.to_datetime(df['Discharge Date'], errors='coerce')
df['Referral Date'] = df.apply(lambda row: generate_referral_between(row['Date of Admission'], row['Discharge Date']), axis=1)
df['Referral Date'] = df['Referral Date'].dt.strftime('%Y-%m-%d')

df['Referral ID'] = [f"REF-{i:05d}" for i in range(1, len(df) + 1)]
df['Clinic Name'] = df['Province'].apply(lambda p: random.choice(province_clinics[p]))
print("Finished assigning clinic and referral IDs.")

# Apply function
print("Mapping CTAS level to symptoms...")
df[['CTAS Level', 'Symptoms']] = df.apply(assign_symptom, axis=1, result_type='expand')
df['Referred Department'] = df.apply(get_department, axis=1)
df['Reason for Referral'] = df.apply(generate_referral_reason, axis=1)
print("Assigned Department and Reason for Referral.")

fatal_conditions = ['Cancer', 'Diabetes', 'Asthma', 'Hypertension']

df['Condition Severity'] = df['Medical Condition'].apply(
    lambda x: 'Fatal' if x in fatal_conditions else 'Non-Fatal'
)

bins = [0, 17, 64, 120]  # define age ranges
labels = ['Adolescent', 'Adult', 'Senior']

df['Age Group'] = pd.cut(df['Age'], bins=bins, labels=labels, right=True)

def categorize_age(age):
    if age <= 17:
        return 'Adolescent'
    elif age <= 64:
        return 'Adult'
    else:
        return 'Senior'

df['Age Group'] = df['Age'].apply(categorize_age)

Setting up constants and province mappings...
Defining data generation functions...
Assigning province, PHN, phone, and address...
Assigning referred physicians...
Parsing and generating dates...
Finished assigning clinic and referral IDs.
Mapping CTAS level to symptoms...
Assigned Department and Reason for Referral.


In [2]:
def assign_churn(row):
    condition_severity = row['Condition Severity']
    age_group = row['Age Group']
    admission_type = row['Admission Type']
    test_result = row['Test Results']
    ctas_level = row['CTAS Level']

    # Define the CTAS level ranges for churn=1 based on your image
    if condition_severity == 'Fatal':
        if age_group == 'Adolescent':
            if admission_type == 'Elective':
                if test_result == 'Normal' and ctas_level == 1:
                    return 1
                elif test_result == 'Abnormal' and 1 <= ctas_level <= 4:
                    return 1
                elif test_result == 'Inconclusive' and 1 <= ctas_level <= 3:
                    return 1
            elif admission_type == 'Urgent':
                if test_result == 'Normal' and 1 <= ctas_level <= 2:
                    return 1
                elif test_result == 'Abnormal' and 1 <= ctas_level <= 4:
                    return 1
                elif test_result == 'Inconclusive' and 1 <= ctas_level <= 3:
                    return 1
            elif admission_type == 'Emergency':
                if test_result == 'Normal' and 1 <= ctas_level <= 3:
                    return 1
                elif test_result == 'Abnormal' and 1 <= ctas_level <= 5:
                    return 1
                elif test_result == 'Inconclusive' and 1 <= ctas_level <= 4:
                    return 1

        elif age_group == 'Adult':
            if admission_type == 'Elective':
                if test_result == 'Normal' and ctas_level == 1:
                    return 1
                elif test_result == 'Abnormal' and 1 <= ctas_level <= 2:
                    return 1
                elif test_result == 'Inconclusive' and 1 <= ctas_level <= 2:
                    return 1
            elif admission_type == 'Urgent':
                if test_result == 'Normal' and ctas_level == 1:
                    return 1
                elif test_result == 'Abnormal' and 1 <= ctas_level <= 3:
                    return 1
                elif test_result == 'Inconclusive' and 1 <= ctas_level <= 2:
                    return 1
            elif admission_type == 'Emergency':
                if test_result == 'Normal' and 1 <= ctas_level <= 2:
                    return 1
                elif test_result == 'Abnormal' and 1 <= ctas_level <= 4:
                    return 1
                elif test_result == 'Inconclusive' and 1 <= ctas_level <= 3:
                    return 1

        elif age_group == 'Senior':
            if admission_type == 'Elective':
                if test_result == 'Normal' and 1 <= ctas_level <= 3:
                    return 1
                elif test_result == 'Abnormal' and 1 <= ctas_level <= 4:
                    return 1
                elif test_result == 'Inconclusive' and 1 <= ctas_level <= 3:
                    return 1
            elif admission_type == 'Urgent':
                if test_result == 'Abnormal' and 1 <= ctas_level <= 3:
                    return 1
                elif test_result == 'Normal' and 1 <= ctas_level <= 5:
                    return 1
                elif test_result == 'Inconclusive' and 1 <= ctas_level <= 3:
                    return 1
            elif admission_type == 'Emergency':
                if test_result == 'Abnormal' and 1 <= ctas_level <= 5:
                    return 1
                elif test_result == 'Normal' and 1 <= ctas_level <= 4:
                    return 1
                elif test_result == 'Inconclusive' and 1 <= ctas_level <= 4:
                    return 1

    elif condition_severity == 'Non-Fatal':
        if age_group == 'Adolescent':
            if admission_type == 'Elective':
                if test_result == 'Normal' and ctas_level == 1:
                    return 1
                elif test_result == 'Abnormal' and 1 <= ctas_level <= 3:
                    return 1
                elif test_result == 'Inconclusive' and 1 <= ctas_level <= 2:
                    return 1
            elif admission_type == 'Urgent':
                if test_result == 'Normal' and ctas_level == 1:
                    return 1
                elif test_result == 'Abnormal' and 1 <= ctas_level <= 3:
                    return 1
                elif test_result == 'Inconclusive' and 1 <= ctas_level <= 2:
                    return 1
            elif admission_type == 'Emergency':
                if test_result == 'Normal' and 1 <= ctas_level <= 3:
                    return 1
                elif test_result == 'Abnormal' and 1 <= ctas_level <= 4:
                    return 1
                elif test_result == 'Inconclusive' and 1 <= ctas_level <= 3:
                    return 1

        elif age_group == 'Adult':
            if admission_type == 'Elective':
                if test_result == 'Normal' and ctas_level == 1:
                    return 1
                elif test_result == 'Abnormal' and 1 <= ctas_level <= 2:
                    return 1
                elif test_result == 'Inconclusive' and 1 <= ctas_level <= 2:
                    return 1
            elif admission_type == 'Urgent':
                if test_result == 'Normal' and ctas_level == 1:
                    return 1
                elif test_result == 'Abnormal' and 1 <= ctas_level <= 3:
                    return 1
                elif test_result == 'Inconclusive' and 1 <= ctas_level <= 2:
                    return 1
            elif admission_type == 'Emergency':
                if test_result == 'Normal' and ctas_level == 1:
                    return 1
                elif test_result == 'Abnormal' and 1 <= ctas_level <= 4:
                    return 1
                elif test_result == 'Inconclusive' and 1 <= ctas_level <= 3:
                    return 1

        elif age_group == 'Senior':
            if admission_type == 'Elective':
                if test_result == 'Normal' and 1 <= ctas_level <= 2:
                    return 1
                elif test_result == 'Abnormal' and 1 <= ctas_level <= 4:
                    return 1
                elif test_result == 'Inconclusive' and 1 <= ctas_level <= 3:
                    return 1
            elif admission_type == 'Urgent':
                if test_result == 'Normal' and 1 <= ctas_level <= 3:
                    return 1
                elif test_result == 'Abnormal' and 1 <= ctas_level <= 4:
                    return 1
                elif test_result == 'Inconclusive' and 1 <= ctas_level <= 3:
                    return 1
            elif admission_type == 'Emergency':
                if test_result == 'Normal' and 1 <= ctas_level <= 3:
                    return 1
                elif test_result == 'Abnormal' and 1 <= ctas_level <= 5:
                    return 1
                elif test_result == 'Inconclusive' and 1 <= ctas_level <= 4:
                    return 1

    # Default to 0 if none of the conditions are met
    return 0

# Apply the function
df['Churn'] = df.apply(assign_churn, axis=1)
print(df[['Condition Severity', 'Age Group', 'Admission Type', 'Test Results', 'CTAS Level', 'Churn']].sample(10))

      Condition Severity Age Group Admission Type  Test Results  CTAS Level  \
16044              Fatal     Adult         Urgent      Abnormal           2   
28580              Fatal    Senior       Elective        Normal           5   
27874          Non-Fatal     Adult      Emergency        Normal           4   
3221           Non-Fatal    Senior      Emergency        Normal           4   
11251              Fatal     Adult       Elective      Abnormal           3   
36711          Non-Fatal    Senior      Emergency  Inconclusive           1   
29149              Fatal    Senior       Elective        Normal           5   
11313              Fatal     Adult         Urgent  Inconclusive           2   
39678          Non-Fatal     Adult      Emergency      Abnormal           2   
35362              Fatal     Adult         Urgent      Abnormal           2   

       Churn  
16044      1  
28580      0  
27874      0  
3221       0  
11251      0  
36711      1  
29149      0  
11313     

In [3]:
df['Churn'].value_counts()

Churn
0    29234
1    26266
Name: count, dtype: int64

In [4]:
# Filter and display only rows where Churn == 1
positive_churn_df = df[df['Age Group'] == 'Adolescent']
positive_churn_df[['Condition Severity', 'Age Group', 'Admission Type', 'Test Results', 'CTAS Level', 'Churn']]

Unnamed: 0,Condition Severity,Age Group,Admission Type,Test Results,CTAS Level,Churn
50038,Fatal,Adolescent,Emergency,Inconclusive,1,1
50150,Fatal,Adolescent,Emergency,Abnormal,1,1
50210,Fatal,Adolescent,Emergency,Normal,5,0
50296,Fatal,Adolescent,Elective,Abnormal,3,1
50335,Fatal,Adolescent,Emergency,Inconclusive,2,1
...,...,...,...,...,...,...
55227,Non-Fatal,Adolescent,Elective,Abnormal,3,1
55262,Non-Fatal,Adolescent,Emergency,Inconclusive,1,1
55382,Fatal,Adolescent,Urgent,Abnormal,3,1
55456,Fatal,Adolescent,Urgent,Abnormal,3,1


In [5]:
# === 5. Save to CSV and Provide Download Link ===
output_filename = 'synthetic_healthcare_data.csv'
df.to_csv(output_filename, index=False)