In [47]:
import numpy as np 
import pandas as pd 


In [48]:
np.random.seed(42)

In [49]:
# Number of records
N = 50000

In [50]:
# County population weights ( top counties from california)
counties = [
    ("Los Angeles", "06037", 0.25),
    ("San Diego", "06073", 0.08),
    ("Orange", "06059", 0.08),
    ("Riverside", "06065", 0.06),
    ("San Bernardino", "06071", 0.05),
    ("Santa Clara", "06085", 0.05),
    ("Alameda", "06001", 0.04),
    ("Sacramento", "06067", 0.03),
    ("Contra Costa", "06013", 0.03),
    ("Other Counties", "06999", 0.33)
]

county_df = pd.DataFrame(counties, columns=["county", "county_code", "weight"])

In [51]:
# Adding Household structure
household_type_probs = {
    "living_alone": 0.29, # single-person
    "married_no_children": 0.28,
    "married_with_children": 0.20,
    "single_parent": 0.13,
    "rommates": 0.03,
    "multigenerational": 0.07
}

hh_types = list(household_type_probs.keys())
hh_probs = list(household_type_probs.values())

# generating household structure

def generate_household_structure(hh_type):
    
    if hh_type == "living_alone":
        if np.random.rand() < 0.50:
            return {"adults": 1, "children": 0, "elders": 0}
        else:
            return {"adults": 0, "children": 0, "elders": 1}
        
    if hh_type == "married_no_children":
        return {"adults": 2, "children": 0, "elders": 0}
    
    if hh_type == "married_with_children":
        num_children = np.random.choice([1,2,3], p=[0.6,0.3,0.1])
        return {"adults": 2, "children": num_children, "elders": 0}
    
    if hh_type == "single_parent":
        num_children = np.random.choice([1,2,3], p=[0.7,0.2,0.1])
        return {"adults": 1, "children": num_children, "elders": 0}
    
    if hh_type == "rommates":
        num_adults = np.random.choice([2,3,4], p=[0.7,0.2,0.1])
        return {"adults": num_adults, "children": 0, "elders": 0}
    
    if hh_type == "multigenerational":
        num_children = np.random.choice([1,2], p=[0.7,0.3])
        return {"adults": 1, "children": num_children, "elders": 1}
    
    raise ValueError(f"Unknown household type: {hh_type}")
    

In [52]:
# generating ages consistent with household structure
def generate_ages_for_household(struct):
    ages = []

    # Elders: 65-90
    for _ in range(struct["elders"]):
        ages.append(np.random.randint(65, 91))
    
    # Adults: 25-60
    for _ in range(struct["adults"]):
        ages.append(np.random.randint(25, 61))
    
    # children: 1-17
    for _ in range(struct["children"]):
        parent_candidates = [a for a in ages if a >= 25]
        if parent_candidates:
            parent_age = np.random.choice(parent_candidates)
            max_child_age = max(0, parent_age - 18)
            child_age = np.random.randint(0, min(17, max_child_age) + 1)
        
        else:
            child_age = np.random.randint(0, 18)
        ages.append(child_age)
    return ages

In [53]:
# Build households untile we reach N people
household_records = []
person_counter = 0
household_id = 1

while person_counter < N:
    hh_type = np.random.choice(hh_types, p=hh_probs)
    struct = generate_household_structure(hh_type)
    ages = generate_ages_for_household(struct)

    if person_counter + len(ages) > N:
        ages = ages[: (N - person_counter)]

    for age in ages:
        household_records.append({
            "household_id": household_id,
            "age": age,
            "household_type": hh_type
        })

    person_counter += len(ages)
    household_id += 1

households_df = pd.DataFrame(household_records)

# Household size
households_df["household_size"] = households_df["household_id"].map(
    households_df["household_id"].value_counts()
)

# Multigenerational flag (based on type)
households_df["multigenerational"] = (households_df["household_type"] == "multigenerational").astype(int)

In [54]:
# Sample counties for each household
unique_households = households_df["household_id"].unique()

# Sample counties for each household, using county weights
sampled_counties = county_df.sample(
    n=len(unique_households),
    replace=True,
    weights=county_df["weight"]
).reset_index(drop=True)

household_to_county = dict(zip(unique_households, sampled_counties["county"]))
household_to_code = dict(zip(unique_households, sampled_counties["county_code"]))

households_df["county"] = households_df["household_id"].map(household_to_county)
households_df["county_code"] = households_df["household_id"].map(household_to_code)


In [55]:
# Occupations & Frontline status
occupations = [
    ("Healthcare", 0.09, 1),
    ("Education", 0.06, 1),
    ("Retail/Food", 0.10, 1),
    ("Transportation", 0.09, 1),
    ("Manufacturing", 0.07, 0),
    ("Office/Tech", 0.20, 0),
    ("Construction", 0.05, 0),
    ("Unemployed", 0.055, 0),
    ("Retired", 0.17, 0),
    ("Other", 0.135, 0)
]

occ_df = pd.DataFrame(
    occupations,
    columns=["occupation", "prob", "frontline"]
)

occ_sample = occ_df.sample(
    n = len(households_df),
    replace=True,
    weights=occ_df["prob"]
).reset_index(drop=True)

households_df["occupation"] = occ_sample["occupation"]
households_df["frontline_worker"] = occ_sample["frontline"]

In [56]:
# Daily Activity level
def daily_activity(age, occupation):
    if age>= 65:
        return np.random.choice(["Low", "Moderate"], p=[0.65, 0.35])
    if occupation in ["Construction","Transportation"]:
        return np.random.choice(["Moderate", "High"], p=[0.6, 0.4])
    if occupation in ["Office/Tech"]:
        return np.random.choice(["Low", "Moderate"], p=[0.55, 0.45])
    return np.random.choice(["low","Moderate","High"], p=[0.4, 0.4, 0.2])

households_df["daily_activity_level"] = [
    daily_activity(a, o)
    for a, o in zip(households_df["age"], households_df["occupation"])
]



In [57]:
# Health Conditions
def asthma_prob(age):
    if age < 18: return 0.10
    if age < 35: return 0.09
    if age < 50: return 0.08
    return 0.07

def copd_prob(age):
    if age < 35: return 0.01
    if age < 50: return 0.03
    if age < 65: return 0.06
    return 0.10

def other_resp_prob(age):
    if age < 35: return 0.01
    if age < 65: return 0.02
    return 0.03

def hypertension_prob(age):
    if age < 35: return 0.10
    if age < 50: return 0.22
    if age < 65: return 0.40
    return 0.55

def heart_disease_prob(age):
    if age < 40: return 0.02
    if age < 55: return 0.05
    if age < 65: return 0.10
    return 0.18

def stroke_prob(age):
    if age < 45: return 0.01
    if age < 65: return 0.03
    return 0.07

def diabetes_prob(age):
    if age < 35: return 0.05
    if age < 50: return 0.10
    if age < 65: return 0.18
    return 0.28

def obesity_prob(age, activity_level):
    base_prob = 0.28 if age >= 18 else 0.20
    if activity_level == "Low": base_prob +=0.10
    elif activity_level == "High": base_prob -=0.10
    return min(max(base_prob, 0.0), 1.0)

def smoking_prob(age):
    if age < 18: return 0.02
    if age < 35: return 0.12
    if age < 50: return 0.15
    return 0.10

# Apply health conditions
households_df["has_asthma"] = [
    np.random.rand() < asthma_prob(a) for a in households_df["age"]
]
households_df["has_copd_or_chronic_resp_disease"] = [
    np.random.rand() < copd_prob(a) for a in households_df["age"]
]
households_df["has_other_respiratory_issue"] = [
    np.random.rand() < other_resp_prob(a) for a in households_df["age"]
]
households_df["has_hypertension"] = [
    np.random.rand() < hypertension_prob(a) for a in households_df["age"]
]
households_df["has_heart_disease"] = [
    np.random.rand() < heart_disease_prob(a) for a in households_df["age"]
]
households_df["has_stroke_history"] = [
    np.random.rand() < stroke_prob(a) for a in households_df["age"]
]
households_df["has_diabetes"] = [
    np.random.rand() < diabetes_prob(a) for a in households_df["age"]
]
households_df["has_obesity"] = [
    np.random.rand() < obesity_prob(a, dal)
    for a, dal in zip(households_df["age"], households_df["daily_activity_level"])
]
households_df["has_smoking_history"] = [
    np.random.rand() < smoking_prob(a) for a in households_df["age"]
]


In [58]:
# SES QUINTILE
def ses_quintile(county, occupation):
    base = np.random.choice([1,2,3,4,5], p=[0.2, 0.2, 0.2, 0.2, 0.2])
    if county in ["Los Angeles", "San Bernardino", "Riverside"]: base -=1
    if county in ["Santa Clara", "Contra Costa"]: base +=1
    if occupation in ["Healthcare", "Office/Tech"]: base +=1
    if occupation in ["Retail/Food", "Unemployed"]: base -=1
    return min(max(base, 1), 5)

households_df["ses_quintile"] = [
    ses_quintile(c, o)
    for c, o in zip(households_df["county"], households_df["occupation"])
]


In [None]:
# long-term care resident flag
def ltc_prob(age, comorbidities):
    if age < 65:
        return 0.0
    base = 0.003  # ~0.3% among 65+
    if comorbidities.sum() > 2:
        base *= 2
    return base

comorb_cols = [
    "has_asthma",
    "has_copd_or_chronic_resp_disease",
    "has_other_respiratory_issue",
    "has_hypertension",
    "has_heart_disease",
    "has_stroke_history",
    "has_diabetes",
    "has_obesity",
    "has_smoking_history"
]

households_df["is_long_term_care"] = [
    np.random.rand() < ltc_prob(row["age"], row[comorb_cols])
    for _, row in households_df.iterrows()
]


In [None]:
# Final Dataset
households_df = households_df.reset_index(drop=True)
households_df["id"] = np.arange(1, len(households_df) + 1)

df = households_df[
    [
        "id",
        "age",
        "county",
        "county_code",
        "household_id",
        "household_size",
        "household_type",
        "multigenerational",
        "occupation",
        "frontline_worker",
        "daily_activity_level",
        "has_asthma",
        "has_copd_or_chronic_resp_disease",
        "has_other_respiratory_issue",
        "has_hypertension",
        "has_heart_disease",
        "has_stroke_history",
        "has_diabetes",
        "has_obesity",
        "has_smoking_history",
        "ses_quintile",
        "is_long_term_care"
    ]
]

print("N individuals:", len(df))
print("Average household size:", df["household_size"].mean())
print(df.head())


N individuals: 50000
Average household size: 2.63832
   id  age        county county_code  household_id  household_size  \
0   1   53  Contra Costa       06013             1               2   
1   2   39  Contra Costa       06013             1               2   
2   3   43       Alameda       06001             2               3   
3   4   47       Alameda       06001             2               3   
4   5   10       Alameda       06001             2               3   

          household_type  multigenerational   occupation  frontline_worker  \
0    married_no_children                  0        Other                 0   
1    married_no_children                  0  Office/Tech                 0   
2  married_with_children                  0   Healthcare                 1   
3  married_with_children                  0        Other                 0   
4  married_with_children                  0      Retired                 0   

   ... has_copd_or_chronic_resp_disease  has_other_respir

In [66]:
df

Unnamed: 0,id,age,county,county_code,household_id,household_size,household_type,multigenerational,occupation,frontline_worker,...,has_copd_or_chronic_resp_disease,has_other_respiratory_issue,has_hypertension,has_heart_disease,has_stroke_history,has_diabetes,has_obesity,has_smoking_history,ses_quintile,is_long_term_care
0,1,53,Contra Costa,06013,1,2,married_no_children,0,Other,0,...,False,False,False,False,False,False,True,False,3,False
1,2,39,Contra Costa,06013,1,2,married_no_children,0,Office/Tech,0,...,False,False,False,False,False,False,True,False,5,False
2,3,43,Alameda,06001,2,3,married_with_children,0,Healthcare,1,...,False,False,False,True,False,True,False,False,2,False
3,4,47,Alameda,06001,2,3,married_with_children,0,Other,0,...,False,False,False,False,False,True,False,True,4,False
4,5,10,Alameda,06001,2,3,married_with_children,0,Retired,0,...,False,False,True,False,False,False,False,False,5,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,49996,56,Los Angeles,06037,23095,4,married_with_children,0,Manufacturing,0,...,False,False,False,False,False,False,False,False,4,False
49996,49997,60,Los Angeles,06037,23095,4,married_with_children,0,Retail/Food,1,...,False,False,False,False,False,False,False,False,1,False
49997,49998,12,Los Angeles,06037,23095,4,married_with_children,0,Education,1,...,False,False,False,False,False,False,False,False,3,False
49998,49999,5,Los Angeles,06037,23095,4,married_with_children,0,Office/Tech,0,...,False,False,False,False,False,False,True,False,4,False


In [67]:
df.to_csv("synthetic_population.csv", index=False)