In [23]:
import numpy as np
import pandas as pd
# Simulate a synthetic right-censored time-to-event dataset for n subjects.
# The legacy global NumPy RNG is seeded once, so the ORDER of the np.random
# calls below determines every draw -- do not reorder them.
np.random.seed(42)
n = 1000

# --- Covariates: drawn independently, then clipped to a bounded range ---
age = np.random.normal(65, 10, n).clip(30, 90)
sex = np.random.binomial(1, 0.5, n)  # 0/1 indicator with p = 0.5
aps = np.random.normal(50, 15, n).clip(10, 100)  # Acute Physiology Score
comorbidities = np.random.poisson(2, n).clip(0, 5)
wblc = np.random.normal(7.5, 1.5, n).clip(2, 15)
income = np.random.normal(40000, 15000, n).clip(10000, 100000)

# sps is a noisy linear combination of aps and comorbidities, so it is
# correlated with both of them by construction.
sps = 0.6 * aps + 3.5 * comorbidities + np.random.normal(0, 10, n)
sps = sps.clip(0, 100)  # Keep it within a reasonable 0–100 range

# Linear predictor: it is exponentiated below to scale the baseline hazard.
# Every covariate raises the hazard except income, which lowers it.
risk_score = (
    0.03 * age
    + 0.04 * aps
    + 0.5 * comorbidities
    + 0.05 * sps
    + 0.1 * sex
    + 0.02 * wblc
    - 0.00005 * income
)

# `baseline_hazard` was read here without ever being assigned in this cell,
# which raises NameError on a fresh kernel (Restart & Run All). Keep a value
# that an earlier cell may already have defined; otherwise fall back.
# NOTE(review): 0.01 is an assumed placeholder -- TODO confirm the intended
# baseline hazard; it changes every simulated event time.
if "baseline_hazard" not in globals():
    baseline_hazard = 0.01

hazard = baseline_hazard * np.exp(risk_score)

# np.random.exponential is parameterised by scale = 1 / rate.
true_time = np.random.exponential(scale=1 / hazard, size=n)
censoring_time = np.random.exponential(scale=100, size=n)

# Observed follow-up is whichever of event / censoring comes first.
observed_time = np.minimum(true_time, censoring_time)
observed_time = np.maximum(observed_time, 1)  # clip min to 1 day

# The event flag is computed from the unclipped times, so a subject whose
# time was raised to the 1-day floor keeps its original event/censor status.
event_observed = (true_time <= censoring_time).astype(int)
# Assemble the simulated arrays into one table. The (name, values) pairs are
# listed in the order the columns should appear in the CSV.
simulated_df = pd.DataFrame(dict([
    ("age", age),
    ("sex", sex),
    ("aps", aps),
    ("sps", sps),
    ("num.co", comorbidities),
    ("wblc", wblc),
    ("income", income.astype(int)),  # truncated to whole units
    ("time_active", observed_time),
    ("dropout", event_observed),
]))

# Write the table to disk without the row index, then report the file name.
simulated_df.to_csv(path_or_buf="final_data.csv", index=False)
print(" Final dataset final_data.csv")




 Final dataset final_data.csv


In [21]:
# Read final_data.csv from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="final_data.csv")

In [22]:
# Preview the first five rows (5 is the default for head()).
df.head(n=5)

Unnamed: 0,age,sex,aps,sps,num.co,wblc,income,time_active,dropout
0,69.967142,0,45.361864,32.651604,1,7.724397,27757,1.93917,1
1,63.617357,0,38.717654,39.829244,1,7.646741,18174,1.0,1
2,71.476885,1,54.787618,39.129117,2,8.154009,44529,1.158443,1
3,80.230299,1,70.106757,47.21854,2,8.765759,31788,1.0,1
4,62.658466,0,21.872413,19.046743,0,8.088951,68029,11.446893,0
