In [None]:
import pandas as pd
import numpy as np

merged_data = pd.read_csv("merged_data (1).csv")
np.random.seed(42)
fips_codes = merged_data["FIPS"].values
patient_ids = [f"PT{str(fips)[-6:]}" for fips in fips_codes]

ages = []
for index, row in merged_data.iterrows():
    base_age = 45 if row["TractSeniors"] > row["TractKids"] else 38
    if row["MedianFamilyIncome"] < 40000:
        age_adjust = np.random.normal(5, 3) 
    else:
        age_adjust = np.random.normal(-2, 3)  
        
    age = max(18, min(95, base_age + age_adjust))
    ages.append(int(age))

genders = np.random.choice(["Male", "Female"], len(fips_codes), p=[0.45, 0.55])

has_diabetes = []
has_hypertension = []
has_obesity = []
has_asthma = []
has_heart_disease = []

for i, age in enumerate(ages):
    
    diabetes_prob = min(0.7, 0.05 + (age-40)*0.01)
    has_diabetes.append(np.random.random() < diabetes_prob)
   
    hypertension_prob = min(0.8, 0.1 + (age-40)*0.012)
    has_hypertension.append(np.random.random() < hypertension_prob)

    obesity_prob = min(0.6, 0.15 + (age-40)*0.005)
    has_obesity.append(np.random.random() < obesity_prob)
    
    asthma_prob = min(0.4, 0.1 + (40-age)*0.005 if age < 40 else 0.1)
    has_asthma.append(np.random.random() < asthma_prob)
    
    heart_disease_prob = min(0.6, 0.02 + (age-50)*0.01 if age > 50 else 0.02)
    has_heart_disease.append(np.random.random() < heart_disease_prob)

healthcare_visits = []
for i, age in enumerate(ages):
    base_visits = 4  
    
    age_factor = max(0, (age - 40) / 20)
    chronic_factor = (has_diabetes[i] + has_hypertension[i] + has_obesity[i] + 
                     has_asthma[i] + has_heart_disease[i]) * 1.5
    
    visits = max(0, np.random.poisson(base_visits + age_factor + chronic_factor))
    healthcare_visits.append(visits)

bmi_values = []
cholesterol_levels = []
blood_pressure_systolic = []
blood_pressure_diastolic = []

for i, age in enumerate(ages):
    if has_obesity[i]:
        bmi = np.random.normal(32, 3)
    else:
        bmi = np.random.normal(25, 3)
    bmi_values.append(round(max(18, min(45, bmi)), 1))
   
    base_cholesterol = 180 + (age / 4)
    cholesterol = np.random.normal(base_cholesterol, 20)
    cholesterol_levels.append(round(max(120, min(300, cholesterol))))
    
    base_systolic = 120 + (age / 3)
    if has_hypertension[i]:
        base_systolic += 15
    systolic = np.random.normal(base_systolic, 10)
    blood_pressure_systolic.append(round(max(90, min(180, systolic))))
 
    base_diastolic = 80
    if has_hypertension[i]:
        base_diastolic += 8
    diastolic = np.random.normal(base_diastolic, 7)
    blood_pressure_diastolic.append(round(max(60, min(120, diastolic))))

blood_types = np.random.choice(["A+", "A-", "B+", "B-", "AB+", "AB-", "O+", "O-"], 
                              len(fips_codes), p=[0.32, 0.06, 0.09, 0.02, 0.03, 0.01, 0.38, 0.09])

smoking_status = []
for age in ages:
    if age < 30:
        smoking_status.append(np.random.choice(["Never", "Current", "Former"], p=[0.6, 0.25, 0.15]))
    elif age < 50:
        smoking_status.append(np.random.choice(["Never", "Current", "Former"], p=[0.5, 0.2, 0.3]))
    else:
        smoking_status.append(np.random.choice(["Never", "Current", "Former"], p=[0.4, 0.15, 0.45]))

patients_df = pd.DataFrame({
    "Patient_ID": patient_ids,
    "FIPS": fips_codes,
    "Age": ages,
    "Gender": genders,
    "Blood_Type": blood_types,
    "Smoking_Status": smoking_status,
    "Diabetes": has_diabetes,
    "Hypertension": has_hypertension,
    "Obesity": has_obesity,
    "Asthma": has_asthma,
    "Heart_Disease": has_heart_disease,
    "BMI": bmi_values,
    "Cholesterol": cholesterol_levels,
    "BP_Systolic": blood_pressure_systolic,
    "BP_Diastolic": blood_pressure_diastolic,
    "Healthcare_Visits_Last_Year": healthcare_visits
})

patients_df.to_csv("patients_dataset.csv", index=False)

print(f"Patients dataset created with {len(patients_df)} records (same as merged dataset)")
print(patients_df.head())
print("\nDataset summary:")
print(patients_df.describe())

print("\nCondition prevalence:")
print(f"Diabetes: {patients_df['Diabetes'].mean():.2%}")
print(f"Hypertension: {patients_df['Hypertension'].mean():.2%}")
print(f"Obesity: {patients_df['Obesity'].mean():.2%}")
print(f"Asthma: {patients_df['Asthma'].mean():.2%}")
print(f"Heart Disease: {patients_df['Heart_Disease'].mean():.2%}")

Patients dataset created with 60352 records (same as merged dataset)
  Patient_ID        FIPS  Age  Gender Blood_Type Smoking_Status  Diabetes  \
0   PT020100  1001020100   37    Male         O+          Never     False   
1   PT020200  1001020200   35    Male         B+          Never     False   
2   PT020300  1001020300   37  Female         O+        Current     False   
3   PT020400  1001020400   40  Female         O+         Former      True   
4   PT020600  1001020600   35    Male         O+         Former     False   

   Hypertension  Obesity  Asthma  Heart_Disease   BMI  Cholesterol  \
0         False     True   False          False  36.7          211   
1         False    False   False          False  24.2          180   
2         False    False   False          False  27.6          180   
3         False    False   False          False  19.2          184   
4         False     True   False          False  37.3          218   

   BP_Systolic  BP_Diastolic  Healthcare_Visits

In [4]:
patients_df.head()

Unnamed: 0,Patient_ID,FIPS,Age,Gender,Blood_Type,Smoking_Status,Diabetes,Hypertension,Obesity,Asthma,Heart_Disease,BMI,Cholesterol,BP_Systolic,BP_Diastolic,Healthcare_Visits_Last_Year
0,PT020100,1001020100,37,Male,O+,Never,False,False,True,False,False,36.7,211,141,76,4
1,PT020200,1001020200,35,Male,B+,Never,False,False,False,False,False,24.2,180,131,78,3
2,PT020300,1001020300,37,Female,O+,Current,False,False,False,False,False,27.6,180,144,69,3
3,PT020400,1001020400,40,Female,O+,Former,True,False,False,False,False,19.2,184,128,73,2
4,PT020600,1001020600,35,Male,O+,Former,False,False,True,False,False,37.3,218,158,83,2


In [7]:
patients_df.shape   

(60352, 14)