In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats

def generate_patient_vitals(id, n, t=3):
    """Simulate ``n`` vital-sign readings for a single patient.

    Every vital is drawn from a normal distribution truncated at ``t``
    standard deviations on either side of its mean, then rounded to two
    decimals.

    Parameters
    ----------
    id : int
        Patient identifier written to the ``'Patient ID'`` column. It is also
        the random seed, so the same ``id`` always produces the same readings.
        (The name shadows the ``id`` builtin; it is kept so existing keyword
        callers continue to work.)
    n : int
        Number of readings (rows) to generate.
    t : float, default 3
        Truncation bound, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows with the columns ``'Patient ID'``, ``'Heart Rate'``,
        ``'Systolic BP'``, ``'Diastolic BP'``, ``'Temperature'``,
        ``'Respiration Rate'`` and ``'SpO2'``.
    """
    # A private generator seeded with the patient id yields the same draws as
    # seeding NumPy's global RNG would, but does not reset that global state
    # for every other piece of code running in the session.
    rng = np.random.RandomState(id)

    def draw(mean, sd):
        # n truncated-normal samples around `mean`, rounded to 2 decimals.
        samples = stats.truncnorm.rvs(
            -t, t, loc=mean, scale=sd, size=n, random_state=rng
        )
        return np.round(samples, 2)

    # The draw order below is fixed: changing it changes which random numbers
    # each vital receives for a given seed.
    heart_rate = draw(80, 5)            # bpm
    systolic_bp = draw(105, 5)          # mmHg
    diastolic_bp = draw(70, 4)          # mmHg
    temperature = draw(37, 0.15)        # °C
    respiration_rate = draw(17, 1.3)    # breaths per minute
    # SpO2 is a percentage, so cap it at 100 (only reachable for large t).
    spo2 = np.minimum(draw(98, 0.65), 100)

    return pd.DataFrame({
        'Patient ID': id,
        'Heart Rate': heart_rate,
        'Systolic BP': systolic_bp,
        'Diastolic BP': diastolic_bp,
        'Temperature': temperature,
        'Respiration Rate': respiration_rate,
        'SpO2': spo2
    })

In [2]:
import json

# Processing address dataset. The file is read as newline-delimited JSON:
# each non-empty line is parsed on its own as one feature object.
geojson_file_path = '../ma_addresses/city_of_boston-addresses-city.geojson'

# Extract relevant information
addresses = []
# Explicit UTF-8 so decoding does not depend on the platform's locale encoding.
with open(geojson_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Blank lines (e.g. a trailing newline) are not records; skip them
        # instead of reporting them as JSON errors.
        if not line.strip():
            continue

        # Keep the try body limited to the parse so unrelated errors in the
        # formatting code below are not mistaken for bad JSON.
        try:
            data = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            continue

        # `"properties": null` is legal GeoJSON; treat it like a missing key.
        properties = data.get("properties") or {}
        number = properties.get("number", "")
        street = properties.get("street", "")
        postcode = properties.get("postcode", "")

        # Create consolidated address, e.g. "7 A Street, 02136"
        addresses.append(f"{number} {street}, {postcode}")

# Create a DataFrame with a single "address" column
address_df = pd.DataFrame({"address": addresses})

# Checking Addresses
address_df.head()

Unnamed: 0,address
0,"6-10 A Street, 02136"
1,"7 A Street, 02136"
2,"10 A Street, 02127"
3,"172-174 A Street, 02210"
4,"176-178 A Street, 02210"


In [3]:
# Number of synthetic patients to generate; later cells size their samples and
# loops from this value.
Patient_count = 500_000

In [6]:
# Build the vitals table that the later `patient_df` cells read; without this
# those cells fail on a fresh kernel. The saved `describe()` output in this
# notebook shows 1,000 rows for patient ids 1-10 (10 patients x 100 readings),
# so that size is reproduced here rather than looping over Patient_count,
# which would produce 50 million rows. The per-patient frames are collected in
# a list and concatenated once instead of growing a DataFrame inside a loop.
patient_df = pd.concat(
    [generate_patient_vitals(i, 100) for i in range(1, 11)],
    ignore_index=True,
)

# One address per patient, drawn with replacement. `random_state` pins the
# draw so a re-run reproduces the same patient/address pairing, and
# `reset_index(drop=True)` discards the sampled row labels, leaving a clean
# 0..N-1 index.
address_df_sample = address_df.sample(
    n=Patient_count, replace=True, random_state=42
).reset_index(drop=True)

In [7]:
# Sanity check: the number of non-null sampled addresses should equal Patient_count.
address_df_sample.count()

address    500000
dtype: int64

In [8]:
from faker import Faker
import random

# Seed both random sources used by this cell so the generated patient table is
# reproducible across kernel restarts: Faker's shared generator (names) and
# the stdlib `random` module (phone numbers, wards, ages).
Faker.seed(42)
random.seed(42)

fake = Faker()

def generate_random_phone_number():
    """Return a random phone number string in the Boston 617 area code.

    The format is ``"(617) XXX-XXXX"``. The three-digit exchange is drawn
    from 200-999 and the four-digit line number from 0000-9999 (zero-padded).
    """
    area_code = "617"
    exchange = random.randint(200, 999)
    # Draw from the full 0-9999 range. The previous lower bound of 1000 meant
    # the :04d zero-padding never applied and no line number could start with 0.
    line_number = random.randint(0, 9999)
    return f"({area_code}) {exchange:03d}-{line_number:04d}"

def generate_random_ward_number():
    """Return a random ward number between 1 and 20, both inclusive."""
    # randrange excludes its stop value, so 21 keeps ward 20 reachable.
    ward = random.randrange(1, 21)
    return ward

def generate_random_age():
    """Return a random patient age between 18 and 75, both inclusive."""
    # randrange excludes its stop value, so 76 keeps age 75 reachable.
    age = random.randrange(18, 76)
    return age

# Build one (name, phone, ward, age) record per patient. Tuple elements are
# evaluated left to right, so each patient draws name -> phone -> ward -> age
# before the next patient starts.
patient_rows = [
    (
        fake.name(),
        generate_random_phone_number(),
        generate_random_ward_number(),
        generate_random_age(),
    )
    for _ in range(Patient_count)
]

# Split the records into one list per column.
names = [row[0] for row in patient_rows]
phnumber = [row[1] for row in patient_rows]
ward_list = [row[2] for row in patient_rows]
age_list = [row[3] for row in patient_rows]

# Patient ids are consecutive integers starting at 1.
patient_id_list = list(range(1, Patient_count + 1))

detail_df = pd.DataFrame(
    {
        "patientId": patient_id_list,
        "patient_name": names,
        "phone_number": phnumber,
        "age": age_list,
        "admitted_ward": ward_list,
    }
)
detail_df

Unnamed: 0,patientId,patient_name,phone_number,age,admitted_ward
0,1,Calvin Powell,(617) 665-9737,54,11
1,2,Lauren Sanchez,(617) 674-9156,59,6
2,3,Andrew Jones,(617) 957-4838,23,20
3,4,Vicki Phillips,(617) 735-3027,67,16
4,5,Jessica Krueger,(617) 392-1687,50,12
...,...,...,...,...,...
499995,499996,Elizabeth Lambert,(617) 677-4388,47,6
499996,499997,Deanna Rivera,(617) 291-9900,73,9
499997,499998,Martha Keith,(617) 955-7046,68,5
499998,499999,Nicole Hanson,(617) 302-3072,71,6


In [9]:
# Attach one sampled address to every patient row. Both frames carry a default
# 0..N-1 index, so an index-aligned left join pairs them row by row.
patient_info_df = detail_df.join(other=address_df_sample, how="left")
patient_info_df

Unnamed: 0,patientId,patient_name,phone_number,age,admitted_ward,address
0,1,Calvin Powell,(617) 665-9737,54,11,"37-39 Chickatawbut Street, 02122"
1,2,Lauren Sanchez,(617) 674-9156,59,6,"835 Huntington Avenue, 02115"
2,3,Andrew Jones,(617) 957-4838,23,20,"17 Grovenor Road, 02130"
3,4,Vicki Phillips,(617) 735-3027,67,16,"8 Navillus Terrace, 02122"
4,5,Jessica Krueger,(617) 392-1687,50,12,"20 Cummins Highway, 02131"
...,...,...,...,...,...,...
499995,499996,Elizabeth Lambert,(617) 677-4388,47,6,"526 Columbus Avenue, 02118"
499996,499997,Deanna Rivera,(617) 291-9900,73,9,"20 Tampa Street, 02126"
499997,499998,Martha Keith,(617) 955-7046,68,5,"268 Fairmount Avenue, 02136"
499998,499999,Nicole Hanson,(617) 302-3072,71,6,"126 W Newton Street, 02118"


In [164]:
# List the column names of the vitals table.
patient_df.columns

Index(['Patient ID', 'Heart Rate', 'Systolic BP', 'Diastolic BP',
       'Temperature', 'Respiration Rate', 'SpO2'],
      dtype='object')

In [165]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
patient_df.describe()

Unnamed: 0,Patient ID,Heart Rate,Systolic BP,Diastolic BP,Temperature,Respiration Rate,SpO2
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,5.5,79.89921,104.8537,70.00163,37.00314,17.02041,97.97008
std,2.873719,4.717041,5.015229,3.871222,0.148834,1.27567,0.67272
min,1.0,65.12,90.68,58.31,36.6,13.31,96.06
25%,3.0,76.915,101.3275,67.4075,36.91,16.19,97.5
50%,5.5,79.945,105.04,70.045,37.0,17.03,97.97
75%,8.0,82.9125,108.38,72.76,37.1,17.89,98.44
max,10.0,92.25,118.25,80.5,37.43,20.6,99.87


In [166]:
# pandas does not create missing parent directories, so make sure the output
# folder exists before writing the vitals CSV (row index omitted).
import os

os.makedirs("patient_vitals_CSV", exist_ok=True)
patient_df.to_csv("patient_vitals_CSV/patient_vitals.csv", index=False)


In [10]:
# Write the joined patient details + address table to CSV in the working
# directory, omitting the row index.
patient_info_df.to_csv("patient_info.csv", index=False)