# Evaluation Data Generator

Due to HIPAA, I am not able to perform this project with live patient data. Instead, I will create a large synthetic dataset using logic and probability. For example, even though my data is random, it makes sense that a patient who was unable to perform supine to sitting is also not able to ambulate. I will use the same kind of logic to develop functions that 'guess' length-of-stay and discharge location, and thus the need for rehabilitation following the acute care stay.

It is important to note that I use only a select few features to predict length-of-stay and need for rehab. These features are:
* Ambulation Distance
* Number of falls
* Pain
* Type of fusion

We will be able to analyze the effects of each of these during the machine learning section of this project.

In [1]:
import numpy as np
import pandas as pd

# Shared NumPy Generator used for the gamma draws in the cells below.
# It is created without a seed, so each run of the notebook yields different data.
rng = np.random.default_rng()

In [2]:
# Age, sex, fusion type, and provider: one independent draw per patient
n_patients = 1000
age_sample = np.random.normal(loc=65, scale=9, size=n_patients).astype(int)
sex_sample = np.random.choice(['female', 'male'], size=n_patients)
fusion_sample = np.random.choice(['cervical', 'lumbar'], size=n_patients)
provider_sample = np.random.choice(
    ['Nolan', 'Myers', 'Kuzak', 'Woo', 'Smith'], size=n_patients
)

# The braces that can be drawn depend on the type of fusion performed
brace_options = {
    'cervical': ['none', 'aspen'],
    'lumbar': ['none', 'tlso', 'lso'],
}
brace_sample = [
    str(np.random.choice(brace_options[fusion_type]))
    for fusion_type in fusion_sample
]

# Convert the NumPy string scalars to plain Python str, stored as lists
sex_sample = list(map(str, sex_sample))
fusion_sample = list(map(str, fusion_sample))
provider_sample = list(map(str, provider_sample))

# Pain score: integers 0-10, each equally likely
pain_sample = np.random.choice(np.arange(11), size=1000)

# Prior location: integer codes 0-7 drawn with the weights below (one per code, in order)
prior_loc_weights = [0.3, 0.2, 0.1, 0.12, 0.1, 0.1, 0.05, 0.03]
prior_loc_sample = np.random.choice(np.arange(8), size=1000, p=prior_loc_weights)

# Stairs to enter, driven by prior location: codes 0-2 get a truncated
# gamma(shape=1, scale=2) draw; the remaining codes (3-7) always get 0 steps.
ste_sample = [
    int(rng.gamma(shape=1, scale=2, size=None)) if prior_loc in (0, 1, 2) else 0
    for prior_loc in prior_loc_sample
]

# Handrails, driven by steps to enter: with two or more steps there is always
# at least one rail (1 or 2, equally likely); with fewer steps, 0/1/2 rails are
# drawn with probabilities 0.5/0.4/0.1.
hr_sample = [
    np.random.choice([1, 2], size=None)
    if n_steps >= 2
    else np.random.choice([0, 1, 2], p=[0.5, 0.4, 0.1], size=None)
    for n_steps in ste_sample
]

# Assistive device: one of three integer codes, each equally likely
ad_codes = [0, 1, 2]
ad_sample = np.random.choice(ad_codes, size=1000)

# Number of falls, driven by prior location. Each prior-location group selects
# the gamma parameters; the draw is then truncated to a whole number of falls.
num_falls_sample = []
for prior_loc in prior_loc_sample:
    if prior_loc in (0, 1, 2):
        fall_shape, fall_scale = 1, 1
    elif prior_loc in (3, 4):
        fall_shape, fall_scale = 1, 1.5
    else:
        fall_shape, fall_scale = 2, 2.5
    num_falls_sample.append(int(rng.gamma(shape=fall_shape, scale=fall_scale, size=None)))


In [3]:
# Mobility samples based on fusion type, prior location, prior mobility performance, and pain.
# All six result lists are created here; the loops that follow fill them in order.
sup_sit_sample, sit_stand_sample, amb_assist_sample = [], [], []
amb_distance_sample, stairs_assist_sample, num_stairs_sample = [], [], []

# Supine-to-sit: probabilities for scores 0-4, selected by pain band
# ('high' means pain > 7) and patient group. Group precedence matches the
# lookup below: prior location 5-7 first, then lumbar fusion, then everyone else.
sup_sit_weights = {
    'high': {
        'prior_loc_5_7': [0.05, 0.05, 0.35, 0.30, 0.25],
        'lumbar': [0.05, 0.15, 0.35, 0.30, 0.15],
        'other': [0.10, 0.30, 0.40, 0.10, 0.10],
    },
    'low': {
        'prior_loc_5_7': [0.05, 0.15, 0.30, 0.35, 0.15],
        'lumbar': [0.25, 0.25, 0.30, 0.15, 0.05],
        'other': [0.35, 0.45, 0.10, 0.05, 0.05],
    },
}

for patient, fusion_type in enumerate(fusion_sample):
    pain_band = 'high' if pain_sample[patient] > 7 else 'low'
    if prior_loc_sample[patient] in (5, 6, 7):
        group = 'prior_loc_5_7'
    elif fusion_type == 'lumbar':
        group = 'lumbar'
    else:
        group = 'other'
    sup_sit_sample.append(
        np.random.choice([0, 1, 2, 3, 4], p=sup_sit_weights[pain_band][group])
    )

# Sit-to-stand: depends on the patient's supine-to-sit score, pain, prior
# location, and fusion type.
#
# Probabilities for scores 0-4, selected by pain band ('high' means pain > 7)
# and patient group (prior location 5-7 first, then lumbar fusion, then
# everyone else).
sit_stand_weights = {
    'high': {
        'prior_loc_5_7': [0.05, 0.10, 0.35, 0.30, 0.20],
        'lumbar': [0.05, 0.15, 0.35, 0.30, 0.15],
        'other': [0.20, 0.25, 0.25, 0.20, 0.10],
    },
    'low': {
        'prior_loc_5_7': [0.05, 0.10, 0.40, 0.3, 0.15],
        'lumbar': [0.10, 0.20, 0.40, 0.25, 0.05],
        'other': [0.30, 0.40, 0.20, 0.05, 0.05],
    },
}

for index, fusion in enumerate(fusion_sample):
    # A supine-to-sit score of 4 carries forward to sit-to-stand regardless of
    # pain. The check must index this patient's score: comparing the whole list
    # to 4 (`sup_sit_sample == 4`) is always False, so the rule never fired.
    if sup_sit_sample[index] == 4:
        sit_stand_sample.append(4)
        continue

    pain_band = 'high' if pain_sample[index] > 7 else 'low'
    if prior_loc_sample[index] in (5, 6, 7):
        group = 'prior_loc_5_7'
    elif fusion == 'lumbar':
        group = 'lumbar'
    else:
        group = 'other'
    sit_stand_sample.append(
        np.random.choice([0, 1, 2, 3, 4], p=sit_stand_weights[pain_band][group])
    )

# Ambulation assistance: a sit-to-stand score of 4 carries forward unchanged;
# otherwise scores 0-4 are drawn with weights chosen by prior location (5-7),
# then lumbar fusion, then everyone else.
for patient, fusion_type in enumerate(fusion_sample):
    if sit_stand_sample[patient] == 4:
        assist_level = 4
    else:
        if prior_loc_sample[patient] in (5, 6, 7):
            assist_weights = [0.02, 0.3, 0.4, 0.1, 0.18]
        elif fusion_type == 'lumbar':
            assist_weights = [0.1, 0.3, 0.3, 0.25, 0.05]
        else:
            assist_weights = [0.3, 0.4, 0.2, 0.05, 0.05]
        assist_level = np.random.choice([0, 1, 2, 3, 4], p=assist_weights)
    amb_assist_sample.append(assist_level)

# Ambulation distance: 0 when the ambulation-assist score is 4; otherwise a
# gamma draw truncated to an integer, with (shape, scale) chosen by prior
# location, assist score, and fusion type.
for patient, fusion_type in enumerate(fusion_sample):
    assist_level = amb_assist_sample[patient]
    if assist_level == 4:
        amb_distance_sample.append(0)
        continue

    if prior_loc_sample[patient] in (5, 6, 7):
        # Within this group, assist scores above 1 draw from the shorter distribution
        dist_shape, dist_scale = (1, 5) if assist_level > 1 else (2, 15)
    elif fusion_type == 'lumbar':
        dist_shape, dist_scale = 2, 20
    else:
        dist_shape, dist_scale = 3, 30
    amb_distance_sample.append(int(rng.gamma(shape=dist_shape, scale=dist_scale, size=None)))

# Stairs assistance: a random score is drawn only when the ambulation-assist
# score is 0 or 1, the prior location is 0-2, and at least 25 was ambulated;
# every other patient is assigned 4.
for patient, fusion_type in enumerate(fusion_sample):
    eligible = (
        amb_assist_sample[patient] not in (2, 3, 4)
        and prior_loc_sample[patient] not in (3, 4, 5, 6, 7)
        and amb_distance_sample[patient] >= 25
    )
    if not eligible:
        stairs_assist_sample.append(4)
        continue

    if fusion_type == 'lumbar':
        stairs_weights = [0.25, 0.25, 0.1, 0.05, 0.35]
    else:
        stairs_weights = [0.3, 0.3, 0.1, 0.05, 0.25]
    stairs_assist_sample.append(np.random.choice([0, 1, 2, 3, 4], p=stairs_weights))

# Number of stairs performed: 0 when the stairs-assist score is 4 or the home
# has no steps to enter; otherwise a uniform draw from 1 up to the patient's
# steps to enter (inclusive).
for stairs_level, home_steps in zip(stairs_assist_sample, ste_sample):
    if stairs_level != 4 and home_steps != 0:
        num_stairs_sample.append(np.random.choice(range(1, home_steps + 1), size=None))
    else:
        num_stairs_sample.append(0)

For the labels, if a patient was admitted from a SNF, inpatient rehabilitation hospital, or LTC, they have a 100% likelihood of discharging back to that location.

In [4]:
# Function to guess length of stay is based on fusion type, ambulation distance, pain, and number of falls in the last 6 months


In [5]:
# Function to guess discharge location is based on fusion type, ambulation distance, pain, and number of falls in the last 6 months


In [6]:
# Assemble the generated samples into one DataFrame, one column per feature.
# The mapping is named `sample_columns` rather than `dict` so that the builtin
# `dict` type is not shadowed for the rest of the notebook session.
sample_columns = {
    'age': age_sample,
    'sex': sex_sample,
    'fusion': fusion_sample,
    'provider': provider_sample,
    'brace': brace_sample,
    'pain': pain_sample,
    'prior_loc': prior_loc_sample,
    'ste': ste_sample,
    'hr': hr_sample,
    'ad': ad_sample,
    'num_falls': num_falls_sample,
    'sup_sit': sup_sit_sample,
    'sit_stand': sit_stand_sample,
    'amb_assist': amb_assist_sample,
    'amb_distance': amb_distance_sample,
    'stairs_assist': stairs_assist_sample,
    'num_stairs': num_stairs_sample,
}

df = pd.DataFrame(sample_columns)

## Finishing Touches

We can refine our label, discharge location, a bit further. We are most interested in whether the patient requires rehabilitation following discharge, so we can create a new label column, 'need_rehab', which stores this information in a binary format.

It may also be useful to categorize the pain scale into three bins: mild, moderate, and severe. This is because the difference between adjacent pain levels is not necessarily linear, and we may expect that severe pain (7+) would increase length-of-stay and/or the need for rehabilitation.

In [7]:
from clean_predict import guess_los, guess_dc_loc, categorize_pain, calc_rehab

# Label columns. Both guesses are computed before 'pain' is replaced by its
# category, so they receive the raw pain score.
los_inputs = ['fusion', 'pain', 'amb_distance', 'num_falls']
dc_loc_inputs = ['prior_loc', 'fusion', 'pain', 'amb_distance', 'num_falls']

df['los'] = df.apply(
    lambda row: guess_los(*(row[col] for col in los_inputs)),
    axis=1,
)
df['dc_loc'] = df.apply(
    lambda row: guess_dc_loc(*(row[col] for col in dc_loc_inputs)),
    axis=1,
)

# Replace the pain score with its category, then derive the rehab label
# from the guessed discharge location.
df['pain'] = df['pain'].apply(categorize_pain)
df['need_rehab'] = df['dc_loc'].apply(calc_rehab)

In [8]:
# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,age,sex,fusion,provider,brace,pain,prior_loc,ste,hr,ad,num_falls,sup_sit,sit_stand,amb_assist,amb_distance,stairs_assist,num_stairs,los,dc_loc,need_rehab
0,97,male,cervical,Nolan,none,severe,3,0,0,2,3,1,1,0,96,4,0,4.8,6,1
1,67,male,cervical,Smith,aspen,moderate,0,0,0,2,0,2,2,2,89,4,0,4.1,0,0
2,51,female,cervical,Woo,none,moderate,1,1,1,1,0,1,2,0,153,4,0,2.2,1,0
3,64,male,lumbar,Kuzak,none,severe,4,0,0,0,0,2,3,2,28,4,0,7.5,6,1
4,62,female,cervical,Kuzak,none,mild,3,0,0,2,3,0,1,1,25,4,0,6.6,6,1


In [9]:
# Write the generated dataset to disk as comma-separated UTF-8 without the row index
output_path = 'PTDataForML.csv'
df.to_csv(output_path, index=False, encoding='utf-8')