## This notebook generates LOS models for NH facilityfacts files ##

In [None]:
import os
import pandas as pd
from IPython.display import display, HTML
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import expon, lognorm
import scipy.optimize as op
from stats import fullCRVFromPDFModel, CachedCDFGenerator
import tools_util as tu
import pyrheautils as pu

## Load the model ##

In [None]:
# Run description YAML that defines the model.  The default is the original
# author's local checkout; set PYRHEA_RUN_DESC in the environment to point the
# notebook at a different run description without editing this cell.
runDesc = os.environ.get('PYRHEA_RUN_DESC',
                         '/home/welling/git/pyRHEA_github/src/sim/twoyear_allfac_OC.yaml')
inputDict = tu.readModelInputs(runDesc)
# Presumably registers the path keys (e.g. $(MODELDIR), $(SIMDIR)) that the
# pu.pathTranslate() calls in later cells expand -- confirm in pyrheautils.
pu.prepPathTranslations(inputDict)
# Facility records keyed by facility abbreviation.
facDict = tu.getFacDict(inputDict)


## Load the line list data ##

In [None]:
# Locate and read the nursing-home LOS line list that lives in the model directory.
lineListPath = pu.pathTranslate('$(MODELDIR)/'
                                'OC_Nursing_Home_LOS_Line-Lists_for_RHEA_2.0_-_2011-2015_-_Adult_Only_-_05-21-2019_UPDATE_v2_NH_LOS_Line_List.csv')
lineDF = pd.read_csv(lineListPath)
print(lineDF.columns)
# Keep only the rows whose 'NOT YET DISCHARGED?' field is empty.
stillPresent = lineDF['NOT YET DISCHARGED?'].notnull()
lineDF = lineDF[~stillPresent]
display(lineDF.head())

## Some Useful Functions ##

Also, make the plots big enough to see

In [None]:
# Have the inline backend render figures with bbox_inches=None
# (presumably so figures are not cropped to a tight bounding box and keep the
# size configured below -- confirm against the IPython inline-backend docs).
%config InlineBackend.print_figure_kwargs = {'bbox_inches':None}
# Default figure size (width, height) in inches for every plot in this notebook.
plt.rcParams["figure.figsize"] = [16, 10]

In [None]:
def pltCRV(crv, scale=1.0, label=None, xlim=None):
    """
    Draw the PDF of crv on the current matplotlib axes and mark its mean.

    crv:   object providing mean() and pdf(array)
    scale: multiplier applied to the PDF values before plotting
    label: optional legend label for the curve; omitted when None
    xlim:  upper end of the sampled x range; when None the range ends at
           round(3 * mean)

    The curve is sampled at 101 evenly spaced points starting at 0.0.  The
    mean is drawn as a '*' marker on the x axis and is also printed.
    """
    meanVal = crv.mean()
    upper = round(3*meanVal) if xlim is None else xlim
    xV = np.linspace(0.0, upper, 101)
    yV = scale * crv.pdf(xV)
    # Only pass label= through when the caller actually supplied one.
    labelKW = {} if label is None else {'label': label}
    plt.plot(xV, yV, '-', **labelKW)
    plt.plot([meanVal], [0.0], '*')
    print(meanVal)


In [None]:
def chisqr(crv, samps, highBound):
    """
    Measure the mismatch between observed LOS counts and the counts predicted by crv.

    crv:       object providing cdf(x) and pdf(array)
    samps:     DataFrame with 'RAW LOS (days)' and 'NH CODE' columns
    highBound: number of one-day bins; bin k collects samples whose LOS truncates to k

    Returns the square root of the summed squared differences between the
    per-day sample histogram and the scaled PDF evaluated at bin midpoints.
    (This is an unnormalized root-sum-of-squares, not a weighted chi-square.)
    """
    # Histogram of observed stays, one bin per whole day below highBound.
    obsV = np.zeros(highBound)
    for rawLOS in samps['RAW LOS (days)']:
        dayBin = int(rawLOS)
        if dayBin < highBound:
            obsV[dayBin] += 1

    # Scale the PDF so that its mass up to highBound matches the sample count.
    # Note the count uses '<=' whereas the histogram above uses '<'.
    inRange = samps['RAW LOS (days)'] <= highBound
    totSamps = samps.loc[inRange, 'NH CODE'].count()
    scale = totSamps / crv.cdf(highBound)

    # Evaluate the PDF at the middle of each one-day bin: 0.5, 1.5, ...
    midV = np.arange(highBound) + 0.5
    residV = obsV - crv.pdf(midV) * scale
    return np.sqrt(np.square(residV).sum())

def buildLOSModel(estV, disch, tto, meanPop):
    """
    Assemble the LOS model dict for one facility.

    estV:    two-element sequence holding the lognormal mu and sigma
    disch:   total discharges
    tto:     total transfers out
    meanPop: mean population of the facility

    Returns a dict with 'parms' = [alpha, mu, sigma, ttoRate] and a 'pdf'
    template string mixing a lognormal term (weight alpha) with an
    exponential term (weight 1 - alpha).  alpha is the fraction of discharges
    that are not transfers out; ttoRate is tto / (365 * meanPop).
    """
    assert len(estV) == 2, 'was expecting two parameters?'
    mu, sigma = estV
    # Weight of the lognormal term: share of discharges that were not transfers.
    alpha = float(disch - tto) / float(disch)
    # Rate of the exponential term (presumably per resident-day, with tto an
    # annual count -- confirm against the facility data definitions).
    ttoRate = float(tto) / (365. * meanPop)
    return {
        'parms': [alpha, mu, sigma, ttoRate],
        'pdf': "$0*lognorm(mu=$1,sigma=$2)+(1-$0)*expon(lambda=$3)",
    }


def calcChiSqr(estV, disch, tto, meanPop, sampDF):
    """
    Objective function for the optimizer: score one candidate (mu, sigma).

    estV:    two-element sequence holding the candidate lognormal mu and sigma
    disch:   total discharges for the facility
    tto:     total transfers out for the facility
    meanPop: mean population of the facility
    sampDF:  DataFrame of line-list samples for the facility being fitted

    Builds the LOS model for the candidate parameters, converts it to a CRV
    and returns chisqr() of that CRV against sampDF over the first 365 days.
    """
    losModel = buildLOSModel(estV, disch, tto, meanPop)
    crv = fullCRVFromPDFModel(losModel)
    # Score against the samples the caller passed in.  This used to read the
    # notebook-global facLOSDF and silently ignore sampDF, which only worked
    # when the caller happened to have set that global to the same frame.
    return chisqr(crv, sampDF, 365)




In [None]:
def estLogNormParms(losDF):
    """
    Estimate lognormal parameters from the raw LOS samples.

    losDF: DataFrame with a 'RAW LOS (days)' column

    Returns [mu, sigma], the mean and (population) standard deviation of the
    natural log of the strictly positive LOS values.  Zero, negative and NaN
    values are left out: log(0) is -inf and would turn the estimate into
    [-inf, nan], which is useless as a starting point for the optimizer.
    If no positive samples remain, both estimates are NaN.
    """
    losV = losDF['RAW LOS (days)'].values
    # The comparison is False for NaN, so missing values are dropped here too.
    lnSampV = np.log(losV[losV > 0])
    mu = np.average(lnSampV)
    sigma = np.std(lnSampV)
    return [mu, sigma]
    

## Estimate fit values for all NHs ##

In [None]:
# Fit one LOS model per nursing home.  fitD and abbrevL are read by later cells.
fitD = {}
abbrevL = [abbrev
           for abbrev, rec in facDict.items()
           if rec['category']=='NURSINGHOME']
for abbrev in abbrevL:
    # Line-list samples for this facility and the starting guess derived from them.
    facLOSDF = lineDF[lineDF['NH CODE'] == abbrev]
    guessV = estLogNormParms(facLOSDF)

    # Facility-level totals that fix the mixture weight and the exponential rate.
    facRec = facDict[abbrev]
    tto = sum((elt['count']['value'] for elt in facRec['totalTransfersOut']), 0.0)
    meanPop = facRec['meanPop']['value']
    disch = facRec['totalDischarges']['value']

    # Optimize only (mu, sigma); everything else is passed through as fixed args.
    rslt = op.minimize(calcChiSqr, guessV, method='Nelder-Mead',
                       args=(disch, tto, meanPop, facLOSDF))
    fitLOSModel = buildLOSModel(rslt.x, disch, tto, meanPop)

    nLines = facLOSDF.count()['NH CODE']
    withinYear = facLOSDF['RAW LOS (days)'] <= 365
    nSamps = facLOSDF[withinYear]['NH CODE'].count()

    # Attach fit diagnostics to the model record.
    fitLOSModel['nsamples'] = nSamps
    fitLOSModel['chisqr_per_sample'] = rslt.fun / nSamps
    fitLOSModel['converged'] = rslt.success
    fitD[abbrev] = fitLOSModel
    print('%s: %s' % (abbrev, fitLOSModel))


## Plot a random sampling as examples ##

In [None]:
# Pick up to 10 distinct nursing homes; sampling without replacement avoids
# plotting the same facility twice.
showL = np.random.choice(abbrevL, size=min(10, len(abbrevL)), replace=False)
print(showL)

for abbrev in showL:
    facModel = fitD[abbrev]
    fitCRV = fullCRVFromPDFModel(facModel)
    facLOSDF = lineDF[lineDF['NH CODE'] == abbrev]
    # One-day bins over the first 300 days.
    plt.hist(facLOSDF['RAW LOS (days)'], range=(0,300), bins=300, label='valid line-list')
    nLines = facLOSDF.count()['NH CODE']
    # NOTE(review): nLines counts every row for the facility, whereas chisqr()
    # scales by the number of stays <= 365 days (stored as 'nsamples' in fitD);
    # confirm which normalization is intended for the overlay.
    pltCRV(fitCRV, scale=nLines/fitCRV.cdf(365.), label='result of fitting', xlim=300)
    plt.legend()
    # Title with THIS facility's parameters.  The title used to read the global
    # fitLOSModel, which still held the last facility fitted by the fit loop.
    plt.title('%s %s' % (abbrev, facModel['parms']))
    plt.show()

## Look at the cases where fitting failed ##

In [None]:
# Facilities whose optimizer run did not report success.
showL = [abbrev for abbrev, rec in fitD.items() if not rec['converged']]
print(showL)

for abbrev in showL:
    facModel = fitD[abbrev]
    fitCRV = fullCRVFromPDFModel(facModel)
    facLOSDF = lineDF[lineDF['NH CODE'] == abbrev]
    display(facLOSDF.head())
    # One-day bins over the first 300 days.
    plt.hist(facLOSDF['RAW LOS (days)'], range=(0,300), bins=300, label='valid line-list')
    nLines = facLOSDF.count()['NH CODE']
    # NOTE(review): nLines counts every row for the facility, whereas chisqr()
    # scales by the number of stays <= 365 days (stored as 'nsamples' in fitD);
    # confirm which normalization is intended for the overlay.
    pltCRV(fitCRV, scale=nLines/fitCRV.cdf(365.), label='result of fitting', xlim=300)
    plt.legend()
    # Title with THIS facility's parameters.  The title used to read the global
    # fitLOSModel, which still held the last facility fitted by the fit loop.
    plt.title('%s %s' % (abbrev, facModel['parms']))
    plt.show()

## Save the fit models ##

In [None]:
# Flatten each fitted model into one CSV row: parms0..parmsN plus metadata.
# The 'converged' flag is not written out.
orecL = []
for abbrev, rec in fitD.items():
    orec = {}
    for i, v in enumerate(rec['parms']):
        orec['parms{}'.format(i)] = v
    orec['abbrev'] = abbrev
    for key in ('pdf', 'nsamples', 'chisqr_per_sample'):
        orec[key] = rec[key]
    orecL.append(orec)
# One row per facility, ordered by abbreviation.
odf = pd.DataFrame(orecL).sort_values(['abbrev'], axis=0)
odf.to_csv('los_model_fit_nh_split_fates.csv', index=False)

## Analytical Model ##

How do we map these fits into a partition between FRAIL and non-FRAIL patients?  The notion is that all patients are subject to the exponential term, which represents spontaneous problems requiring hospitalization.  Only non-FRAIL patients are subject to the lognorm term, which represents the completion of rehab and is followed by a return to the community.

$$
P = \lambda (\alpha P_H + (1 - \alpha)P_t) + (1 - \lambda)P_t
$$

$$
P = \lambda \alpha P_H + \{ \lambda (1-\alpha) + (1 - \lambda) \} P_t
$$

$$
\hat{\alpha} = \lambda \alpha \\
1 - \hat{\alpha} = \lambda (1-\alpha) + (1 - \lambda)
$$

* $P$ is total discharge probability for a patient (known)
* $P_H$ is probability of discharge home (known)
* $P_t$ is probability of transfer to another facility (known)
* $N$ is total discharges per year (known)
* $n$ is total transfers per year (known)
* $\lambda$ is the fraction *not* frail
* $\alpha$ is the weight of the lognormal term in the non-frail PDF
* $\hat{\alpha}$ is the weight of the lognormal term in the overall PDF

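Only the product $\hat{\alpha} = \lambda \alpha$ is constrained (the second equation reduces to the same condition), so the split between $\lambda$ and $\alpha$ is not unique.  It looks like a perfectly valid solution is $\lambda = \alpha = \sqrt{\hat{\alpha}}$.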

In [None]:
# Compare model tail probabilities beyond 365 days with the observed fraction.
for abbrev in ['NEWO']:
    losModel = fitD[abbrev]
    # parms layout: [lognorm weight, mu, sigma, expon rate]
    parmL = losModel['parms']
    losCRV = fullCRVFromPDFModel(losModel)
    # The lognormal term on its own.
    homeLOSModel = {'parms': parmL[1:3], 'pdf': 'lognorm(mu=$0,sigma=$1)'}
    homeCRV = fullCRVFromPDFModel(homeLOSModel)
    # The exponential term on its own.
    transLOSModel = {'parms': parmL[3:], 'pdf': 'expon(lambda=$0)'}
    transCRV = fullCRVFromPDFModel(transLOSModel)
    facLOSDF = lineDF[lineDF['NH CODE'] == abbrev]
    print('%s %s' % ('a-hat: ', parmL[0:1]))
    # Probability mass each distribution leaves beyond one year.
    for tag, tailCRV in (('full: ', losCRV), ('home: ', homeCRV), ('trans: ', transCRV)):
        print('%s %s' % (tag, 1.0 - tailCRV.cdf(365.)))
    # Observed fraction of line-list stays lasting at least 365 days.
    nLong = facLOSDF[facLOSDF['RAW LOS (days)'] >= 365.].count()['NH CODE']
    ratio = float(nLong)/float(facLOSDF.count()['NH CODE'])
    print('%s %s' % ('by count: ', ratio))

## Read and plot intervalprob values captured by JournalingCachedCDFGenerator ##

In [None]:
# NOTE: pd.read_msgpack was deprecated in pandas 0.25 and removed in pandas 1.0,
# so this cell needs an older pandas to run.
simPath = pu.pathTranslate('$(SIMDIR)/cdf_intervalprob_results.mpz')
simDF = pd.read_msgpack(simPath)
# One figure per facility, one point series per distinct pdf string.
for abbrev in simDF['abbrev'].unique():
    print(abbrev)
    df = simDF.loc[simDF['abbrev'] == abbrev]
    for pdf in df['pdf'].unique():
        print(pdf)
        subDF = df.loc[df['pdf'] == pdf]
        display(subDF.tail())
        # Plot each result at the midpoint of its [start, end] interval; the very
        # low alpha lets dense clusters of points show up as darker regions.
        midV = 0.5*(subDF['start'] + subDF['end'])
        plt.plot(midV, subDF['rslt'], '*', alpha=0.01, label=pdf)
    plt.xlim(0.0, 300.0)
    plt.title(abbrev)
    plt.legend()
    plt.show()
    