## This notebook tests LOS (length-of-stay) internal consistency for NH (nursing home) facilityfacts files ##

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import expon
import scipy.optimize as op
from stats import fullCRVFromPDFModel, CachedCDFGenerator
import tools_util as tu
import pyrheautils as pu

In [None]:
# Load the run description and build the facility dictionary used by the
# later cells.
# NOTE(review): the path below is an absolute path into one developer's
# checkout; edit it before running anywhere else.
runDesc = '/home/welling/git/pyRHEA_github/src/sim/twoyear_allfac_OC.yaml'
inputDict = tu.readModelInputs(runDesc)
# Presumably registers path substitutions such as $(MODELDIR), which later
# cells hand to pu.pathTranslate -- confirm against pyrheautils.
pu.prepPathTranslations(inputDict)
# facDict is indexed by facility abbreviation below (e.g. 'NEWO', 'STAN').
facDict = tu.getFacDict(inputDict)


In [None]:
# Smoke test that the kernel runs, then show NEWO's LOS model record.
print 'hello world'
print facDict['NEWO']['losModel']

In [None]:
# For every nursing home, pair the tabulated meanLOS with the mean of the
# distribution built from that facility's LOS model.
xV, yV, labels = [], [], []
for abbrev, rec in facDict.items():
    if rec['category'] != 'NURSINGHOME':
        continue
    print(abbrev)
    # rec is facDict[abbrev], so the model can be read straight from it
    losCRV = fullCRVFromPDFModel(rec['losModel'])
    xV.append(rec['meanLOS']['value'])
    yV.append(losCRV.mean())
    labels.append(abbrev)
        


In [None]:
xVV = np.asarray(xV)
yVV = np.asarray(yV)
plt.plot(xVV, yVV, '*')
plt.plot([xVV.min(), xVV.max()], [xVV.min(), xVV.max()], '-')
plt.xlabel('meanLOS')
plt.ylabel('LOS PDF mean')
for x, y, abbrev in zip(xV, yV, labels):
    ratio = x/y
    if ratio < 0.99 or ratio > 1.01 or abbrev in ['EXTW']:
        print abbrev, x, y, facDict[abbrev]['losModel']
        plt.annotate(abbrev, xy=(x, y))
plt.show()

In [None]:
# Copy STAN's LOS model so it can be edited without touching facDict.
# .copy() is presumably a shallow dict copy, so the 'parms' list is re-sliced
# to obtain a separate list object as well.
lm = facDict['STAN']['losModel'].copy()
lm['parms'] = lm['parms'][:]  # avoid overwriting
print lm

In [None]:
# Change the first parameter on the copy only; the second print shows whether
# the record held in facDict was left untouched.
lm['parms'][0] = 0.0
print lm
print facDict['STAN']['losModel']

In [None]:
print lm

In [None]:
# Build the LOS distribution for STAN from its model record.
crv = fullCRVFromPDFModel(facDict['STAN']['losModel'])

In [None]:
print crv.mean()

In [None]:
# Cross-check against the mean of a single Weibull with shape 0.610925 and
# scale 162.329318 (the same constants are bound to frailK / frailScale in a
# later cell).
from scipy.stats import weibull_min
print weibull_min(0.610925, scale=162.329318).mean()

In [None]:
# Weibull with shape baseK and scale baseScale; called "non-frail" here.
baseScale = 20.559685
baseK = 1.352236
nonFrailCRV = weibull_min(baseK, scale=baseScale)
print nonFrailCRV.mean()

In [None]:
# Plot the non-frail PDF on 0..100 and mark its mean on the x axis.
# NOTE: this rebinds xV and yV, which earlier cells used for the per-facility
# mean lists -- re-running those cells' plot afterwards will see these arrays.
xV = np.linspace(0.0, 100.0, 101)
yV = nonFrailCRV.pdf(xV)
plt.plot(xV, yV, '-')
plt.plot([nonFrailCRV.mean()], [0.0], '*')
plt.show()

In [None]:
def pltCRV(crv, scale=1.0, label=None, xlim=None):
    """Plot scale * crv.pdf on 101 evenly spaced points and mark the mean.

    crv   -- object exposing mean() and pdf(), e.g. a frozen scipy.stats
             distribution
    scale -- multiplier applied to the PDF values before plotting
    label -- legend label for the curve; left out of the plot call when None
    xlim  -- right-hand end of the sampled range; when None the range ends at
             round(3 * mean)

    The mean is drawn as a '*' at height 0.0 and is also printed.
    """
    meanVal = crv.mean()
    upper = round(3*meanVal) if xlim is None else xlim
    grid = np.linspace(0.0, upper, 101)
    curveKwargs = {} if label is None else {'label': label}
    plt.plot(grid, scale * crv.pdf(grid), '-', **curveKwargs)
    plt.plot([meanVal], [0.0], '*')
    print(meanVal)


In [None]:
# Compare the non-frail curve with the same shape at 1.5x the scale.
pltCRV(nonFrailCRV)
pltCRV(weibull_min(baseK, scale=1.5*baseScale))
plt.show()

In [None]:
# Weibull built from the second ("frail") shape/scale pair.
frailCRV = weibull_min(0.610925, scale=162.329318)
pltCRV(frailCRV)
plt.show()

In [None]:
for idx in xrange(10000):
    sv = frailCRV.cdf(float(idx))
    ev = frailCRV.cdf(float(idx+1))
    chance = (ev-sv)/(1.0-sv)
    if np.random.random() < chance:
        print idx, chance


In [None]:
import phacsl.utils.formats.csv_tools as csv_tools
# Parse the 09-29-2017 line list and keep the LOS field of every NEWO record.
lineListPath = pu.pathTranslate('$(MODELDIR)/'
                                'OC_Nursing_Home_LOS_Line-Lists_for_RHEA_2.0_-_2011-2015_-_Adult_Only_-_09-29-2017_FINAL_NH_LOS_Line_List.csv')
keys, recs = csv_tools.parseCSV(lineListPath)
print(keys)
samps = [rec['LOS'] for rec in recs if rec['CODE'] == 'NEWO']

In [None]:
# Overlay the NEWO line-list LOS histogram with the frail and non-frail Weibull
# components, before and after a 2.4x rescaling of their scale parameters, plus
# the full NEWO model rebuilt with the unscaled scale parameters.
%config InlineBackend.print_figure_kwargs = {'bbox_inches':None}
plt.rcParams["figure.figsize"] = [16, 10]

baseScale = 20.559685
baseK = 1.352236
nonFrailCRV = weibull_min(baseK, scale=baseScale)
# NOTE(review): 2.4 is a hard-coded scale multiplier; where it comes from is
# not shown in this notebook.
scaledNonFrailCRV = weibull_min(baseK, scale=2.4*baseScale)

frailScale = 162.329318
frailK = 0.610925
frailCRV = weibull_min(frailK, scale=frailScale)
scaledFrailCRV = weibull_min(frailK, scale=2.4*frailScale)


# samps is expected to hold the NEWO LOS values read from the line-list CSV.
sampV = np.asarray(samps)
newoLOSModel = facDict['NEWO']['losModel']
newoCRV = fullCRVFromPDFModel(newoLOSModel)
xlim = 300
#xlim = None
plt.hist(sampV, bins=100, range=(0.0, xlim), density=True, label='LINE_LIST samples')
#plt.hist(sampV, bins=100, density=True, label='LINE_LIST samples', log=True)
#pltCRV(newoCRV, label='NEWO scaled LOS', xlim=xlim)
# parms[0] is used as the weight of the frail component: frail curves are
# drawn scaled by lmda, non-frail curves by 1 - lmda.
lmda = newoLOSModel['parms'][0]
pltCRV(frailCRV, scale=lmda, label='FRAIL before scaling', xlim=xlim)
pltCRV(nonFrailCRV, scale=1.0-lmda, label='non-FRAIL before scaling', xlim=xlim)
pltCRV(scaledFrailCRV, scale=lmda, label='FRAIL after scaling', xlim=xlim)
pltCRV(scaledNonFrailCRV, scale=1.0-lmda, label='non-FRAIL after scaling', xlim=xlim)

# Copy the model, then overwrite parms[4] and parms[2] with the unscaled
# non-frail and frail scales (index meaning inferred from this usage only --
# confirm against fullCRVFromPDFModel).
unscaledLOSModel = newoLOSModel.copy()
unscaledLOSModel['parms'] = newoLOSModel['parms'][:]  # avoid overwriting
print unscaledLOSModel['parms']
unscaledLOSModel['parms'][4] = baseScale
unscaledLOSModel['parms'][2] = frailScale
print unscaledLOSModel['parms']
unscaledFullCRV = fullCRVFromPDFModel(unscaledLOSModel)
pltCRV(unscaledFullCRV, label='NEWO LOS model before scaling', xlim=xlim)

plt.legend()
plt.show()

In [None]:
# Analytic means of the two component distributions.
print nonFrailCRV.mean()
print frailCRV.mean()

In [None]:
# Monte Carlo check of the frail mean: average of 10 batch means, each batch
# being 100 random draws.
# The batch means get their own name: binding them to `samps` would overwrite
# the NEWO line-list LOS values that the histogram cell turns into sampV.
frailBatchMeans = [frailCRV.rvs(100).mean() for idx in range(10)]
print(np.asarray(frailBatchMeans).mean())

In [None]:
# Monte Carlo check of the non-frail mean: average of 10 batch means, each
# batch being 100 random draws.
# The batch means get their own name: binding them to `samps` would overwrite
# the NEWO line-list LOS values that the histogram cell turns into sampV.
nonFrailBatchMeans = [nonFrailCRV.rvs(100).sum()/100.0 for idx in range(10)]
print(np.asarray(nonFrailBatchMeans).mean())

In [None]:
# Re-display NEWO's LOS model record for reference.
print facDict['NEWO']['losModel']

In [None]:
import cPickle as pickle
# Load the (sampBins, sampCounts) pair stored in tmp.pkl and normalise the
# counts to fractions of the total.
# The file is opened in binary mode: universal-newline text mode ('rU') may
# rewrite bytes in the stream, which corrupts any pickle written with a binary
# protocol.
# NOTE(review): unpickling can execute arbitrary code -- only load a tmp.pkl
# that you produced yourself.
with open('/home/welling/git/pyRHEA_github/src/sim/tmp.pkl', 'rb') as f:
    sampBins, sampCounts = pickle.load(f)
print(sampBins)
print(sampCounts)
ctSum = float(sum(sampCounts))
scaledCounts = [float(ct)/ctSum for ct in sampCounts]
print(scaledCounts)

In [None]:
# Count-weighted mean of the bin positions (float start value keeps the
# accumulation in floating point from the first term).
wtSum = sum((x * ct for x, ct in zip(sampBins, sampCounts)), 0.0)
print(wtSum/ctSum)

In [None]:
# Overlay the NEWO model PDF, a vertical line at the tabulated meanLOS, and the
# normalised sample counts loaded from tmp.pkl.
newoCRV = fullCRVFromPDFModel(facDict['NEWO']['losModel'])
#plt.hist(sampV, bins=100, range=(0,300), density=True, label='double Weibull CRV')
pltCRV(newoCRV, label='NEWO LINE_LIST')
meanLOS = facDict['NEWO']['meanLOS']['value']
# Vertical marker at meanLOS; 0.03 is simply the height drawn.
plt.plot((meanLOS, meanLOS), (0.0, 0.03), '-')
rects = plt.bar(sampBins, scaledCounts, width=1.0,  color='b')

plt.legend()
plt.show()

In [None]:
# Push a synthetic cohort through the NEWO LOS model one day at a time and
# compare the resulting daily discharge fractions with the model PDF.
cG = CachedCDFGenerator(newoCRV)
totPatients = 10000.0
nPatients = totPatients
low = 0.0
tripleL = []
while low < 300.0:
    high = low + 1.0
    # intervalProb is applied to the patients still present, i.e. it is used
    # here as the chance of leaving during [low, high) given presence at low.
    drop = cG.intervalProb(low, high) * nPatients
    tripleL.append((low, high, drop))
    nPatients -= drop
    low += 1.0
print('%s patients remain' % nPatients)


# Bin midpoints and the fraction of the starting cohort leaving in each bin.
synthBins = [0.5*(low + high) for low, high, drop in tripleL]
synthCts = [drop/totPatients for low, high, drop in tripleL]
rects = plt.bar(synthBins, synthCts, width=1.0,  color='b', label='fractions', alpha=0.3)
pltCRV(newoCRV, label='NEWO CRV')
plt.legend()
plt.show()
# synthCts are already fractions of the starting cohort, so the weighted sum of
# bin midpoints is the mean directly; dividing by totPatients a second time
# would shrink it by a further factor of 10000.  Only the first 300 days are
# covered, so patients still remaining at that point are not included.
print(sum([a*b for a, b in zip(synthBins, synthCts)]))


In [None]:
# Load a msgpack-serialised DataFrame (interval-probability results, judging
# by the file name).
# NOTE(review): pd.read_msgpack was deprecated in pandas 0.25 and removed in
# pandas 1.0, so this cell needs an older pandas.
df = pd.read_msgpack('/home/welling/git/pyRHEA_github/src/sim/cdf_intervalprob_results.mpz')
df.columns

In [None]:
# Scatter the recorded (start, rslt) pairs for NEWO over the synthetic daily
# fractions.
newoDF = df[df.abbrev == 'NEWO']
nRows = newoDF['abbrev'].count()
print(nRows)
xL = []
yL = []
for pos in xrange(nRows):
    ser = newoDF.iloc[pos]
    xL.append(ser['start'])
    yL.append(ser['rslt'])
plt.plot(xL, yL, 'o', alpha=0.1)
rects = plt.bar(synthBins, synthCts, width=1.0,  color='b', label='fractions')

plt.show()


In [None]:
# Keep one row per (start, end, abbrev) and check that every interval is
# exactly one unit long; the value of all(...) is the cell's output.
uniqueDF = newoDF.drop_duplicates(subset=['start', 'end', 'abbrev'])
# (not the last expression in the cell, so this line displays nothing)
uniqueDF.columns
all(uniqueDF['end'] - uniqueDF['start'] == 1)
#uniqueDF['dt'] = (uniqueDF['end'] - uniqueDF['start']).copy()
#uniqueDF['dt'] == 1

In [None]:
# Running product of (1 - rslt) down the rows, stored as column 'accumwt'.
# Work on an explicit copy: uniqueDF was derived from a filtered frame, and
# adding a column to such a frame can raise pandas' SettingWithCopyWarning.
uniqueDF = uniqueDF.copy()
# cumprod multiplies in row order, so it yields the same running product as a
# row-by-row loop without the per-row overhead of iterrows().
uniqueDF['accumwt'] = np.cumprod(1.0 - uniqueDF['rslt'].values.astype(float))
#uniqueDF

In [None]:
# Compare the accumulated weights (plotted at interval midpoints) with
# 1 - CDF of the NEWO model on 0..300.
plt.plot(0.5*(uniqueDF['start']+uniqueDF['end']), uniqueDF['accumwt'])
xV = np.linspace(0.0, 300.0, 100)
yV = 1.0 - newoCRV.cdf(xV)
plt.plot(xV, yV, 'x')
plt.show()

In [None]:
lst = [(x, y) for x, y in zip(xL, yL)]
lst.sort()
print lst[:10]

In [None]:
plt.hist([a for a,b in lst], bins=30, log=True)
plt.show()

In [None]:
# Line list dated 09-29-2017 ("FINAL").
oldDS = pd.read_csv(pu.pathTranslate('$(MODELDIR)/OC_Nursing_Home_LOS_Line-Lists_for_RHEA_2.0_-_2011-2015_-_Adult_Only_-_09-29-2017_FINAL_NH_LOS_Line_List.csv'))

In [None]:
# Line list dated 05-21-2019 ("UPDATE_v2").
newDS = pd.read_csv(pu.pathTranslate('$(MODELDIR)/OC_Nursing_Home_LOS_Line-Lists_for_RHEA_2.0_-_2011-2015_-_Adult_Only_-_05-21-2019_UPDATE_v2_NH_LOS_Line_List.csv'))

In [None]:
oldDS.columns

In [None]:
newDS.columns

In [None]:
# NEWO rows for 2014 whose 'NOT YET DISCHARGED?' field is non-null.
# NOTE(review): this cell uses the column names 'NH CODE',
# 'DISCHARGE/ASSESSMENT YEAR' and 'NOT YET DISCHARGED?', while other cells use
# 'YEAR', 'Not yet discharged?' and 'LOS (days)'; confirm which set the
# current CSV actually has.
newDS[(newDS['NH CODE'] == 'NEWO') & (newDS['DISCHARGE/ASSESSMENT YEAR']==2014) & newDS['NOT YET DISCHARGED?'].notna()]

In [None]:
print newDS[newDS['NH CODE'] == 'NEWO'].columns
subDS = newDS[newDS['NH CODE'] == 'NEWO']
# NEWO rows whose flag is literally 'X'.
subDS[subDS['NOT YET DISCHARGED?'] == 'X']

In [None]:
# Record counts per facility and discharge year in the old line list.
oldCtDS = oldDS.groupby(['CODE', 'DISCHARGE YEAR']).count()
oldCtDS

In [None]:
# Average the yearly counts per facility.  The count column is still named
# 'LOS' at this point, so it is renamed to describe what it now holds.
dischargeDS = oldCtDS.groupby('CODE').mean().rename(columns={'LOS':'OLD DISCH MEAN'})
dischargeDS

In [None]:
print newDS.columns
# dropna() keeps only rows with no missing value in any column; the yearly
# counts of those rows are then averaged per facility.
# NOTE(review): the name suggests these are the not-yet-discharged rows, which
# holds only if the flag column is the only one that is ever empty -- confirm.
newNotDischDS = newDS.dropna().groupby(['NH CODE', 'YEAR']).count().groupby('NH CODE').mean()


In [None]:
# Left-join onto the per-facility table, then discard the 'LOS (days)' column.
dischargeDS = dischargeDS.join(newNotDischDS, how='left').drop(columns=['LOS (days)'])


In [None]:
print newDS.columns
print dischargeDS.columns
# Mean yearly count, per facility, of the rows whose flag is empty.
# NOTE(review): newDischCtDS is not used afterwards -- the join below is
# commented out.
newDischCtDS = newDS[newDS['Not yet discharged?'].isna()].drop(columns=['Not yet discharged?']).groupby(['NH CODE', 'YEAR']).count().groupby(['NH CODE']).mean()

#dischargeDS = dischargeDS.join(newDischCtDS, how='left')
print dischargeDS.columns
# NOTE(review): with the join disabled, and 'LOS (days)' dropped from
# dischargeDS in the preceding cell, this rename appears to have no column to
# act on.
dischargeDS = dischargeDS.rename(columns={'LOS (days)': 'NEW DISCH MEAN'})
dischargeDS

In [None]:
# Show the keys of NEWO's record, its tabulated mean LOS, and the LOS implied
# by occupancy: 365 * meanPop / totalAdmissions.
print(facDict['NEWO'].keys())
print(facDict['NEWO']['meanLOS'])
# Multiply by 365.0 before dividing so the division is always done in floating
# point; under Python 2, meanPop/totalAdmissions on its own floors to an
# integer whenever both values are stored as ints.
print(365.0*facDict['NEWO']['meanPop']['value']/facDict['NEWO']['totalAdmissions']['value'])

In [None]:
# Turn the index into an ordinary column so that the row-wise lookups below
# can read row['CODE'].
dischargeDS = dischargeDS.reset_index()

In [None]:
def myFun(row):
    """Return the facility's totalAdmissions value, or NaN if it is unknown.

    row -- mapping with a 'CODE' entry holding the facility abbreviation,
           e.g. a DataFrame row passed in by apply(..., axis=1)

    The value is looked up in the notebook-level facDict.
    """
    abbrev = row['CODE']
    if abbrev not in facDict:
        # np.nan rather than np.NaN: the capitalised alias was removed in
        # NumPy 2.0, while the lower-case name exists in every release.
        return np.nan
    return facDict[abbrev]['totalAdmissions']['value']
# Look up totalAdmissions for every row; codes missing from facDict get NaN.
dischargeDS['totalAdmissions'] = dischargeDS.apply(myFun, axis=1)
dischargeDS

In [None]:
def myFun(row):
    """Return the facility's totalDischarges value, or NaN if it is unknown.

    row -- mapping with a 'CODE' entry holding the facility abbreviation,
           e.g. a DataFrame row passed in by apply(..., axis=1)

    The value is looked up in the notebook-level facDict.
    """
    abbrev = row['CODE']
    if abbrev not in facDict:
        # np.nan rather than np.NaN: the capitalised alias was removed in
        # NumPy 2.0, while the lower-case name exists in every release.
        return np.nan
    return facDict[abbrev]['totalDischarges']['value']
# Look up totalDischarges for every row; codes missing from facDict get NaN.
dischargeDS['totalDischarges'] = dischargeDS.apply(myFun, axis=1)
dischargeDS

In [None]:
def myFun(row):
    """Return the facility's meanPop value, or NaN if it is unknown.

    row -- mapping with a 'CODE' entry holding the facility abbreviation,
           e.g. a DataFrame row passed in by apply(..., axis=1)

    The value is looked up in the notebook-level facDict.
    """
    abbrev = row['CODE']
    if abbrev not in facDict:
        # np.nan rather than np.NaN: the capitalised alias was removed in
        # NumPy 2.0, while the lower-case name exists in every release.
        return np.nan
    return facDict[abbrev]['meanPop']['value']
# Look up meanPop for every row; codes missing from facDict get NaN.
dischargeDS['meanPop'] = dischargeDS.apply(myFun, axis=1)
dischargeDS

In [None]:
# Aliases: the cells below use newDF / oldDF for the same two line-list frames.
newDF = newDS
oldDF = oldDS

In [None]:
# Log-count LOS histograms: all new rows ('include'), new rows whose
# 'Not yet discharged?' field is empty ('exclude'), and the old list ('old').
plt.hist(newDF['LOS (days)'], bins=300, log=True, label='include')
plt.hist(newDF[newDF['Not yet discharged?'].isna()]['LOS (days)'], bins=300, log=True, label='exclude')
plt.hist(oldDF['LOS'], bins=300, log=True, label='old')
plt.legend()
plt.show()

In [None]:
# Same include/exclude comparison on a linear scale, limited to 0..300 days.
plt.hist(newDF['LOS (days)'], bins=300, label='include', range=(0,300))
#plt.hist(oldDF['LOS'], bins=300, label='old', range=(0,300))
plt.hist(newDF[newDF['Not yet discharged?'].isna()]['LOS (days)'], bins=300, label='exclude', range=(0, 300))
plt.xlim(0, 300)
plt.legend()
plt.show()

In [None]:
# Per-facility mean LOS from the new line list (rows whose 'Not yet
# discharged?' field is empty) joined with the per-facility mean from the old
# line list.
# The intermediate frames get their own names: binding them to `df` would
# overwrite the msgpack results frame that the NEWO cells above read from.
dischargedMask = newDF['Not yet discharged?'].isna()
newMeanDF = (newDF[dischargedMask]
             .drop(columns=['Not yet discharged?', 'YEAR'])
             .groupby(['NH CODE']).mean()
             .rename(columns={'LOS (days)':'New mean LOS'}))
oldMeanDF = oldDF.drop(columns='DISCHARGE YEAR').groupby(['CODE']).mean()
losDF = newMeanDF.join(oldMeanDF, how='left')
losDF = losDF.rename(columns={'LOS':'Old mean LOS'})

def myFun(row):
    """Return the facility's tabulated meanLOS value, or NaN if it is unknown.

    row -- object whose .name attribute is the facility abbreviation, e.g. a
           row of a DataFrame indexed by facility code, passed in by
           apply(..., axis=1)

    The value is looked up in the notebook-level facDict.
    """
    abbrev = row.name
    if abbrev not in facDict:
        # np.nan rather than np.NaN: the capitalised alias was removed in
        # NumPy 2.0, while the lower-case name exists in every release.
        return np.nan
    return facDict[abbrev]['meanLOS']['value']
# Attach the tabulated meanLOS for each facility; myFun reads the facility
# code from row.name, i.e. from the index of losDF.
ser = losDF.apply(myFun, axis=1)
losDF['LOS from props'] = ser
losDF

In [None]:
# New line-list mean LOS versus tabulated meanLOS, with the identity line
# drawn from 0 to 300 for reference.
plt.plot(losDF['New mean LOS'], losDF['LOS from props'], 'o')
plt.plot((0.0, 300.0), (0.0, 300.0))
plt.show()

In [None]:
print newDS.columns
newDS.head()

In [None]:
# Constant column, so that counting it per group gives a plain record count.
newDS['COUNTME'] = 1

In [None]:
rawCountDF = newDS[['NH CODE', 'DISCHARGE/ASSESSMENT YEAR', 'COUNTME']].groupby(['NH CODE', 'DISCHARGE/ASSESSMENT YEAR']).count()
rawCountDF = rawCountDF.reset_index()

In [None]:
rawCountDF[rawCountDF['NH CODE'] == 'NEWO']

In [None]:
nydCountDF = newDS[['NH CODE', 'DISCHARGE/ASSESSMENT YEAR', 'NOT YET DISCHARGED?']].groupby(['NH CODE', 'DISCHARGE/ASSESSMENT YEAR']).count()
nydCountDF = nydCountDF.reset_index()

In [None]:
nydCountDF[nydCountDF['NH CODE'] == 'NEWO']

In [None]:
adCountDF = newDS[['NH CODE', 'DISCHARGE/ASSESSMENT YEAR', 'ADD 45 DAYS?']].groupby(['NH CODE', 'DISCHARGE/ASSESSMENT YEAR']).count()
adCountDF = adCountDF.reset_index()

In [None]:
adCountDF[adCountDF['NH CODE'] == 'NEWO']

In [None]:
# Put the three per-facility, per-year counts side by side.  The count frames
# were built with the same grouping followed by reset_index(), so their rows
# are expected to line up on the default integer index.
countsDF = rawCountDF.copy().rename(columns={'COUNTME':'TOTAL RECS'})
countsDF['NOT YET DISCHARGED?'] = nydCountDF['NOT YET DISCHARGED?']
countsDF['ADD 45 DAYS?'] = adCountDF['ADD 45 DAYS?']
countsDF['NH CODE'] = countsDF['NH CODE'].astype('category')
# Share of a group's records that carry an 'ADD 45 DAYS?' value.
countsDF['FRAC LOST DISCHARGES'] = countsDF['ADD 45 DAYS?'].astype(np.float32)/countsDF['TOTAL RECS'].astype(np.float32)

In [None]:
countsDF[countsDF['NH CODE'] == 'NEWO']

In [None]:
countsDF.dtypes

In [None]:
# Sum the yearly counts per facility, dropping the summed year and the summed
# per-year fraction, then recompute the fraction from the totals.
sumsDF = countsDF.groupby(['NH CODE']).sum().drop(columns=['DISCHARGE/ASSESSMENT YEAR', 'FRAC LOST DISCHARGES']).reset_index()

sumsDF['FRAC LOST DISCHARGES'] = sumsDF['ADD 45 DAYS?'].astype(np.float32)/sumsDF['TOTAL RECS'].astype(np.float32)

In [None]:
sumsDF[sumsDF['NH CODE'] == 'NEWO']

In [None]:
print 'min: ', sumsDF['FRAC LOST DISCHARGES'].min()
print 'median: ', sumsDF['FRAC LOST DISCHARGES'].median()
print 'mean: ', sumsDF['FRAC LOST DISCHARGES'].mean()
print 'max: ', sumsDF['FRAC LOST DISCHARGES'].max()
