# Week 5 Exercise 2: Patient Cost Forecast
## Approach: Time-series forecast, one-step forward

In [1]:
import pandas as pd
import sqlite3 as sql

In [2]:
import os
# Open the Synthea/county SQLite database that sits in the current working directory.
# Original note: test.b has more than 2 years of history... AFTER testing, costs are still
# VERY different every year, so we can stick with the current data.
db_path = os.path.join(os.getcwd(), 'synthea_and_county_ga.db')
# sqlite3.connect() silently creates a brand-new, empty database when the file is missing;
# that would only surface later as a confusing "no such table" error, so fail fast here.
if not os.path.isfile(db_path):
    raise FileNotFoundError(f"SQLite database not found: {db_path}")
con = sql.connect(db_path)

In [3]:
# Load the week-4 workbook and key every row by its 'id' column in a single chained step.
df = pd.read_excel('w4e2_patient_risk_morb_scores.xlsx').set_index('id')

In [4]:
# Fixed lower bound of the encounter history window (roughly five years back).
from datetime import date
# Hard-coded rather than derived from "today": the date was chosen for when the data was created.
startdate = date(year=2017, month=1, day=1)

In [5]:
# Create one cost column per year (five prior years plus year 0), zero-filled so that
# patients with no costs in a given year keep 0.
# The fill value is the float 0.0 rather than the int 0: the yearly costs written into
# these columns later are fractional amounts, so the columns are float from the start
# and their dtype does not have to change when df.update() copies the costs in.
for cost_col in (
    'cost_yearminus5',
    'cost_yearminus4',
    'cost_yearminus3',
    'cost_yearminus2',
    'cost_yearminus1',
    'cost_year0',
):
    df[cost_col] = 0.0

In [6]:
# Total claim cost per patient per calendar year for all non-wellness encounters that
# start after `startdate`.
#
# Two deliberate choices:
#   * The query text lives in `encounter_cost_query`, not `sql`, so it does not overwrite
#     the `sqlite3 as sql` module alias (re-running the connection cell would otherwise fail).
#   * The start date is passed as a bound parameter (`?`) instead of being formatted into
#     the SQL string. str(startdate) yields the ISO form 'YYYY-MM-DD', the same text the
#     previous f-string produced.
encounter_cost_query = """
    select patient as id, strftime('%Y', start) as enc_year, sum(TOTAL_CLAIM_COST) as enc_year_cost
    from encounters 
    where start > ? and encounterclass not in ('wellness')
    group by patient, enc_year
    order by patient, enc_year
"""

df_temp = (
    pd.read_sql_query(encounter_cost_query, con, params=(str(startdate),))
    .rename(columns=str.lower)  # normalise column names to lower case
    .set_index('id')            # index by patient id so it lines up with df
    .round(2)                   # costs to two decimal places
)

In [7]:
# Calendar-year labels returned by the query -> relative-year column names used in df.
year_to_cost_col = {
    '2017': 'cost_yearminus5',
    '2018': 'cost_yearminus4',
    '2019': 'cost_yearminus3',
    '2020': 'cost_yearminus2',
    '2021': 'cost_yearminus1',
    '2022': 'cost_year0',
}
# Go from long (one row per patient/year) to wide (one row per patient, one column per year),
# then relabel the year columns. Years missing from the mapping keep their original label.
df_pivot = (
    df_temp
    .pivot(columns='enc_year', values='enc_year_cost')
    .rename(columns=year_to_cost_col)
)

In [8]:
# Preview the pivoted per-year costs. NaN marks patient/year combinations for which the
# query returned no row; the '2023' column keeps its raw year label because the rename
# mapping only covers 2017-2022.
df_pivot

enc_year,cost_yearminus5,cost_yearminus4,cost_yearminus3,cost_yearminus2,cost_yearminus1,cost_year0,2023
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
00054016-4e0f-ca1b-1c9d-ef729b4015e5,29082.12,22255.95,18381.97,30109.99,41543.14,21358.97,2414.59
000a96b5-198b-2d1a-1df1-30c2b8e02dbf,,183.85,,1058.07,1048.51,253.92,
000cea9e-5559-4bdc-88ff-fa934dd0d4af,,3656.45,8895.82,,,,67.77
000e631f-48bd-e91f-c2ec-90045a19eadb,116.08,549.47,4395.76,1833.74,29387.29,2197.88,1098.94
00154845-1b6d-eb8f-7a0d-60915f642d23,,,685.01,24700.46,497.88,,
...,...,...,...,...,...,...,...
ffe5376c-1dca-8001-a4fa-0c62b2fc5dd5,,2716.09,17685.38,14349.46,20107.93,,
ffe5c5b8-c27e-648c-cdc8-2b60a3152585,,9076.18,,,701.19,1536.08,67.77
ffe935fd-f57f-39ac-f921-1d8555d08c33,,,,,497.88,,
ffea5d9b-4772-e24c-db62-5980f63b966d,,,67.77,3024.37,4921.84,,


In [9]:
# Copy the pivoted yearly costs into df in place. DataFrame.update aligns on the index and
# on column labels and only overwrites where df_pivot holds a non-NA value, so the zero
# fill remains for patient/years without costs.
# NOTE(review): the unrenamed '2023' column has no counterpart in df and therefore appears
# to be ignored here — confirm that dropping it is intended.
df.update(df_pivot)

In [10]:
# Sanity check after merging the costs: print the index range, column count, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22632 entries, 53e40e98-c764-53a4-aaf6-6318a3c3c95d to 9fdfb702-0f46-8899-fe8c-363733532bb6
Columns: 137 entries, first to cost_year0
dtypes: bool(12), float64(39), int64(53), object(33)
memory usage: 22.0+ MB


## Create the prediction model for predicting year 0 (i.e., 2022)

In [11]:
# Target variable: the year-0 cost column (2022 according to the rename mapping in the pivot step).
y = df['cost_year0']

In [12]:
# Predictors = every 'numof' (utilization) column followed by every 'cost_year*' column,
# in that order.
cols_numof = [name for name in df.columns if 'numof' in name]
cols_cost = [name for name in df.columns if 'cost_year' in name]
# 'cost_year0' matches the 'cost_year' filter too, but it is the target we are trying to
# predict, so it is removed from the feature set. .drop() also hands back a new frame.
X = df[cols_numof + cols_cost].drop(columns=['cost_year0'])

In [13]:
from sklearn.model_selection import train_test_split
# Hold out part of the rows for evaluation. No test_size is given, so scikit-learn's default
# split proportion applies; random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [15]:
from sklearn.ensemble import RandomForestRegressor
# Build a 100-tree random forest with a fixed seed and train it on the training split.
# fit() returns the estimator itself, so construction and training chain into one statement.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
# Last expression of the cell: the score on the held-out split (R^2 for scikit-learn
# regressors), which the notebook displays.
model.score(X_test, y_test)






0.7605832641428127

## Now that we have a model
### Shift by one year and predict following year

In [16]:
# Slide the cost history forward by one year: each lag feature in X (key) is overwritten
# with the column from df that is one year more recent (value). All sources are read from
# df, which is not modified here, so the order of assignment does not matter.
shifted_sources = {
    'cost_yearminus1': 'cost_year0',
    'cost_yearminus2': 'cost_yearminus1',
    'cost_yearminus3': 'cost_yearminus2',
    'cost_yearminus4': 'cost_yearminus3',
    'cost_yearminus5': 'cost_yearminus4',
}
for feature_col, source_col in shifted_sources.items():
    X[feature_col] = df[source_col]

0


In [17]:
# Predict next year's cost from the shifted feature matrix X.
# `result` is computed here because no earlier cell in the notebook defines it; on a fresh
# Restart & Run All this cell would otherwise stop with a NameError. Wrapping the prediction
# array in a Series indexed like X makes the assignment below align on patient id.
result = pd.Series(model.predict(X), index=X.index)
df['cost_yearplus1'] = result.round(2)

In [18]:
# Write the full patient table, including the per-year cost columns and the
# 'cost_yearplus1' forecast, to an Excel workbook in the working directory.
df.to_excel('w5e2_patient_cost_forecast.xlsx')