In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/poverty-rate-forecast-prophet/__results__.html
/kaggle/input/poverty-rate-forecast-prophet/__notebook__.ipynb
/kaggle/input/poverty-rate-forecast-prophet/poverty_rate_forecast.csv
/kaggle/input/poverty-rate-forecast-prophet/__output__.json
/kaggle/input/poverty-rate-forecast-prophet/custom.css
/kaggle/input/homelessness-data-collection/zillow_house_prices.csv
/kaggle/input/homelessness-data-collection/pit_data_interpolated.csv
/kaggle/input/homelessness-data-collection/yearly_employment_rates.csv
/kaggle/input/homelessness-data-collection/__results__.html
/kaggle/input/homelessness-data-collection/zillow_home_price_index.csv
/kaggle/input/homelessness-data-collection/unified_data.csv
/kaggle/input/homelessness-data-collection/census_poverty_income.csv
/kaggle/input/homelessness-data-collection/__notebook__.ipynb
/kaggle/input/homelessness-data-collection/state_abbrevations.csv
/kaggle/input/homelessness-data-collection/__output__.json
/kaggle/input/homelessness-data-colle

In [2]:
DATA_DIR = "/kaggle/input/homelessness-data-collection"
filepath = f"{DATA_DIR}/unified_data.csv"
data_df = pd.read_csv(filepath)
data_df.shape, data_df.columns

((8007, 11),
 Index(['Date', 'Overall Homeless', 'State', 'State_Name', 'unemploy_rate',
        'employed_pop_rate', 'HomeValueIndex', 'MHHI', 'Poverty_Count',
        'Poverty_Rate', 'Min_Rent'],
       dtype='object'))

In [3]:
future_df = pd.read_csv("/kaggle/input/poverty-rate-forecast-prophet/poverty_rate_forecast.csv")
tx_pr = future_df[future_df['ds'] <= "2020-01-31"]
future_df = future_df[future_df['ds'] >= "2020-01-31"]
future_df.shape, future_df.columns

((21, 5),
 Index(['State', 'ds', 'HomeValueIndex', 'unemploy_rate', 'Poverty_Rate'], dtype='object'))

In [4]:
data_df.isna().sum()

Date                   0
Overall Homeless       0
State                  0
State_Name             0
unemploy_rate          0
employed_pop_rate      0
HomeValueIndex        24
MHHI                 612
Poverty_Count        612
Poverty_Rate         612
Min_Rent               0
dtype: int64

In [5]:
data_df['Date'].min(), data_df['Date'].max()

('2007-01-31', '2020-01-31')

In [6]:
tx_data = data_df[data_df['State'].isin(["TX"])]
tx_data = tx_data.drop(columns=['employed_pop_rate', 
                                'Poverty_Count'])
tx_data.loc[tx_data['Poverty_Rate'].isna(), 'Poverty_Rate'] = tx_pr['Poverty_Rate'].values
tx_data.shape, tx_data.columns

((157, 9),
 Index(['Date', 'Overall Homeless', 'State', 'State_Name', 'unemploy_rate',
        'HomeValueIndex', 'MHHI', 'Poverty_Rate', 'Min_Rent'],
       dtype='object'))

In [7]:
tx_data['HomeValueIndex'] = (tx_data.groupby('State')['HomeValueIndex']
                             .apply(lambda x:x.fillna(x.mean())))

# tx_data['Poverty_Rate'] = (tx_data.groupby('State')['Poverty_Rate']
#                              .apply(lambda x:x.fillna(x.mean())))

tx_data['Date'] = pd.to_datetime(tx_data.Date)

In [8]:
tx_data.isna().sum()

Date                 0
Overall Homeless     0
State                0
State_Name           0
unemploy_rate        0
HomeValueIndex       0
MHHI                12
Poverty_Rate         0
Min_Rent             0
dtype: int64

In [9]:
def forecast_accuracy(forecast, actual):
    mape = np.mean(np.abs(forecast - actual)/np.abs(actual))  # MAPE
    mae = np.mean(np.abs(forecast - actual))    # MAE
    rmse = np.mean((forecast - actual)**2)**.5  # RMSE
    corr = np.corrcoef(forecast, actual)[0,1]   # corr
    return({'mape':mape, 'mae': mae, 
            'rmse':rmse, 'corr':corr})

In [10]:
from sklearn.model_selection import TimeSeriesSplit
tx_train = tx_data[tx_data['Date'] <= "2018-01-31"]
tx_val = tx_data[tx_data['Date'] > "2018-01-31"]
tx_train = tx_train.set_index('Date').select_dtypes([float, int])
tx_val = tx_val.set_index('Date').select_dtypes([float, int])
tx_train.shape, tx_val.shape

((133, 6), (24, 6))

In [11]:
from fbprophet import Prophet

ptx_train = tx_train.reset_index()
ptx_val = tx_val.reset_index()

ptx_train = ptx_train.rename(columns={'Date':"ds",'Overall Homeless':'y'})
ptx_val = ptx_val.rename(columns={'Date':"ds",'Overall Homeless':'y'})

y_val_true = ptx_val['y']
ptx_val = ptx_val.drop(columns=['y'])
ptx_train.shape, ptx_val.shape

((133, 7), (24, 6))

### Simple Forecast Homelessness

In [12]:
# define the model
m0 = Prophet()
# fit the model
m0.fit(ptx_train)
print("\n\n")
# future 
future = m0.predict(ptx_val)
forecast_accuracy(future['yhat'], y_val_true)

Initial log joint probability = -2.29847



    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      96       462.443   0.000256594       196.336   1.727e-06       0.001      152  LS failed, Hessian reset 
      99       462.498   0.000499418       135.837           1           1      155   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     133       467.281   0.000212015       261.665   8.577e-07       0.001      240  LS failed, Hessian reset 
     199       473.901     0.0179056       359.399           1           1      319   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     299       485.497     0.0889606       574.546           1           1      434   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     399       503.173    0.00620163       375.119           1           1      550   
    Iter      log 

{'mape': 0.014004870004872596,
 'mae': 366.65246239568825,
 'rmse': 390.0609482868666,
 'corr': 0.9754618170359518}

### Train adding 'unemploy_rate', 'HomeValueIndex', 'Poverty_Rate' as regressors

In [13]:
# define the model
m = Prophet()

# add regressors
for col in ['unemploy_rate', 'HomeValueIndex', 'Poverty_Rate']:
    m.add_regressor(col)

# fit the model
m.fit(ptx_train)

Initial log joint probability = -2.29847


<fbprophet.forecaster.Prophet at 0x7f7a41025410>

    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      99       467.259    0.00465266        1029.9           1           1      113   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     199       487.635    0.00102178       225.366           1           1      216   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     299       494.477   0.000265595       266.919      0.7384      0.7384      331   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     399        501.21    0.00968516       542.723           1           1      447   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     410       501.725   2.30011e-05       74.3483   1.872e-07       0.001      500  LS failed, Hessian reset 
     499       503.822   0.000866487        174.31           1           1      6

In [14]:
# define the period for which we want a prediction
val_pred = m.predict(ptx_val)
val_pred.shape, val_pred.columns

((24, 28),
 Index(['ds', 'trend', 'yhat_lower', 'yhat_upper', 'trend_lower', 'trend_upper',
        'HomeValueIndex', 'HomeValueIndex_lower', 'HomeValueIndex_upper',
        'Poverty_Rate', 'Poverty_Rate_lower', 'Poverty_Rate_upper',
        'additive_terms', 'additive_terms_lower', 'additive_terms_upper',
        'extra_regressors_additive', 'extra_regressors_additive_lower',
        'extra_regressors_additive_upper', 'unemploy_rate',
        'unemploy_rate_lower', 'unemploy_rate_upper', 'yearly', 'yearly_lower',
        'yearly_upper', 'multiplicative_terms', 'multiplicative_terms_lower',
        'multiplicative_terms_upper', 'yhat'],
       dtype='object'))

In [15]:
forecast_accuracy(val_pred['yhat'], y_val_true)

{'mape': 0.01024946962810625,
 'mae': 268.69001466408616,
 'rmse': 299.82573122701996,
 'corr': 0.9754420674515807}

### Train on whole data (train + val)

In [16]:
tx_data = tx_data.reset_index()
tx_data = tx_data.rename(columns={'Date':"ds",'Overall Homeless':'y'})

In [17]:
# define the model
m = Prophet()

# add regressors
for col in ['unemploy_rate', 'HomeValueIndex', 'Poverty_Rate']:
    m.add_regressor(col)

# fit the model
m.fit(tx_data)

Initial log joint probability = -2.89265


<fbprophet.forecaster.Prophet at 0x7f7a4056ad90>

    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      99       555.964    0.00168672       413.585           1           1      114   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     199       572.135   0.000182794       236.283      0.8669      0.8669      228   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     299       583.608   0.000372169        306.74       0.123           1      339   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     399       592.801   0.000702559       845.847           1           1      450   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     499       598.206   0.000782118       170.235           1           1      562   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     599     

In [18]:
### forecast for 2020 & 2021
future_forecasts = m.predict(future_df)
future_forecasts.shape, future_forecasts.columns

x||      ||grad||       alpha      alpha0  # evals  Notes 
    4709       669.759    2.4959e-06       82.3994   2.271e-08       0.001     5996  LS failed, Hessian reset 
    4799       669.777    1.3307e-05       48.1955           1           1     6103   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
    4816       669.778   2.36935e-06       102.083   3.182e-08       0.001     6172  LS failed, Hessian reset 
    4899       669.799   0.000215945       180.109           1           1     6266   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
    4999       670.481    0.00214545       503.546      0.9586      0.9586     6373   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
    5004       670.492   5.02922e-06       188.698   2.876e-08       0.001     6430  LS failed, Hessian reset 
    5099        670.97    0.00018037       203.776           1         

((21, 28),
 Index(['ds', 'trend', 'yhat_lower', 'yhat_upper', 'trend_lower', 'trend_upper',
        'HomeValueIndex', 'HomeValueIndex_lower', 'HomeValueIndex_upper',
        'Poverty_Rate', 'Poverty_Rate_lower', 'Poverty_Rate_upper',
        'additive_terms', 'additive_terms_lower', 'additive_terms_upper',
        'extra_regressors_additive', 'extra_regressors_additive_lower',
        'extra_regressors_additive_upper', 'unemploy_rate',
        'unemploy_rate_lower', 'unemploy_rate_upper', 'yearly', 'yearly_lower',
        'yearly_upper', 'multiplicative_terms', 'multiplicative_terms_lower',
        'multiplicative_terms_upper', 'yhat'],
       dtype='object'))

In [19]:
future_forecasts.to_csv("homelessness_forecasts.csv", index=False)

In [20]:
from IPython.display import FileLinks
FileLinks(".")

In [21]:
tx_data.tail()

Unnamed: 0,index,ds,y,State,State_Name,unemploy_rate,HomeValueIndex,MHHI,Poverty_Rate,Min_Rent
152,6903,2019-09-30,26764.0,TX,Texas,3.5,136261.0,,13.663371,606.0
153,6904,2019-10-31,26881.0,TX,Texas,3.5,136634.0,,13.585856,606.0
154,6905,2019-11-30,26994.0,TX,Texas,3.5,135749.0,,13.623926,607.0
155,6906,2019-12-31,27112.0,TX,Texas,3.5,135241.5,,13.613581,607.0
156,6907,2020-01-31,27229.0,TX,Texas,3.6,135487.0,,13.521736,608.0


In [22]:
future_forecasts[['ds', 'yhat']].tail()

Unnamed: 0,ds,yhat
16,2021-05-31,27873.043125
17,2021-06-30,27983.853225
18,2021-07-31,28116.263833
19,2021-08-31,28231.662236
20,2021-09-30,27998.267
