In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

/kaggle/input/homelessness-data-collection/zillow_house_prices.csv
/kaggle/input/homelessness-data-collection/pit_data_interpolated.csv
/kaggle/input/homelessness-data-collection/yearly_employment_rates.csv
/kaggle/input/homelessness-data-collection/__results__.html
/kaggle/input/homelessness-data-collection/zillow_home_price_index.csv
/kaggle/input/homelessness-data-collection/unified_data.csv
/kaggle/input/homelessness-data-collection/census_poverty_income.csv
/kaggle/input/homelessness-data-collection/__notebook__.ipynb
/kaggle/input/homelessness-data-collection/state_abbrevations.csv
/kaggle/input/homelessness-data-collection/__output__.json
/kaggle/input/homelessness-data-collection/yearly_house_rents.csv
/kaggle/input/homelessness-data-collection/custom.css


In [2]:
# Folder holding the dataset; the CSVs loaded in this notebook are read from here.
DATA_DIR = "/kaggle/input/homelessness-data-collection"

# Zillow home value index table.
zillow_csv = f"{DATA_DIR}/zillow_home_price_index.csv"
zdf = pd.read_csv(zillow_csv)
(zdf.shape, zdf.columns)

((13311, 3), Index(['State', 'Date', 'HomeValueIndex'], dtype='object'))

In [3]:
# Employment table, narrowed to the join keys and the unemployment rate.
employment_csv = f"{DATA_DIR}/yearly_employment_rates.csv"
uedf = pd.read_csv(employment_csv).loc[:, ['State', 'Date', 'unemploy_rate']]
(uedf.shape, uedf.columns)

((13311, 3), Index(['State', 'Date', 'unemploy_rate'], dtype='object'))

In [4]:
# Census poverty table, narrowed to the join keys and the poverty rate.
poverty_csv = f"{DATA_DIR}/census_poverty_income.csv"
pdf = pd.read_csv(poverty_csv).loc[:, ['State', 'Date', 'Poverty_Rate']]

# The latest date here marks where observed poverty rates stop.
(pdf.shape, pdf['Date'].max())

((11679, 3), '2019-01-31')

In [5]:
# Left-join unemployment and poverty onto the home-value table so that every
# (State, Date) row of the Zillow data is kept.
merge_keys = ['State', 'Date']
data_df = (
    zdf
    .merge(uedf, how="left", on=merge_keys)
    .merge(pdf, how="left", on=merge_keys)
)
(data_df.shape, data_df.columns)

((13311, 5),
 Index(['State', 'Date', 'HomeValueIndex', 'unemploy_rate', 'Poverty_Rate'], dtype='object'))

In [6]:
# Fill missing home-value readings with the mean of the same state.
# ``transform`` returns a Series aligned to data_df's own index, so the
# assignment works on every pandas version. The previous
# ``groupby(...).apply(lambda ...)`` form prepends the group key to the index
# on pandas >= 2.0, which makes the column assignment fail.
state_mean_hvi = data_df.groupby('State')['HomeValueIndex'].transform('mean')
data_df['HomeValueIndex'] = data_df['HomeValueIndex'].fillna(state_mean_hvi)

# Parse the date strings into real timestamps so later comparisons are
# chronological rather than lexical.
data_df['Date'] = pd.to_datetime(data_df['Date'])
data_df['Date'].max()

Timestamp('2021-09-30 00:00:00')

In [7]:
# Per-column count of values that are still missing.
data_df.isnull().sum()

State                0
Date                 0
HomeValueIndex       0
unemploy_rate        0
Poverty_Rate      1632
dtype: int64

In [8]:
# Earliest and latest timestamp in the merged frame.
all_dates = data_df['Date']
(all_dates.min(), all_dates.max())

(Timestamp('2000-01-31 00:00:00'), Timestamp('2021-09-30 00:00:00'))

In [9]:
# Restrict to Texas, then split at 2019-01-31 (the latest date seen in the
# poverty table): later rows form the horizon to forecast, the rest is history.
tx_rows = data_df[data_df['State'] == "TX"]
future_df = tx_rows[tx_rows['Date'] > "2019-01-31"]
tx_data = tx_rows[tx_rows['Date'] <= "2019-01-31"]
(tx_data.shape, tx_data.columns)

((229, 5),
 Index(['State', 'Date', 'HomeValueIndex', 'unemploy_rate', 'Poverty_Rate'], dtype='object'))

In [10]:
from sklearn.model_selection import TimeSeriesSplit
# Chronological hold-out: rows up to 2017-01-31 train the model, later rows
# validate it. Both parts are indexed by Date and reduced to numeric columns.
train_rows = tx_data[tx_data['Date'] <= "2017-01-31"]
val_rows = tx_data[tx_data['Date'] > "2017-01-31"]
tx_train, tx_val = (
    part.set_index('Date').select_dtypes([float, int])
    for part in (train_rows, val_rows)
)
(tx_train.shape, tx_val.shape)

((205, 3), (24, 3))

In [11]:
tx_train.isna().sum()

HomeValueIndex    0
unemploy_rate     0
Poverty_Rate      0
dtype: int64

In [12]:
from fbprophet import Prophet

def adjust(val, length= 6):
    """Return ``str(val)`` left-justified and space-padded to ``length`` characters.

    A value whose string form is already at least ``length`` characters long is
    returned as-is (never truncated).
    """
    text = str(val)
    return text.ljust(length)

def forecast_accuracy(forecast, actual):
    """Compute point-forecast error metrics between two equal-length sequences.

    Values are paired by position. Both inputs are converted to NumPy float
    arrays first, so pandas Series with different index labels are not
    silently realigned (which would produce NaN metrics) and plain lists are
    accepted as well.

    Parameters
    ----------
    forecast : array-like
        Predicted values.
    actual : array-like
        Observed values, in the same order as ``forecast``.

    Returns
    -------
    dict
        ``'mape'``: mean absolute percentage error, as a fraction (not x100);
        inf/NaN when ``actual`` contains zeros.
        ``'mae'``: mean absolute error.
        ``'rmse'``: root mean squared error.
        ``'corr'``: Pearson correlation between forecast and actual.
    """
    # Strip any pandas index so arithmetic below is strictly positional.
    forecast = np.asarray(forecast, dtype=float)
    actual = np.asarray(actual, dtype=float)

    error = forecast - actual
    abs_error = np.abs(error)

    mape = np.mean(abs_error / np.abs(actual))  # MAPE
    mae = np.mean(abs_error)                    # MAE
    rmse = np.mean(error ** 2) ** .5            # RMSE
    corr = np.corrcoef(forecast, actual)[0, 1]  # corr
    return {'mape': mape, 'mae': mae,
            'rmse': rmse, 'corr': corr}

In [13]:
# Move Date back into a column and switch to the "ds" (timestamp) / "y"
# (target) column names used for the Prophet model.
prophet_names = {'Date':"ds",'Poverty_Rate':'y'}
ptx_train = tx_train.reset_index().rename(columns=prophet_names)
val_with_target = tx_val.reset_index().rename(columns=prophet_names)

# Keep the validation target aside for scoring; the frame handed to predict
# then carries only the timestamp and the regressors.
y_val_true = val_with_target['y']
ptx_val = val_with_target.drop(columns=['y'])
(ptx_train.shape, ptx_val.shape)

((205, 4), (24, 3))

In [14]:
# Validation model: a default Prophet with the two exogenous series
# registered as extra regressors.
m = Prophet()
extra_regressors = ('unemploy_rate', 'HomeValueIndex')
for regressor_name in extra_regressors:
    m.add_regressor(regressor_name)

# Fit on the training window only, so the held-out months stay unseen.
m.fit(ptx_train)

Initial log joint probability = -3.07716
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      99        705.61     0.0232065       263.889           1           1      117   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     199       707.927   0.000140159       62.3721      0.7521      0.7521      246   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     273       709.856   0.000152188       168.898   8.305e-07       0.001      367  LS failed, Hessian reset 
     299       710.118    0.00013056       66.9335      0.2687           1      397   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     327       710.236   0.000153848       125.739   2.383e-06       0.001      473  LS failed, Hessian reset 
     350       710.277   0.000114505       153.862   1.579e-06       0.001      534  LS failed, Hessian rese

<fbprophet.forecaster.Prophet at 0x7fb95ecf4090>

In [15]:
# Predict the held-out validation months from their dates and regressor values.
res_df = m.predict(df=ptx_val)
res_df.columns

Index(['ds', 'trend', 'yhat_lower', 'yhat_upper', 'trend_lower', 'trend_upper',
       'HomeValueIndex', 'HomeValueIndex_lower', 'HomeValueIndex_upper',
       'additive_terms', 'additive_terms_lower', 'additive_terms_upper',
       'extra_regressors_additive', 'extra_regressors_additive_lower',
       'extra_regressors_additive_upper', 'unemploy_rate',
       'unemploy_rate_lower', 'unemploy_rate_upper', 'yearly', 'yearly_lower',
       'yearly_upper', 'multiplicative_terms', 'multiplicative_terms_lower',
       'multiplicative_terms_upper', 'yhat'],
      dtype='object')

In [16]:
# Score the validation predictions against the held-out poverty rates.
forecast_accuracy(forecast=res_df['yhat'], actual=y_val_true)

{'mape': 0.01835316646448042,
 'mae': 0.2729525886039663,
 'rmse': 0.3541014316404413,
 'corr': 0.842617074626346}

### Retrain on the full history (train + validation)

In [17]:
# Put the full Texas history into the "ds" (timestamp) / "y" (target) layout.
# drop=True discards the old row labels instead of inserting them as an
# "index" column. Nothing downstream reads that column, and without drop=True
# this cell is not re-runnable: a second run adds "level_0" and a third
# raises ValueError.
tx_data = tx_data.reset_index(drop=True)
tx_data = tx_data.rename(columns={'Date':"ds",'Poverty_Rate':'y'})

In [18]:
# Final model: same configuration as the validation run (default Prophet plus
# the two extra regressors), refit on all observed history.
m = Prophet()
final_regressors = ('unemploy_rate', 'HomeValueIndex')
for regressor_name in final_regressors:
    m.add_regressor(regressor_name)

# Fit on train + validation rows together.
m.fit(tx_data)

Initial log joint probability = -3.67016
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      99       784.538     0.0137682       316.397           1           1      117   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     199       789.964    0.00273952       257.551           1           1      240   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     254       790.992   8.85015e-05       125.916   1.006e-06       0.001      347  LS failed, Hessian reset 
     299       791.046   1.96266e-05       76.7317           1           1      411   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     342       791.737    0.00011129       143.471   6.072e-07       0.001      533  LS failed, Hessian reset 
     399       792.449   3.75785e-06       68.9874      0.3392           1      603   
    Iter      log pro

<fbprophet.forecaster.Prophet at 0x7fb95ca92890>

      alpha0  # evals  Notes 
     461       792.612   0.000272461       83.4874   3.958e-06       0.001      720  LS failed, Hessian reset 
     499       792.698   5.54706e-05       64.8697      0.3889      0.3889      764   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     523       792.706   3.77279e-05       77.3488   6.146e-07       0.001      828  LS failed, Hessian reset 
     553       792.709   1.68901e-08       58.1467      0.3354           1      870   
Optimization terminated normally: 
  Convergence detected: relative gradient magnitude is below tolerance


In [19]:
# Rename the forecast-horizon rows to the "ds" / "y" layout, then predict
# the poverty rate for each of them.
prophet_names = {'Date':"ds",'Poverty_Rate':'y'}
future_df = future_df.rename(columns=prophet_names)
futures = m.predict(df=future_df)
(futures.shape, futures.columns)

((32, 25),
 Index(['ds', 'trend', 'yhat_lower', 'yhat_upper', 'trend_lower', 'trend_upper',
        'HomeValueIndex', 'HomeValueIndex_lower', 'HomeValueIndex_upper',
        'additive_terms', 'additive_terms_lower', 'additive_terms_upper',
        'extra_regressors_additive', 'extra_regressors_additive_lower',
        'extra_regressors_additive_upper', 'unemploy_rate',
        'unemploy_rate_lower', 'unemploy_rate_upper', 'yearly', 'yearly_lower',
        'yearly_upper', 'multiplicative_terms', 'multiplicative_terms_lower',
        'multiplicative_terms_upper', 'yhat'],
       dtype='object'))

In [20]:
# First and last timestamp covered by the forecast.
forecast_dates = futures['ds']
(forecast_dates.min(), forecast_dates.max())

(Timestamp('2019-02-28 00:00:00'), Timestamp('2021-09-30 00:00:00'))

In [21]:
# Attach the predicted value to each future row (the frames share only the
# "ds" column, so that is the join key), drop the empty observed target and
# publish the prediction under the original Poverty_Rate name.
result_df = (
    future_df
    .merge(futures[['ds', 'yhat']], how="left")
    .drop(columns=['y'])
    .rename(columns={'yhat': 'Poverty_Rate'})
)
(result_df.shape, result_df.columns)

((32, 5),
 Index(['State', 'ds', 'HomeValueIndex', 'unemploy_rate', 'Poverty_Rate'], dtype='object'))

In [22]:
# Write the forecast to the working directory, without the row index.
result_df.to_csv(path_or_buf="poverty_rate_forecast.csv", index=False)

In [23]:
from IPython.display import FileLinks
# Render download links for the files in the working directory.
FileLinks(path=".")