# Notebook 07: Multivariate Time Series with VAR

## Cleaning final data in preparation for modeling

In [1]:
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.vector_ar.var_model import VAR

In [2]:
data = pd.read_csv('./data/time_series/pov_fi_10_yr_c.csv')

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,time,state,pov,fi
0,1.0,1/1/2010,Alabama,18.9,20.379104
1,2.0,1/1/2011,Alabama,18.9,19.292537
2,3.0,1/1/2012,Alabama,21.3,19.497412
3,4.0,1/1/2013,Alabama,23.8,18.569592
4,5.0,1/1/2014,Alabama,20.6,18.800626


In [4]:
data = data.drop(columns = ['Unnamed: 0'])

In [5]:
data.head()

Unnamed: 0,time,state,pov,fi
0,1/1/2010,Alabama,18.9,20.379104
1,1/1/2011,Alabama,18.9,19.292537
2,1/1/2012,Alabama,21.3,19.497412
3,1/1/2013,Alabama,23.8,18.569592
4,1/1/2014,Alabama,20.6,18.800626


In [6]:
data.dtypes

time      object
state     object
pov      float64
fi       float64
dtype: object

In [7]:
data['time'] = pd.to_datetime(data['time'])

In [8]:
# data.set_index('time', inplace=True)

In [9]:
data.head()

Unnamed: 0,time,state,pov,fi
0,2010-01-01,Alabama,18.9,20.379104
1,2011-01-01,Alabama,18.9,19.292537
2,2012-01-01,Alabama,21.3,19.497412
3,2013-01-01,Alabama,23.8,18.569592
4,2014-01-01,Alabama,20.6,18.800626


## Modeling with Var

In [10]:
from statsmodels.tsa.vector_ar.var_model import VAR

In [11]:
# Code and methodology developed with help from Adi, from https://www.analyticsvidhya.com/blog/2018/09/multivariate-time-series-guide-forecasting-modeling-python-codes/ and from https://www.youtube.com/watch?v=sCl6CXZ2xBg

In [12]:
## Creating a full states list:

states = ["Alabama","Alaska","Arizona","Arkansas","California","Colorado",
  "Connecticut","District of Columbia", "Delaware","Florida","Georgia","Hawaii","Idaho","Illinois",
  "Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland",
  "Massachusetts","Michigan","Minnesota","Mississippi","Missouri","Montana",
  "Nebraska","Nevada","New Hampshire","New Jersey","New Mexico","New York",
  "North Carolina","North Dakota","Ohio","Oklahoma","Oregon","Pennsylvania",
  "Rhode Island","South Carolina","South Dakota","Tennessee","Texas","Utah",
  "Vermont","Virginia","Washington","West Virginia","Wisconsin","Wyoming"]

#For loop for all states

for s in states:
    #training data
    train_data = data[data['state'] == s]
    train_data = train_data[['time', 'fi', 'pov']]
    train_data = train_data.drop(train_data.tail(2).index)
    
    
    #valid data
    valid_data = data[data['state'] == s]
    valid_data = valid_data[['time', 'fi', 'pov']]
    valid_data = valid_data.drop(valid_data.head(7).index)
    
    
    #all data
    all_data = data[data['state'] == s]
    all_data = all_data[['time', 'fi', 'pov']]
    
    #Set time column to index
    train_data.set_index('time', inplace=True)
    valid_data.set_index('time', inplace=True)
    all_data.set_index('time', inplace=True)
    
    #Set valid index for 7 years
    index_7_years = pd.date_range(train_data.index[-1], freq = 'AS', periods = 7)
    
    #Future index - 7 years
    future_7_years = pd.date_range(valid_data.index[-1], freq = 'AS', periods = 7)
    
    ## Tricky bit of code in order to basically reset the forecasts from the previous state. 
    # Otherwise, if a state fails to work in the model, the forecast from the previous state is going to be passed in.
    
    #Drop all tables:
#     try:
#         del t_fcast1
#         del f_fcast1
        
#     except:
#         print("")

        
    
##################################################################################################################
################################################ VAR #########################################################
##################################################################################################################

################################################  VAR Validation Phase ###################################################

    model = VAR(endog = train_data, freq='AS-JAN')
    model_fit = model.fit()

    # make prediction on validation
    t_fcast1 = model_fit.forecast(model_fit.y, steps=7)
    t_fcast1 = pd.DataFrame(t_fcast1, index=index_7_years)



################################################  VAR Future Phase #######################################################


    model2 = VAR(endog = all_data, freq='AS-JAN')
    model2_fit = model.fit()
        
    f_fcast1 = model2_fit.forecast(model2_fit.y, steps=7)
        
    f_fcast1 = pd.DataFrame(f_fcast1, index=future_7_years)
        

##################################################################################################################
################################################ Plotting ######################################################
##################################################################################################################

#     fig, ax = plt.subplots(figsize =(15,5))
#     chart = sns.lineplot(x='time', y = s, data = train_data)
#     chart.set_title(s)
#     valid_data.plot(ax=ax, color = 'blue', marker = 'o', legend = True)
    
#     #Plotting val
#     try:
#         t_fcast1.plot(ax=ax, color = 'red', marker = 'o', legend = True)
#     except:
#         print('')

##################################################################################################################
################################################ Saving into DataFrame ##########################################
##################################################################################################################

################################################  DF VAR #######################################################
     
#     try:
#         #Creating df for forecast
    t_fcast1 = t_fcast1.reset_index()
    t_fcast1.columns = ['Year', "VAR FI", 'VAR POV']
    
    f_fcast1 = f_fcast1.reset_index()
    f_fcast1.columns = ['Year', 'VAR Future FI', 'VAR Future Pov']
        
    #Extra Columns
    t_fcast1[['VAR Future FI', 'VAR Future Pov']] = np.nan
    f_fcast1[["VAR FI", 'VAR POV']] = np.nan
        
    #Reordering
    t_fcast1 = t_fcast1[['Year', 'VAR Future FI', 'VAR Future Pov', 'VAR FI', 'VAR POV']]
        
    # Joining them togther
    df_fcast1 = pd.concat([t_fcast1, f_fcast1], axis = 0)
    df_fcast1['State'] = s

################################################ Aggregating and Joining #######################################################

    df_fcast1 = df_fcast1.groupby(['Year', 'State'], as_index = False).agg({'VAR Future FI': 'sum', 'VAR Future Pov': 'sum'})   
        
    all_forecasts = df_fcast1.copy()

    #Save predictions in df. First time this will fail and just give all_forecasts, after that will concat for each state
    try: 
        final_forecasts = pd.concat([final_forecasts, all_forecasts], ignore_index = True).copy()
    except:
        final_forecasts = all_forecasts.copy()


  obj = getattr(results, attr)


## Preparing original data and forecast data for concatenation

In [13]:
data.head()

Unnamed: 0,time,state,pov,fi
0,2010-01-01,Alabama,18.9,20.379104
1,2011-01-01,Alabama,18.9,19.292537
2,2012-01-01,Alabama,21.3,19.497412
3,2013-01-01,Alabama,23.8,18.569592
4,2014-01-01,Alabama,20.6,18.800626


In [15]:
#Renaming Columns
final_forecasts.rename(columns = {'VAR Future FI': 'pov',
                                  'VAR Future Pov': 'fi',
                                  'State': 'state',
                                  'Year': 'year'
                                 }, inplace = True)

data.rename(columns = {'time' : 'year'}, inplace = True)

In [16]:
final_forecasts.head()

Unnamed: 0,year,state,pov,fi
0,2017-01-01,Alabama,0.0,0.0
1,2018-01-01,Alabama,0.0,0.0
2,2019-01-01,Alabama,18.48537,21.869745
3,2020-01-01,Alabama,18.557806,21.571658
4,2021-01-01,Alabama,18.602132,21.542153


In [17]:
#Dropping 0's
final_forecasts['pov'][final_forecasts['pov'] == 0] = np.nan
final_forecasts['fi'][final_forecasts['fi'] == 0] = np.nan

final_forecasts.dropna(inplace = True)

In [20]:
#Dropping 2019 in forecasts as we have it in the original
final_forecasts.drop(final_forecasts[(final_forecasts['year'] == '2019-01-01')].index, inplace=True)

In [24]:
#Concatenating and dropping NA
output_df = pd.concat([data, final_forecasts], axis = 0)

output_df.dropna(inplace=True)

In [25]:
output_df.head()

Unnamed: 0,year,state,pov,fi
0,2010-01-01,Alabama,18.9,20.379104
1,2011-01-01,Alabama,18.9,19.292537
2,2012-01-01,Alabama,21.3,19.497412
3,2013-01-01,Alabama,23.8,18.569592
4,2014-01-01,Alabama,20.6,18.800626
5,2015-01-01,Alabama,19.1,18.952239
6,2016-01-01,Alabama,23.0,18.492537
7,2017-01-01,Alabama,21.6,18.092537
8,2018-01-01,Alabama,24.1,17.155224
9,2019-01-01,Alabama,21.2,17.719403


In [26]:
#Exporting to CSV for use in tableau
# output_df.to_csv('./data/time_series/var_model_preds.csv')