## Employment Claims and Housing Prices

In [1]:
# Load up necessary packagaes
import pandas as pd
import numpy as np

from scipy.stats import ttest_ind
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

## Housing Price Data
Load Housing Data from file and pre-process it

In [2]:
raw_zHousing_df = pd.read_csv('State_Zhvi_AllHomes.csv')

In [3]:
zHousing_df = raw_zHousing_df.drop(['RegionID','RegionType','StateName', 'SizeRank'], axis = 1)
zHousing_df = zHousing_df.set_index(['RegionName'])
zHousing_df = zHousing_df.T
zHousing_df['Date'] = pd.to_datetime(zHousing_df.index)
#zHousing_df['month'] = zHousing_df['Date'].dt.month
zHousing_df['year'] = zHousing_df['Date'].dt.year
zHousing_df['week'] = zHousing_df['Date'].dt.week
zHousing_df = zHousing_df.set_index(['year','week'])

zHousing_df = zHousing_df.drop(['Date'], axis=1)
#zHousing_df.head()

Load data for individual counties

In [4]:
raw_cityHousing_df = pd.read_csv('City_Zhvi_AllHomes.csv')

In [5]:
tmp_countyHP_df = raw_cityHousing_df.drop(['RegionID','RegionType','RegionName', 'SizeRank','Metro','State'], axis = 1)

# get full state name from state housing price datafile
state_abbrvMap = raw_zHousing_df[['StateName','RegionName']]
countyHP_df = pd.merge(tmp_countyHP_df, state_abbrvMap, how='left', left_on='StateName', right_on='StateName')
countyHP_df = countyHP_df.drop(['StateName'], axis=1)

In [6]:
def get_CountyPrices(State):
    
    df = countyHP_df[countyHP_df['RegionName'] == State]
    df = df.drop(['RegionName'], axis=1)
    
    df = df.set_index(['CountyName'])
    df = df.T
    df['Date'] = pd.to_datetime(df.index)
    df['week'] = df['Date'].dt.week
    df['year'] = df['Date'].dt.year
    df = df.set_index(['year','week'])
    
    df = df.drop(['Date'], axis=1)
    
    df.fillna(method='ffill', inplace=True)
    df.fillna(0, inplace=True)
    
    return df

#get_CountyPrices('Michigan')

## Insurance Claims Data

Pre-process the insurance claims data

In [9]:
raw_Uinsurance_df = pd.read_csv('r539cy-master.xls.csv')

In [10]:
Uinsurance_df = raw_Uinsurance_df
Uinsurance_df['Date'] = pd.to_datetime(raw_Uinsurance_df['Filed week ended'])
#Uinsurance_df['month'] = Uinsurance_df['Date'].dt.month
Uinsurance_df['year'] = Uinsurance_df['Date'].dt.year
Uinsurance_df['week'] = Uinsurance_df['Date'].dt.week

Uinsurance_df = Uinsurance_df.drop(['Date'], axis=1)
# drop the records before 1996, to match the data  that is available from Zillow
Uinsurance_df = Uinsurance_df[Uinsurance_df['year'] > 1995]

Uinsurance_df['Initial Claims'] = Uinsurance_df['Initial Claims'].str.replace(',', '').astype(int)
Uinsurance_df['Continued Claims'] = Uinsurance_df['Continued Claims'].str.replace(',', '').astype(int)
Uinsurance_df['Covered Employment'] = Uinsurance_df['Covered Employment'].str.replace(',', '').astype(int)

#Uinsurance_df.head()

Transform unemployment insurance data into format that can be merged with the housing data

In [11]:
ui_initclaims_df = pd.pivot_table(Uinsurance_df, values='Initial Claims', index=['year', 'week'], columns=['State'], aggfunc=np.sum)
ui_contclaims_df = pd.pivot_table(Uinsurance_df, values='Continued Claims', index=['year', 'week'], columns=['State'], aggfunc=np.sum)
ui_rate_df = pd.pivot_table(Uinsurance_df, values='Insured Unemployment Rate', index=['year', 'week'], columns=['State'], aggfunc=np.mean)
ui_covEmp_df = pd.pivot_table(Uinsurance_df, values='Covered Employment', index=['year', 'week'], columns=['State'], aggfunc=np.sum)

### Combine Data
Combine UI and Housing Price data for each State into DataFrame and Engineer training featues

In [25]:
def get_State_df (stateName):    
    
    tmp_sHousing_df = zHousing_df[stateName]
    
    df = pd.DataFrame([])
    df['Initial Claims'] = ui_initclaims_df[stateName]
    df['Continued Claims'] = ui_contclaims_df[stateName]
    df['Covered Employment'] = ui_covEmp_df[stateName]
    df['Insured Unemployment Rate'] = ui_rate_df[stateName]
    
    df = pd.merge(df, tmp_sHousing_df, how='left', left_index=True, right_index=True)
    #df = pd.merge(df, tmp_sHousing_df, how='inner', left_index=True, right_index=True)
    df[stateName].fillna(method='ffill', inplace=True)
    df.fillna(0, inplace=True)
    
    df['housingTarget'] = round(df[stateName].astype(int) / 1000, 1)
    
    # Marginally better result with county data
    #county_df = get_CountyPrices(stateName)
    #df = pd.merge(df, county_df, how='left', left_index=True, right_index=True)
    
    tmp_arr = []
    if stateName in df.columns:
        df.sort_index(ascending=True, inplace=True)
        df = df.reset_index()

        df.fillna(0, inplace=True)        
        df['PrevHousePrc'] = df['housingTarget'].rolling(2, min_periods=1).sum() - df['housingTarget']
        df['MvngAvgPrc15'] = round(df['PrevHousePrc'].rolling(10).mean())
        df['MvngAvgPrc30'] = round(df['PrevHousePrc'].rolling(15).mean())    
        #df['PercentChnge'] = round((df['housingTarget'] - df['PrevHousePrc']) / (df['PrevHousePrc'] + .1), 4) * 1000
        
        df = df.drop([stateName], axis=1)
        
        
    df.fillna(method='ffill', inplace=True)
    df.fillna(0, inplace=True)
    
    df = df.iloc[1:]  
    test_df = df.tail(1)
    df.drop(df.tail(1).index, inplace=True)
    
    
    return df, test_df

In [26]:
df, test_df = get_State_df ('Michigan')
test_df.head()

Unnamed: 0,year,week,Initial Claims,Continued Claims,Covered Employment,Insured Unemployment Rate,housingTarget,PrevHousePrc,MvngAvgPrc15,MvngAvgPrc30
1264,2020,15,222207,749011,4305711,17.4,177.0,177.0,176.0,175.0


#### Correlation Test Function
Test how strongly the values are co-related

In [16]:
def get_State_Prop (df):
    
    fin_r = stats.ttest_ind(df['Initial Claims'], df['housingTarget'], equal_var=False)
    return fin_r[0], fin_r[1], len(df['housingTarget'].unique())

### Train GBT Model

In [17]:
def train_StateModels():

    state_Results = []    
    offset = 1
    
    states = zHousing_df.columns
    #states = ['Alaska','Michigan']
    
    for state in states:

        df, f_test = get_State_df(state)
        
        # features
        X = df[['Initial Claims','Continued Claims','Insured Unemployment Rate','Covered Employment','PrevHousePrc']]
        # target
        y = df[['housingTarget']]
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

        #clf = DecisionTreeRegressor(learning_rate = 0.015, random_state = 42, max_depth = 2, n_estimators = 110)
        clf = DecisionTreeRegressor(random_state = 42, max_depth = 4)
        clf.fit(X_train, y_train)

        y_predicted = clf.predict(f_test[['Initial Claims','Continued Claims','Insured Unemployment Rate','Covered Employment','PrevHousePrc']]) 

        #statistic, pvalue, unique_targets = get_State_Prop(X, y)
        state_Results.append([state, clf.score(X_train, y_train), clf.score(X_test, y_test), 
                              #statistic, pvalue, unique_targets, 
                              y_predicted[0], f_test['housingTarget'].to_numpy()[0]])
        
    return state_Results

#train_StateModels()

#### Feature Assessment for GBDT
Call the grid search to identify which parameters work be

## MLP Regressor

In [18]:
# MLP CLassifier Model
def train_StateMLPMdls():

    state_Results = []
    factor = 1 #5000
    
    states = sorted(zHousing_df.columns)
    #states = ['Alaska','Michigan']
    
    for state in states:
        
        df, f_test = get_State_df(state)

        X = df.drop(['housingTarget'], axis=1)
        y = df[['housingTarget']]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

        scaler = MinMaxScaler()

        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        X_ftest_scaled = scaler.transform(X.iloc[-1:])
                               
        mlp_clf = MLPRegressor(hidden_layer_sizes = [100, 100], solver='lbfgs', alpha=0.15, random_state=42).fit(X_train_scaled, y_train)                
        y_predicted = mlp_clf.predict(X_ftest_scaled) 

        #statistic, pvalue, unique_targets = get_State_Prop(X, y)
        state_Results.append([state, mlp_clf.score(X_train_scaled, y_train), mlp_clf.score(X_test_scaled, y_test), 
                              #statistic, pvalue, unique_targets, 
                              y_predicted[0], f_test['housingTarget'].to_numpy()[0]])
        
    return state_Results


In [27]:
def save_results(model):

    if 'mlp' in model:
        State_results_df = pd.DataFrame(train_StateMLPMdls())
        #State_results_df.columns = ['State','Train Score', 'Test Score', 'Statistic', 'Pvalue', 'Unique Targets','Predicted', 'Actual']
        State_results_df.to_csv('stateResults_mlp.csv', index=False)
    else: 
        State_results_df = pd.DataFrame(train_StateModels())
        #State_results_df.sort_values(by=['Test Score'], ascending=False, inplace=True)  
        #State_results_df.columns = ['State','Train Score', 'Test Score', 'Statistic', 'Pvalue', 'Unique Targets','Predicted', 'Actual']
        State_results_df.to_csv('stateResults_gbc.csv', index=False)
    
    return State_results_df[2].to_numpy().mean()
    
save_results('mlp')

0.9961994964809477

# Sources

GDP
#https://apps.bea.gov/regional/Downloadzip.cfm
#https://apps.bea.gov/itable/iTable.cfm?ReqID=70&step=1

Unemployment
#https://oui.doleta.gov/unemploy/claims_arch.asp