### Wesley Janson and Drew Keller
## STAT 27420 Final Project
# Modeling Code

In [1]:
# Load in relevant packages

import pandas as pd
from statsmodels.miscmodels.ordinal_model import OrderedModel
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
from data_utils import read_data, prep_features
import numpy as np

# Relative path of the local CSV copy of the data (Drew's directory layout);
# it is the target of the one-off data.to_csv(...) call and the argument to read_data(...).
DATA_PATH = '../paper_replication_data/new_data.csv'

In [8]:
# NOTE(review): this import lives outside the top import cell. Presumably importing
# load_data is what triggers the slow online download mentioned below — confirm.
# The `data` name imported here is rebound later by data = read_data(DATA_PATH).
from load_data import data, categorical_vars, cts_vars, other_vars

# Loading the data from online takes ~20 seconds.
# To speed this up, save the data locally once and load it from there afterwards:

#data.to_csv(DATA_PATH,index=False)  # run this once



In [11]:
# Display the categorical variable names imported from load_data.
categorical_vars

['first_interview',
 'personal_finances_yr_ago',
 'price_related_yr_ago',
 'personal_finances_5yr_ago',
 'personal_finances_next_yr',
 'personal_finances_next_5yr',
 'income_change_next_yr',
 'real_income_expectations',
 'conditions_yr_ago',
 'conditions_next_yr',
 'unemployment_next_yr',
 'govt_policy_efficacy',
 'interest_rates_next_yr',
 'price_change_next_yr',
 'price_change_next_5yr',
 'durable_purchase',
 'car_purchase',
 'income_quintile',
 'region',
 'sex',
 'education',
 'vehicle_ownership',
 'treatment_pctile',
 'treatment_bins']

In [4]:
# Load the locally saved CSV. Use read_data rather than pd.read_csv because
# (per the original note) read_data handles the column types.
# This rebinds the `data` name that was imported from load_data earlier.
data = read_data(DATA_PATH)

# categorical_vars and cts_vars are lists of the variables in each category.
# other_vars are ID and date variables
# (categorical_vars + cts_vars + other_vars = all vars).
# These notes are `#` comments rather than a bare string literal so the cell
# does not echo the text back as its output.

' Categorical_vars and cts_vars are lists of vars in each category.\nOther_vars are ID and date variables (categorical_vars + cts_vars + other_vars = all vars)'

In [3]:
# Tabulate how many observations fall in each treatment bin (NaN counted too)
# to see how balanced the bins are.
data["treatment_bins"].value_counts(dropna=False)

0-5      210475
5-10      47431
NaN       24168
10-15     11780
20+        5376
15-20      4984
Name: treatment_bins, dtype: int64

In [4]:
# Tabulate the raw durable_purchase responses (NaN counted too)
# to see how balanced the outcome categories are.
data["durable_purchase"].value_counts(dropna=False)

Good          204553
Bad            71471
Neutral        12945
Don't know     12599
Refused         2646
Name: durable_purchase, dtype: int64

In [4]:
# Restrict to rows whose treatment bin is missing and tabulate their
# price_change_amt_next_yr values, to see whether both fields are missing together.
data.loc[data["treatment_bins"].isna(), "price_change_amt_next_yr"].value_counts(dropna=False)

NaN    24168
Name: price_change_amt_next_yr, dtype: int64

In [5]:
# Build the modeling frame plus the treatment / confounder column lists.
# regression=True is the setting to use for regression models.
(
    data_regression,
    treatment_vars,
    confounder_vars,
) = prep_features(data, regression=True)

Excluding 15245 observations that did not answer durable purchase question.
Excluding 21490 observations that did not answer price change amount question.


In [10]:
# Tabulate the durable_purchase outcome in the prepared frame (NaN counted too)
# to see how balanced the outcome categories are after prep_features.
data_regression["durable_purchase"].value_counts(dropna=False)

 1.0    191595
-1.0     64499
 0.0     11385
Name: durable_purchase, dtype: int64

In [6]:
# Share of rows that have no missing value in any confounder column.
data_regression.dropna(subset=confounder_vars).shape[0] / data_regression.shape[0]

0.7146280642592503

In [7]:
# Keep only rows with every confounder present (drop a row if any confounder is NaN).
data_regression = data_regression.dropna(axis=0, how='any', subset=confounder_vars)

In [8]:
# First model: ordered probit, the same estimator as Bachmann et al.
# Regressors are the confounders followed by every column in treatment_vars.
mod_prob = OrderedModel(
    endog=data_regression['durable_purchase'],
    exog=data_regression[confounder_vars + treatment_vars],
    distr='probit',
)

# Fit by maximum likelihood with BFGS and show the coefficient table.
res_prob = mod_prob.fit(method='bfgs')
res_prob.summary()

Optimization terminated successfully.
         Current function value: 0.650917
         Iterations: 114
         Function evaluations: 115
         Gradient evaluations: 115


0,1,2,3
Dep. Variable:,durable_purchase,Log-Likelihood:,-124420.0
Model:,OrderedModel,AIC:,248900.0
Method:,Maximum Likelihood,BIC:,249300.0
Date:,"Wed, 30 Nov 2022",,
Time:,21:06:31,,
No. Observations:,191148,,
Df Residuals:,191112,,
Df Model:,36,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
fed_funds_rate,0.2373,0.005,50.506,0.000,0.228,0.247
unemployment_rate,-0.1566,0.003,-48.724,0.000,-0.163,-0.150
cpi_1mo_lag,-0.1763,0.007,-26.770,0.000,-0.189,-0.163
cpi_durable_1mo_lag,-0.1506,0.005,-29.500,0.000,-0.161,-0.141
personal_finances_next_yr_Don't know,-0.1997,0.024,-8.278,0.000,-0.247,-0.152
personal_finances_next_yr_Refused,-0.1907,0.058,-3.304,0.001,-0.304,-0.078
personal_finances_next_yr_Same,-0.0524,0.007,-7.347,0.000,-0.066,-0.038
personal_finances_next_yr_Worse,-0.1756,0.011,-15.987,0.000,-0.197,-0.154
income_change_amt_next_yr,-0.0308,0.003,-9.162,0.000,-0.037,-0.024


In [9]:
# Alternative specification of the ordered probit: same outcome and confounders,
# but the only treatment regressor is price_change_amt_next_yr.
# Note: this rebinds mod_prob / res_prob, replacing the fit from the previous cell.
mod_prob = OrderedModel(
    endog=data_regression['durable_purchase'],
    exog=data_regression[confounder_vars + ["price_change_amt_next_yr"]],
    distr='probit',
)

# Fit by maximum likelihood with BFGS and show the coefficient table.
res_prob = mod_prob.fit(method='bfgs')
res_prob.summary()

Optimization terminated successfully.
         Current function value: 0.650927
         Iterations: 94
         Function evaluations: 95
         Gradient evaluations: 95


0,1,2,3
Dep. Variable:,durable_purchase,Log-Likelihood:,-124420.0
Model:,OrderedModel,AIC:,248900.0
Method:,Maximum Likelihood,BIC:,249200.0
Date:,"Wed, 30 Nov 2022",,
Time:,21:08:59,,
No. Observations:,191148,,
Df Residuals:,191115,,
Df Model:,33,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
fed_funds_rate,0.2372,0.005,50.494,0.000,0.228,0.246
unemployment_rate,-0.1572,0.003,-48.943,0.000,-0.163,-0.151
cpi_1mo_lag,-0.1776,0.007,-27.066,0.000,-0.190,-0.165
cpi_durable_1mo_lag,-0.1507,0.005,-29.516,0.000,-0.161,-0.141
personal_finances_next_yr_Don't know,-0.2000,0.024,-8.293,0.000,-0.247,-0.153
personal_finances_next_yr_Refused,-0.1871,0.058,-3.241,0.001,-0.300,-0.074
personal_finances_next_yr_Same,-0.0520,0.007,-7.299,0.000,-0.066,-0.038
personal_finances_next_yr_Worse,-0.1755,0.011,-15.974,0.000,-0.197,-0.154
income_change_amt_next_yr,-0.0308,0.003,-9.158,0.000,-0.037,-0.024


In [None]:
# TODO: next model class after the ordered probits - some sort of XGBoost?