### Wesley Janson and Drew Keller
## STAT 27420 Final Project
# Modeling Code

In [2]:
# Load in relevant packages

import pandas as pd
from statsmodels.miscmodels.ordinal_model import OrderedModel
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
from data_utils import read_data, prep_features
import numpy as np
import os

# set random seed for numpy
RANDOM_SEED=69
np.random.seed(RANDOM_SEED)


DATA_PATH = '../paper_replication_data/new_data.csv'  # Drew's path

In [3]:
from load_data import data, categorical_vars, cts_vars, other_vars

# loading data from online takes ~20 seconds
# to speed up, save data locally and load from there:

data.to_csv(DATA_PATH,index=False)  # run this once

Excluding 0 observations that did not answer 1 year price change question.


In [4]:
# Categorical_vars and cts_vars are lists of vars in each category.
# Other_vars are ID and date variables (categorical_vars + cts_vars + other_vars = all vars)

data = read_data(DATA_PATH)  # use this over pd.read_csv, because this handles types

' Categorical_vars and cts_vars are lists of vars in each category.\nOther_vars are ID and date variables (categorical_vars + cts_vars + other_vars = all vars)'

In [4]:
data.treatment_bins.value_counts(dropna=False)  # check that we have a balanced dataset

0-5      210475
5-10      47431
NaN       24168
10-15     11780
20+        5376
15-20      4984
Name: treatment_bins, dtype: int64

In [5]:
data.durable_purchase.value_counts(dropna=False)  # check that we have a balanced dataset

Good          204553
Bad            71471
Neutral        12945
Don't know     12599
Refused         2646
Name: durable_purchase, dtype: int64

In [5]:
data[data.treatment_bins.isnull()].price_change_amt_next_yr.value_counts(dropna=False)

NaN    24168
Name: price_change_amt_next_yr, dtype: int64

In [6]:
# prep features for modeling; use regression=True for regression models
data_regression, treatment_vars, confounder_vars = prep_features(data,regression=True)  

Excluding 15245 observationsthat did not answer durable purchase question.
Excluding 102089 observations that did not answerconfounder questions.
Excluding 10354 observationsthat did not answer price change amount question.


In [7]:
data_regression.durable_purchase.value_counts(dropna=False)  # check that we have a balanced dataset

2    191595
0     64499
1     11385
Name: durable_purchase, dtype: int64

In [9]:
data_regression = data_regression.dropna(subset=confounder_vars)

In [10]:
# First model, ordered probit-same as Bachmann et al.
mod_prob = OrderedModel(data_regression['durable_purchase'],
                        data_regression[confounder_vars+["price_change_amt_next_yr"]],
                        distr='probit')

res_prob = mod_prob.fit(method='bfgs')
res_prob.summary()

Optimization terminated successfully.
         Current function value: 0.650848
         Iterations: 91
         Function evaluations: 92
         Gradient evaluations: 92


0,1,2,3
Dep. Variable:,durable_purchase,Log-Likelihood:,-124410.0
Model:,OrderedModel,AIC:,248900.0
Method:,Maximum Likelihood,BIC:,249300.0
Date:,"Fri, 02 Dec 2022",,
Time:,17:50:18,,
No. Observations:,191148,,
Df Residuals:,191111,,
Df Model:,37,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
fed_funds_rate,0.2196,0.006,37.726,0.000,0.208,0.231
unemployment_rate,-0.1462,0.004,-38.508,0.000,-0.154,-0.139
cpi_1mo_lag,-0.1783,0.007,-27.010,0.000,-0.191,-0.165
cpi_durable_1mo_lag,-0.1421,0.005,-26.449,0.000,-0.153,-0.132
personal_finances_next_yr_Don't know,-0.1994,0.024,-8.266,0.000,-0.247,-0.152
personal_finances_next_yr_Refused,-0.1898,0.058,-3.286,0.001,-0.303,-0.077
personal_finances_next_yr_Same,-0.0517,0.007,-7.248,0.000,-0.066,-0.038
personal_finances_next_yr_Worse,-0.1748,0.011,-15.909,0.000,-0.196,-0.153
income_change_amt_next_yr,-0.0311,0.003,-9.241,0.000,-0.038,-0.024


In [11]:
# model 1b: ordered probit (same as Bachmann et al.) using binned treatment
mod_prob = OrderedModel(data_regression['durable_purchase'],
                        data_regression[confounder_vars+treatment_vars],
                        distr='probit')

res_prob = mod_prob.fit(method='bfgs')
res_prob.summary()

Optimization terminated successfully.
         Current function value: 0.650858
         Iterations: 91
         Function evaluations: 92
         Gradient evaluations: 92


0,1,2,3
Dep. Variable:,durable_purchase,Log-Likelihood:,-124410.0
Model:,OrderedModel,AIC:,248900.0
Method:,Maximum Likelihood,BIC:,249200.0
Date:,"Fri, 02 Dec 2022",,
Time:,17:53:19,,
No. Observations:,191148,,
Df Residuals:,191114,,
Df Model:,34,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
fed_funds_rate,0.2194,0.006,37.686,0.000,0.208,0.231
unemployment_rate,-0.1467,0.004,-38.647,0.000,-0.154,-0.139
cpi_1mo_lag,-0.1795,0.007,-27.301,0.000,-0.192,-0.167
cpi_durable_1mo_lag,-0.1421,0.005,-26.461,0.000,-0.153,-0.132
personal_finances_next_yr_Don't know,-0.1995,0.024,-8.270,0.000,-0.247,-0.152
personal_finances_next_yr_Refused,-0.1882,0.058,-3.259,0.001,-0.301,-0.075
personal_finances_next_yr_Same,-0.0513,0.007,-7.198,0.000,-0.065,-0.037
personal_finances_next_yr_Worse,-0.1744,0.011,-15.874,0.000,-0.196,-0.153
income_change_amt_next_yr,-0.0310,0.003,-9.227,0.000,-0.038,-0.024


In [15]:
# Second model - some sort of XGBoost?
X = data_regression[confounder_vars+treatment_vars]
Y = data_regression['durable_purchase']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=7)

model = xgb.XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# predictions = [round(value) for value in y_pred]
# # evaluate predictions
# accuracy = accuracy_score(y_test, predictions)
# print("Accuracy: %.2f%%" % (accuracy * 100.0))

array([2, 2, 0, ..., 2, 2, 2])

## Helper Functions

In [None]:
# Conditional outcome models (Q models)
def make_Q_model():
    ''' A function that returns a general ML q model for later use in k-folding'''
    return xgb.XGBRegressor()

# Propensity score models (g models)
def make_g_model():
    ''' A function that returns a g model for computing propensity scores'''
    return xgb.XGBClassifier()

In [None]:
# Functions for K-fold cross-fitting
def treatment_k_fold_fit_and_predict(make_model, X:pd.DataFrame, A:np.array, n_splits:int):
    '''
    Implements K fold cross-fitting for the model predicting the treatment A. 
    That is, 
    1. Split data into K folds
    2. For each fold j, the model is fit on the other K-1 folds
    3. The fitted model is used to make predictions for each data point in fold j
    Returns an array containing the predictions  

    Args:
    model: function that returns sklearn model (which implements fit and predict_prob)
    X: dataframe of variables to adjust for
    A: array of treatments
    n_splits: number of splits to use
    '''

    predictions = np.full_like(A, np.nan, dtype=float)
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)
    
    for train_index, test_index in kf.split(X, A):
        X_train = X.loc[train_index]
        A_train = A.loc[train_index]
        g = make_model()
        g.fit(X_train, A_train)

        # get predictions for split
        predictions[test_index] = g.predict_proba(X.loc[test_index])[:, 1]
    
    # sanity check that overlap holds
    assert np.isnan(predictions).sum() == 0
    return predictions

def outcome_k_fold_fit_and_predict(make_model, X:pd.DataFrame, y:np.array, A:np.array, n_splits:int, output_type:str):
    '''
    Implements K fold cross-fitting for the model predicting the outcome Y. 
    That is, 
    1. Split data into K folds
    2. For each fold j, the model is fit on the other K-1 folds
    3. The fitted model is used to make predictions for each data point in fold j
    Returns two arrays containing the predictions for all units untreated, all units treated  

    Args:
    model: function that returns sklearn model (that implements fit and either predict_prob or predict)
    X: dataframe of variables to adjust for
    y: array of outcomes
    A: array of treatments
    n_splits: number of splits to use
    output_type: type of outcome, "binary" or "continuous"
    '''

    predictions0 = np.full_like(A, np.nan, dtype=float)
    predictions1 = np.full_like(y, np.nan, dtype=float)
    if output_type == 'binary':
        kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)
    elif output_type == 'continuous':
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)

    # include the treatment as input feature
    X_w_treatment = X.copy()
    X_w_treatment["A"] = A

    # for predicting effect under treatment / control status for each data point 
    X0 = X_w_treatment.copy()
    X0["A"] = 0
    X1 = X_w_treatment.copy()
    X1["A"] = 1
    X2 = X_w_treatment.copy()
    X2["A"] = 2
    X3 = X_w_treatment.copy()
    X3["A"] = 3
    X4 = X_w_treatment.copy()
    X4["A"] = 4

    
    for train_index, test_index in kf.split(X_w_treatment, y):
        X_train = X_w_treatment.loc[train_index]
        y_train = y.loc[train_index]
        q = make_model()
        q.fit(X_train, y_train)

        if output_type =='binary':
            predictions0[test_index] = q.predict_proba(X0.loc[test_index])[:, 1]
            predictions1[test_index] = q.predict_proba(X1.loc[test_index])[:, 1]
        elif output_type == 'continuous':
            predictions0[test_index] = q.predict(X0.loc[test_index])
            predictions1[test_index] = q.predict(X1.loc[test_index])
        elif output_type == 'categorical':
            

    assert np.isnan(predictions0).sum() == 0
    assert np.isnan(predictions1).sum() == 0
    return predictions0, predictions1

In [None]:
# Fit single Q() model
# get conditional outcomes
Q0_lm, Q1_lm = outcome_k_fold_fit_and_predict(make_Q_model, X=confounders, y=outcome, A=treatment, \
                                        n_splits=5, output_type="continuous")

# Fir 4 g(x) models
g = treatment_k_fold_fit_and_predict(make_g_model, X=confounders, A=treatment, n_splits=5)