### OLS 
The purpose of this script is to run OLS on age at death in the HRS study. \
This is to have a baseline in which to judge the prediction capability of all the machine learning methods we will try. Normally fixed or random effects would be used here to deal with the longitudinal format. This script is only here to provide an initial baseline model to compare against.

In [1]:
# Show all columns.
options(repr.matrix.max.cols=150, repr.matrix.max.rows=200)
library(plm)

df = read.csv('le_111422.csv')
dim(df)
head(df)

"package 'plm' was built under R version 3.6.3"

hhidpn,wave,mstat,cendiv,gender,rahispan,raracem,iwbeg,dage_m,dage_y,raedyrs,rarelig,ravetrn,agey_m,shlt,shltc,depres,effort,sleepr,cesd,bmi,smokev,smoken,drinkn,hibp,diab,cancr,lung,heart,strok,psych,arthr,conde,cogtot,slfmem,pstmem,spcfac,hsptim,puff,puffpos,timwlk,hatotb,iearn,isret,covs,hiltc,lbrf,logiearn,logisret,loghspti,loghatotb,id,nt,n2
3010,3,1.married,9.pacific,1.male,0.not hispanic,1.white/caucasian,13345,931,77,12,1.protestant,0.no,60,3.good,-1,1.yes,1.yes,1.yes,3,28.0,0.no,0.no,1,0.no,0.no,0.no,0.no,1.yes,0.no,0.no,0.no,1,35,4.fair,2.same,0.no,0,,,,490500,4000,0,1.yes,1.yes,4.partly retired,8.294049,0.0,0,13.10318,2,123702,7
3010,6,1.married,9.pacific,1.male,0.not hispanic,1.white/caucasian,15445,931,77,12,1.protestant,0.no,66,3.good,-1,0.no,0.no,0.no,1,28.3,0.no,0.no,0,0.no,0.no,0.no,0.no,1.yes,0.no,0.no,0.no,1,31,3.good,2.same,0.no,0,,,,704000,10000,13728,0.no,1.yes,4.partly retired,9.21034,9.527193,0,13.46453,3,123702,7
3010,7,1.married,9.pacific,1.male,0.not hispanic,1.white/caucasian,16267,931,77,12,1.protestant,0.no,68,3.good,0,0.no,0.no,0.no,0,26.6,0.no,0.no,0,0.no,0.no,0.no,0.no,1.yes,0.no,0.no,0.no,1,18,3.good,2.same,0.no,0,,,,756000,6000,15600,0.no,1.yes,4.partly retired,8.699514,9.655026,0,13.5358,4,123702,7
3010,8,1.married,9.pacific,1.male,0.not hispanic,1.white/caucasian,16875,931,77,12,1.protestant,0.no,70,3.good,0,0.no,0.no,0.no,0,27.1,0.no,0.no,0,0.no,0.no,0.no,0.no,1.yes,0.no,0.no,0.no,1,20,4.fair,2.same,0.no,0,490,1.standing,2.78,914000,0,14040,0.no,1.yes,5.retired,0.0,9.549665,0,13.72559,5,123702,7
3010,9,1.married,9.pacific,1.male,0.not hispanic,1.white/caucasian,17577,931,77,12,1.protestant,0.no,72,3.good,0,0.no,0.no,0.no,0,24.7,0.no,0.no,1,0.no,0.no,0.no,0.no,1.yes,0.no,0.no,0.no,1,26,3.good,2.same,0.no,1,.a,.a=r not asked phys meas this wave,.a,1156000,0,15600,0.no,1.yes,5.retired,0.0,9.655026,0,13.96048,6,123702,7
3010,10,1.married,9.pacific,1.male,0.not hispanic,1.white/caucasian,18520,931,77,12,1.protestant,0.no,74,3.good,0,0.no,0.no,0.no,0,24.0,0.no,0.no,0,0.no,0.no,0.no,0.no,1.yes,0.no,0.no,0.no,1,17,4.fair,3.worse,0.no,0,330,1.standing,3.0999999,1240000,0,16644,0.no,1.yes,5.retired,0.0,9.719805,0,14.03062,7,123702,7


In [2]:
# Specify data as numeric or factor
# Doesn't like dents
df = na.omit(df)
names_factor = c('gender', 'raracem', 'rahispan', 'mstat', 'shlt', 
                 'depres', 'cendiv' , 'effort', 
                 'sleepr', 'arthr','heart', 'strok', 'psych', 'cancr', 'wave',
                  'slfmem', 'covs',
                  'lbrf', 'smokev',  'hiltc',
                  'spcfac', 'rarelig','ravetrn')
df[,names_factor] = lapply(df[,names_factor], factor)
names_numeric = c('iearn', 'logiearn', 'bmi', 'cogtot','drinkn',
                   'dage_y','conde', 'raedyrs',
                  'logisret', 'loghatotb','loghspti')
df[,names_numeric] = lapply(df[,names_numeric], as.numeric)
dim(df)

In [3]:
clean_var = df$hiltc
df = df[clean_var != '.m=oth missing' ,]

In [4]:
# Drop timwlk, puff, and puffpos.
df = subset(df, select = -c(timwlk,puff, puffpos,dage_m, hhidpn, iwbeg, cesd,hsptim, hatotb, iearn, isret) )

In [5]:
# Split data
n = dim(df)[1]
training_ratio = .8
train_size = sort(sample(1:n, training_ratio*n))
train = na.omit(df[train_size,])
test = na.omit(df[-train_size,])

In [6]:
# Perform OLS
ols = lm(dage_y ~ ., data = train)
ols_pred = predict(ols, test)
ols_rmse = sqrt(mean((ols_pred - df$dage_y)^2))
models_rmse = data.frame(OLS = ols_rmse) 
models_rmse 

"longer object length is not a multiple of shorter object length"

OLS
11.86039


In [16]:
# Perform OLS
ols1 = lm(dage_y ~ cogtot+ gender + raracem + rahispan + agey_m*agey_m + agey_m +logiearn +
                         shlt + shltc*raedyrs +raedyrs +mstat +
                         raedyrs*bmi + hibp*cogtot+hibp*smoken + hibp*drinkn + hibp*pstmem + 
                         + depres + cendiv + effort + bmi*bmi
                        +  sleepr + bmi + smoken + drinkn + arthr + heart + strok + psych + cancr
                         + diab  + lung + slfmem + pstmem  + covs
                          + lbrf + raedyrs + smokev + hibp + conde 
                          + hiltc  + spcfac + loghspti + rarelig + ravetrn  
                          + loghatotb + loghspti , data = train)
ols_pred = predict(ols1, test)
ols_rmse = sqrt(mean((ols_pred - df$dage_y)^2))
models_rmse['LM Interactions'] = ols_rmse
models_rmse 

"longer object length is not a multiple of shorter object length"

OLS,LM Interactions
11.86039,10.89913


In [23]:
summary(ols)


Call:
lm(formula = dage_y ~ ., data = train)

Residuals:
     Min       1Q   Median       3Q      Max 
-25.5914  -1.7422   0.4955   3.1534  11.5225 

Coefficients: (2 not defined because of singularities)
                                       Estimate Std. Error t value Pr(>|t|)    
(Intercept)                          -3.434e+01  2.322e+00 -14.790  < 2e-16 ***
wave4                                -4.995e-01  3.235e-01  -1.544  0.12262    
wave5                                -2.148e+00  3.016e-01  -7.122 1.13e-12 ***
wave6                                -4.109e+00  2.959e-01 -13.886  < 2e-16 ***
wave7                                -6.158e+00  2.882e-01 -21.366  < 2e-16 ***
wave8                                -8.137e+00  2.795e-01 -29.112  < 2e-16 ***
wave9                                -9.672e+00  2.724e-01 -35.506  < 2e-16 ***
wave10                               -1.180e+01  2.924e-01 -40.349  < 2e-16 ***
wave11                               -1.314e+01  3.035e-01 -43.287  < 2e-1