In [1]:
library(haven)
library(dplyr)

# Column names to keep from each yearly survey file.
# The order of this vector is the order in which columns are requested
# when a data frame is subset with it, so it must not be rearranged.
VARS_OF_INTEREST <- c(
    "AF66",
    "AJ32",
    "AF79",
    "AF86",
    "AF80",
    "AB17",
    "AB34",
    "AG10",
    "AG11",
    "AG22",
    "AH33NEW",
    "AH34NEW",
    "AH35NEW",
    "AK25",
    "AK4",
    "AKWKLNG",
    "AM19",
    "AM20",
    "AM21",
    "BMI_P",
    "CITIZEN2",
    "DIABETES",
    "DSTRS12",
    "DSTRS30",
    "OMBSRR_P1",
    "RACECN_P1",
    "RACEDF_P1",
    "RACEHP2_P1",
    "SRAGE_P1",
    "SRSEX",
    "AA5C",
    "MARIT",
    "AB1",
    "AE15A",
    "AD32_P1",
    "OVRWT",
    "DISTRESS",
    "DSTRS_P1",
    "AH44",
    "AH43A",
    "AL22",
    "AK28",
    "AH130",
    "AH85B",
    "AK22_P1",
    "AE_FRUIT",
    "AE_VEGI",
    "UR_RHP",
    "UR_CLRT6",
    "AE15",
    "AJ29",
    "AJ30",
    "AJ31",
    "AJ33",
    "AJ34",
    "AHEDC_P1",
    "UR_BG6",
    "YEAR",       # added by me to have the year of each observation
    "YEAR_NAME",
    "RAKEDW0"     # RAKED WEIGHT - FULLSAMPLE
)

# Author's note, kept as written - DISTRESS: AJ29, AJ30, AJ31, AJ32, AJ33, AJ34

# Show how many variables were selected, then list them.
length(VARS_OF_INTEREST)
VARS_OF_INTEREST


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
# Subset a survey data frame to the analysis variables plus replicate weights.
#
# Args:
#   df: data frame (or tibble) holding one survey file.
#   vars: character vector of column names to keep. Defaults to the
#       file-level VARS_OF_INTEREST, so one-argument calls behave as before.
#   n_replicates: number of replicate-weight columns to request in addition
#       to `vars`, named RAKEDW1 .. RAKEDW<n_replicates>. Defaults to 80.
#
# Returns:
#   `df` restricted to the requested columns that exist in it, ordered as
#   `vars` followed by the replicate weights. Requested columns that are
#   absent from `df` are skipped silently, so files from different years may
#   yield different column sets.
get_vars_of_interest <- function(df, vars = VARS_OF_INTEREST, n_replicates = 80){

    # sprintf() over seq_len() yields character(0) when n_replicates is 0;
    # paste0("RAKEDW", integer(0)) would instead produce the bogus name "RAKEDW"
    replicate_weights <- sprintf("RAKEDW%d", seq_len(n_replicates))

    # add replicate weights to the list of requested variables
    requested <- c(vars, replicate_weights)

    # keep only the requested names that are present in the data frame
    available <- requested[requested %in% names(df)]

    # single-bracket indexing keeps the result a data frame even for one column
    df[available]
}


In [3]:
# Import the adult SAS data file for each survey year, 2015 through 2019.
# Two columns are added to every yearly frame before it is trimmed:
#   YEAR_NAME - the calendar year of the file
#   YEAR      - the year as an offset from 2015 (0 for 2015, 4 for 2019)
# df_array ends up as a list with one trimmed data frame per year.
df_array <- lapply(c(2015, 2016, 2017, 2018, 2019), function(survey_year) {
    # files are laid out as data/<year>/adult_<year>.sas7bdat
    sas_path <- paste0("data/", survey_year, "/adult_", survey_year, ".sas7bdat")
    yearly <- read_sas(sas_path)

    yearly$YEAR_NAME <- survey_year
    yearly$YEAR <- survey_year - 2015

    # keep only the columns selected by get_vars_of_interest()
    get_vars_of_interest(yearly)
})

In [4]:
# Stack the yearly data frames into one; a column that is missing from a
# given year is filled with NA for that year's rows.
df <- dplyr::bind_rows(df_array)

# Report the combined dimensions and preview the first rows.
dim(df)
head(df)

AF66,AJ32,AB17,AB34,AG10,AG11,AG22,AH33NEW,AH34NEW,AH35NEW,⋯,RAKEDW79,RAKEDW80,AK22_P1,RACEHP2_P1,AH130,AH85B,AE_FRUIT,AE_VEGI,UR_CLRT6,UR_BG6
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
-1,5,2,2,-1,-1,2,1,1,1,⋯,3573.3994,3504.7796,,,,,,,,
-1,5,2,2,-1,-1,2,1,2,1,⋯,1362.1386,1319.0091,,,,,,,,
-1,5,2,2,-1,-1,2,1,1,1,⋯,7998.7357,7943.4022,,,,,,,,
-1,5,2,2,-1,-1,1,1,1,1,⋯,200.4105,203.312,,,,,,,,
-1,3,2,1,-1,2,1,1,1,1,⋯,1071.8171,1022.7021,,,,,,,,
-1,5,2,2,-1,2,1,2,-1,-1,⋯,900.8584,911.7079,,,,,,,,


In [5]:
# Persist the combined data frame as a CSV file, without a row-name column.
write.csv(
    x = df,
    file = "data/adult_2015_2019_vars_interest.csv",
    row.names = FALSE
)

In [6]:
# Reload the CSV that was just written to confirm it round-trips.
df <- read.csv(file = "data/adult_2015_2019_vars_interest.csv")

# Dimensions and first rows of the reloaded data.
dim(df)
head(df)

Unnamed: 0_level_0,AF66,AJ32,AB17,AB34,AG10,AG11,AG22,AH33NEW,AH34NEW,AH35NEW,⋯,RAKEDW79,RAKEDW80,AK22_P1,RACEHP2_P1,AH130,AH85B,AE_FRUIT,AE_VEGI,UR_CLRT6,UR_BG6
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,-1,5,2,2,-1,-1,2,1,1,1,⋯,3573.3994,3504.7796,,,,,,,,
2,-1,5,2,2,-1,-1,2,1,2,1,⋯,1362.1386,1319.0091,,,,,,,,
3,-1,5,2,2,-1,-1,2,1,1,1,⋯,7998.7357,7943.4022,,,,,,,,
4,-1,5,2,2,-1,-1,1,1,1,1,⋯,200.4105,203.312,,,,,,,,
5,-1,3,2,1,-1,2,1,1,1,1,⋯,1071.8171,1022.7021,,,,,,,,
6,-1,5,2,2,-1,2,1,2,-1,-1,⋯,900.8584,911.7079,,,,,,,,


In [14]:
# Count the number of rows per survey year.
table(df[["YEAR_NAME"]])


 2015  2016  2017  2018  2019 
21034 21055 21153 21177 22160 

In [36]:
# List the column names of the combined data frame.
colnames(df)