In [None]:
library(tidyverse)
library(haven)
library(lubridate)

# Read and Tidy Address Data

In [None]:
# Load the address geocode CSV, keeping only the columns listed in cols_only():
# the person key, year of death, the two address ids and their entry dates.
add <- read_csv(
  file = paste0(
    "C:\\Users\\jc4673\\Documents\\Columbia\\CHS_Lexis\\distance_measures\\",
    "chs_lexisnexis_distance_measures_2\\chs_addresses_geocode__chs_addresses_geocode.csv"
  ),
  col_types = cols_only(
    ssn_altkey = col_character(),
    yrdeath = col_integer(),
    dxid1 = col_integer(),
    dxid2 = col_integer(),
    chs_entrydate1 = col_date(),
    chs_entrydate2 = col_date()
  )
)

In [None]:
# Show the freshly loaded address table
print(x = add)

In [None]:
    # Year of death: unique (ssn_altkey, yrdeath) combinations
    yrdeath <- distinct(select(add, ssn_altkey, yrdeath))

    # Address slots 1 and 2 as separate tables. Both are given the same
    # column names (id, chs_entrydate) so they can be stacked below.
    id1 <- add %>%
        select(ssn_altkey, id = dxid1, chs_entrydate = chs_entrydate1) %>%
        distinct()

    id2 <- add %>%
        select(ssn_altkey, id = dxid2, chs_entrydate = chs_entrydate2) %>%
        distinct()

    # Long-format address entry dates: stack both slots, de-duplicate, replace
    # the original address id with a per-person running number, and sort.
    # NOTE(review): distinct() runs while the original id is still present, so
    # the same entry date can appear more than once per person -- confirm that
    # this is intended.
    # NOTE(review): the new id follows the stacked row order (id1 rows before
    # id2 rows), not chs_entrydate order; the result stays grouped by
    # ssn_altkey.
    tidy_entry <- id1 %>%
        bind_rows(id2) %>%
        distinct() %>%
        select(ssn_altkey, chs_entrydate) %>%
        group_by(ssn_altkey) %>%
        mutate(id = row_number()) %>%
        arrange(ssn_altkey, id) %>%
        select(ssn_altkey, id, chs_entrydate)

In [None]:
# Show the long-format entry-date table
print(x = tidy_entry)

# Contact Date Data

In [None]:
# Load the contact-date data from the Stata (.dta) file
df <- read_dta(file = "C:\\Users\\jc4673\\Documents\\Data\\CHS\\contactsbyalt_keywdates.dta")

In [None]:
# Column-wise overview of the raw contact data
glimpse(x = df)

In [None]:
# First six rows of the raw contact data
head(df, n = 6L)

In [None]:
# For each person (ssn_altkey), pair every contact date with that person's
# next contact date and compute the time between the two (gap).
#
# * lead() is ordered explicitly by contactdate, so the pairing no longer
#   depends on the row order of the input file.
# * The columns are narrowed BEFORE dropping incomplete rows, so only rows
#   without a usable contact pair are removed (e.g. each person's last
#   contact, which has no next date). Previously a missing value in any
#   unrelated column of the source file also discarded the row.
#
# The result is still grouped by ssn_altkey, as it was before this change.
df <- df %>%
    group_by(ssn_altkey) %>%
    mutate(next_date = lead(contactdate, order_by = contactdate),
           gap = next_date - contactdate) %>%
    select(ssn_altkey, contactdate, next_date, gap) %>%
    na.omit()

In [None]:
# Column-wise overview of df
glimpse(x = df)

In [None]:
# Show the first rows of df
print(x = df)

In [None]:
# Density-scaled histogram of gap over all rows of df (default binning).
# NOTE(review): the ..density.. notation is deprecated in ggplot2 >= 3.4 in
# favour of after_stat(density); kept here so the cell runs on older installs.
ggplot(data = df, mapping = aes(x = gap, y = ..density..)) +
  geom_histogram()

This histogram is hard to read. Let's split the data at the point we're interested in, roughly **1 year** between contacts (the code below uses a 400-day cutoff):

In [None]:
# Partition the rows of df at a 400-day threshold on gap:
# l_1yr keeps gaps of at most 400 days, g_1yr keeps gaps longer than that.
l_1yr <- df %>% filter(gap <= days(400))
g_1yr <- df %>% filter(gap > days(400))

In [None]:
# Column-wise overview of the rows with gaps of at most 400 days
glimpse(x = l_1yr)

In [None]:
# Column-wise overview of the rows with gaps longer than 400 days
glimpse(x = g_1yr)

In [None]:
# Count histogram of gap for the rows with gaps of at most 400 days
ggplot(data = l_1yr, mapping = aes(x = gap)) +
  geom_histogram()

In [None]:
# Count histogram of gap for the rows with gaps longer than 400 days
ggplot(data = g_1yr, mapping = aes(x = gap)) +
  geom_histogram()