# Explore Data

In [None]:
library(arrow, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(ggplot2, warn.conflicts = FALSE)
library(rlang, warn.conflicts = FALSE)
library(table1, warn.conflicts = FALSE)
library(lubridate, warn.conflicts = FALSE)
library(xtable)

In [None]:
library(truveta.research)

In [None]:
source(here::here("wrangle_scripts", "R", "write_as_xtable.r"))
source(here::here("wrangle_scripts", "R", "survival_helpers.r"))
source(here::here("wrangle_scripts", "R", "analyze_time_indep.r"))
source(here::here("wrangle_scripts", "R", "analyze_time_dependent.r"))

In [None]:
# Project-relative locations used throughout the notebook.
tracking_dir <- here::here("tracking")
datadefs_dir <- file.path(tracking_dir, "datadefs")
hashsum_dir <- file.path(tracking_dir, "hashsums")
results_dir <- here::here("results")
data_dir <- here::here("data")

# Create the directories that get written to; existing directories are left
# untouched and no warning is raised for them.
invisible(
  lapply(
    c(hashsum_dir, results_dir, data_dir),
    dir.create,
    recursive = TRUE,
    showWarnings = FALSE
  )
)

In [None]:
# Fixed-length durations converted to plain numbers
# (lubridate durations are measured in seconds).
one_year <- lubridate::dyears(1) |> as.numeric()
two_weeks <- lubridate::dweeks(2) |> as.numeric()

## Load Data

In [None]:
# Load the feature table; read_parquet_table() also receives the hashsum
# directory. Then show the table's dimensions and column names.
df <-
  file.path(data_dir, "feature_table.parquet") |>
  read_parquet_table(hashsum_dir)
dim(df)
names(df)

In [None]:
# Frequency of each recorded race value.
table(df[["race"]])

In [None]:
# Footnote text spelling out what the "Black" race label refers to.
black_footnote <- "*Black refers to Black or African American Individuals"

In [None]:
# Column names of the feature table, for reference.
names(df)

## Reference tools

In [None]:
# Headline cohort counts, collected in one list and saved to
# <results_dir>/summary_values.rdata at the end of this cell.
summary_values <- list()

summary_values$n <- nrow(df)

# Ever vaccinated / boosted: a non-missing age at completion (or at boost) is
# treated as "the event is on record".
summary_values$n_vaccinated <- sum(!is.na(df$age_vaccine_completed_years))
summary_values$n_unvaccinated <- sum(is.na(df$age_vaccine_completed_years))
summary_values$n_boost <- sum(!is.na(df$age_vaccine_boost_years))

# Status relative to the COVID infection, one logical column per question:
#   pre_*  : event on record with age at event <= age_covid
#   post_* : event on record with age at event >  age_covid
# The post_* flags previously tested `age_covid > age_at_event`, which marks
# events that happened *before* infection; the comparison is corrected so that
# pre_* and post_* are complementary among patients with the event on record.
temp <-
  df |>
  dplyr::transmute(
    pre_covid_vax =
      dplyr::if_else(
        age_covid < age_vaccine_completed_years |
          is.na(age_vaccine_completed_years),
        FALSE,
        TRUE
      ),
    post_covid_vax =
      !is.na(age_vaccine_completed_years) &
        age_covid < age_vaccine_completed_years,
    pre_covid_boost =
      dplyr::if_else(
        age_covid < age_vaccine_boost_years |
          is.na(age_vaccine_boost_years),
        FALSE,
        TRUE
      ),
    post_covid_boost =
      !is.na(age_vaccine_boost_years) &
        age_covid < age_vaccine_boost_years,
    # An outcome is counted when its time column is non-missing.
    long_covid_symptoms = !is.na(time_long_covid),
    long_covid_diagnosis = !is.na(time_long_covid_diagnosis)
  )

summary_values$n_pre_vaccinated <- sum(temp$pre_covid_vax)
summary_values$n_pre_boost <- sum(temp$pre_covid_boost)

summary_values$n_pre_unvaccinated <- sum(!temp$pre_covid_vax)
summary_values$n_pre_noboost <- sum(!temp$pre_covid_boost)

summary_values$n_long_covid_symptoms <- sum(temp$long_covid_symptoms)
summary_values$n_long_covid_diagnosis <- sum(temp$long_covid_diagnosis)

save(summary_values, file = file.path(results_dir, 'summary_values.rdata'))

## Exploratory Data Analysis

In [None]:
# Initialise the truveta plotting theme helper with figsize c(8, 8)
# (helper is not defined in this notebook; units are not shown here).
initialize_theme_truveta(figsize = c(8, 8))

## demographic breakdown

### age

In [None]:
# Histogram of age (50 bins) filled by long_covid, with a vertical reference
# line at 18. The plot is written to the results directory and then displayed.
age_counts <-
  ggplot(
    data = df,
    mapping = aes(x = age, group = long_covid, fill = long_covid)
  ) +
  geom_histogram(bins = 50) +
  geom_vline(xintercept = 18) +
  theme_truveta()

write_ggplot(age_counts, file.path(results_dir, "age_counts.png"))

age_counts

In [None]:
# Bar chart of patient counts per age_group, filled by long_covid; written to
# the results directory and then displayed.
age_bucket_counts <-
  ggplot(
    data = df,
    mapping = aes(x = age_group, group = long_covid, fill = long_covid)
  ) +
  geom_bar() +
  theme_truveta()

write_ggplot(age_bucket_counts, file.path(results_dir, "age_bucket_counts.png"))

age_bucket_counts

### race

In [None]:
# Counts per race within each long_covid group, drawn as bars filled by
# long_covid; written to the results directory and then displayed.
race_counts <-
  ggplot(
    data = df |> dplyr::group_by(long_covid) |> dplyr::count(race),
    mapping = aes(x = race, y = n, group = long_covid, fill = long_covid)
  ) +
  geom_bar(stat = "identity") +
  theme_truveta()

write_ggplot(race_counts, file.path(results_dir, "race_counts.png"))

race_counts

### ethnicity

In [None]:
# Counts per ethnicity within each long_covid group, drawn as bars filled by
# long_covid; written to the results directory and then displayed.
ethnicity_counts <-
  ggplot(
    data = df |> dplyr::group_by(long_covid) |> dplyr::count(ethnicity),
    mapping = aes(x = ethnicity, y = n, group = long_covid, fill = long_covid)
  ) +
  geom_bar(stat = "identity") +
  theme_truveta()

write_ggplot(ethnicity_counts, file.path(results_dir, "ethnicity_counts.png"))

ethnicity_counts

### sex

In [None]:
# Counts per sex within each long_covid group, drawn as bars filled by
# long_covid; written to the results directory and then displayed.
sex_counts <-
  ggplot(
    data = df |> dplyr::group_by(long_covid) |> dplyr::count(sex),
    mapping = aes(x = sex, y = n, group = long_covid, fill = long_covid)
  ) +
  geom_bar(stat = "identity") +
  theme_truveta()

write_ggplot(sex_counts, file.path(results_dir, "sex_counts.png"))

sex_counts

### count fully vaccinated before covid vs those unvaccinated when they got covid

In [None]:
# Tabulate vaccination state at the time of COVID infection.
# A patient is 'yes' for pre_vax / pre_boost when the corresponding age is on
# record and is not later than age_covid; a missing vaccination age is 'no'.
# After counting, the table is widened to one row per yes/no value; the columns
# then arrive as value, pre_boost, pre_vax and are renamed by position.
vax_state <-
  df |>
  dplyr::transmute(
    pre_vax =
      dplyr::if_else(
        !is.na(age_vaccine_completed_years) &
          age_vaccine_completed_years <= age_covid,
        'yes',
        'no'
      ),
    pre_boost =
      dplyr::if_else(
        !is.na(age_vaccine_boost_years) &
          age_vaccine_boost_years <= age_covid,
        'yes',
        'no'
      )
  ) |>
  tidyr::pivot_longer(cols = everything()) |>
  dplyr::group_by(name, value) |>
  dplyr::count() |>
  tidyr::pivot_wider(id_cols = value, names_from = name, values_from = n) |>
  setNames(c('vaccine_at_covid', 'booster', 'primary'))

write_table(vax_state, file.path(results_dir, 'vaccine_state_at_covid.csv'))

vax_state

### flu vaccinations

In [None]:
# Counts per flu_vaccination value within each long_covid group, drawn as bars
# filled by long_covid (displayed only, not written to disk).
ggplot2::ggplot(
  data = df |> dplyr::group_by(long_covid) |> dplyr::count(flu_vaccination),
  mapping = ggplot2::aes(
    x = flu_vaccination,
    y = n,
    group = long_covid,
    fill = long_covid
  )
) +
  ggplot2::geom_bar(stat = "identity") +
  theme_truveta()

### inpatient encounters

In [None]:
# A per-group count of inpatient_count is left disabled:
# df |>
#   group_by(long_covid) |>
#   dplyr::count(inpatient_count)

# Summary statistics of inpatient_count across all patients.
summary(df[["inpatient_count"]])

### concordance between "cdc long covid" and "code long covid"

In [None]:
# Cross-tabulate the two Long COVID outcome columns: rows are long_covid,
# columns are long_covid_diagnosis.
lc_crosstab <- table(df$long_covid, df$long_covid_diagnosis)
lc_crosstab

# The target is a .csv file, so write it comma-separated. write.csv() also
# emits a blank header cell for the row-name column, whereas write.table()'s
# defaults produce a space-delimited file whose header is one field short.
write.csv(lc_crosstab, file = file.path(results_dir, 'long_covid_deff_crosstab.csv'))


# Caption and label for the LaTeX version of the table (typos in the caption
# fixed, since this text is emitted verbatim into the .tex file).
caption <- 'Contingency table comparing overlap in differing Long COVID outcomes among patients.'
label <- 'tab:contingency'

# Wide, human-readable version: the table's row names become a column that is
# relabelled, and the two diagnosis columns get display names.
lc_crosstab_df <- 
  as.data.frame.matrix(lc_crosstab) |>
  tibble::rownames_to_column() |>
  tibble::as_tibble() |>
  dplyr::mutate(
    rowname = 
      dplyr::if_else(
        rowname == 'long covid', 
        'Long COVID Symptoms', 
        'No Long COVID Symptoms'
      )
  ) |>
  dplyr::rename(
    `Long COVID Diagnosis` = `long covid diagnosis`,
    `No Long COVID Diagnosis` = `no long covid diagnosis`,
    `Symptoms / Diagnosis` = rowname
  )

lc_crosstab_df

write_as_xtable(
  lc_crosstab_df, 
  filepath = file.path(results_dir, 'long_covid_deff_crosstab.tex'), 
  caption = caption, 
  label = label
)

In [None]:
# Long-format version of the crosstab: one row per (long_covid,
# long_covid_diagnosis) combination together with its count.
df_crosstab <- as.data.frame(lc_crosstab)
names(df_crosstab) <- c('cdc', 'diagnosis', 'count')

write_table(df_crosstab, file.path(results_dir, 'long_covid_deff_crosstab_long.csv'))
df_crosstab

### year month

In [None]:
# Number of patients per year_month value.
# count(year_month) replaces select() + group_by() + count(), so the result is
# not left grouped and the key column can be rewritten without touching a
# grouping variable. The key is converted to a Date on the first of the month
# by swapping '_' for '-' and appending "-01".
year_month_count <-
  df |>
  dplyr::count(year_month) |>
  dplyr::mutate(
    year_month = stringr::str_replace_all(year_month, '_', '-'),
    year_month = as.Date(paste0(year_month, "-01"))
  )

# Monthly counts as a line with points, y axis on a log-transformed coordinate.
year_month_count_plot <- 
  ggplot(year_month_count, aes(x = year_month, y = n)) +
  geom_line() +
  geom_point() +
  coord_trans(y = 'log') +
  theme_truveta()

write_ggplot(year_month_count_plot, file.path(results_dir, "year_month_counts.png"))

year_month_count_plot

### time between covid and fully vaccinated, of those who were vaccinated at some point

In [None]:
# Histogram (50 bins) of age_vaccine_completed_years - age_covid, filled by
# long_covid, restricted to patients with a vaccination completion age.
# The filter now tests age_vaccine_completed_years, the same column the
# difference is computed from, so no NA differences reach the plot. The
# original filtered on `age_vaccine_completed`, a column not referenced
# anywhere else in this notebook.
vaccine_time <- 
  df |> 
  dplyr::filter(!is.na(age_vaccine_completed_years)) |>
  dplyr::mutate(covid_vaccine_diff = age_vaccine_completed_years - age_covid) |>
  ggplot(aes(x = covid_vaccine_diff, group = long_covid, fill = long_covid)) +
  geom_histogram(bins = 50) +
  theme_truveta() +
  labs(
    title = 'age_vaccine_completed_years - age_covid',
    subtitle = '(only individuals who have been vaccinated)',
    x = 'Time difference (years)'
  )

write_ggplot(vaccine_time, file.path(results_dir, "covid_vaccinated_counts.png"))

vaccine_time

## explore analysis data stuff

In [None]:
# Build the "simple" analysis data set from df, passing time_long_covid
# unquoted. make_simple_data() is not defined in this notebook (presumably it
# comes from the sourced helper scripts — confirm).
df_simple <- make_simple_data(df, time_long_covid)

In [None]:
# Build the time-dependent analysis data set from df, passing time_long_covid
# unquoted. make_timedep_data() is not defined in this notebook (presumably it
# comes from the sourced helper scripts — confirm).
df_time <- make_timedep_data(df, time_long_covid)

# Table 1

In [None]:
# Column names available for building Table 1.
colnames(df)

In [None]:
# Data frame behind Table 1: df plus display-ready and derived columns.
#  - race: 'Black' is spelled out as 'Black or African American'
#  - columns anxiety through depression (by position) are coerced to logical
#  - flu_vaccinated: TRUE when flu_vaccination is non-zero
#  - vaccinated_at_covid / boosted_at_covid: TRUE when the corresponding age is
#    on record and is not later than age_covid
#  - vaccine_state: three-level label derived from the two flags above
#  - long_covid_*: TRUE when the corresponding time column is non-missing
df_tab1 <-
  df |>
  dplyr::mutate(
    race = dplyr::if_else(race == 'Black', 'Black or African American', race),
    dplyr::across(anxiety:depression, as.logical),
    flu_vaccinated = flu_vaccination != 0,
    vaccinated_at_covid =
      !is.na(age_vaccine_completed_years) &
        age_vaccine_completed_years <= age_covid,
    boosted_at_covid =
      !is.na(age_vaccine_boost_years) &
        age_vaccine_boost_years <= age_covid,
    vaccinated_and_boosted_at_covid = vaccinated_at_covid & boosted_at_covid,
    vaccine_state =
      dplyr::case_when(
        !vaccinated_at_covid ~ 'Unvaccinated',
        vaccinated_at_covid & !boosted_at_covid ~ 'Vaccinated',
        vaccinated_and_boosted_at_covid ~ 'Vaccinated and boosted'
      ),
    long_covid_symptoms = !is.na(time_long_covid),
    long_covid_diagnosis = !is.na(time_long_covid_diagnosis)
  )

In [None]:
# Display labels for Table 1, keyed by column name. Attaching a label to each
# column is what makes table1 print readable row headings.
tab1_labels <- c(
  sex = "Sex",
  age_covid = "Age (y)",
  age_group = "Age group",
  race = "Race",
  ethnicity = "Ethnicity",
  anxiety = "Anxiety",
  cancer = "Cancer",
  cardiovascular_disease = "Cardiovascular Disease",
  cerebrovascular_disease_stroke_tia = "Cerebrovascular Disease/Stroke/TIA",
  copd = "COPD",
  ckd = "CKD",
  dementia = "Dementia",
  depression = "Depression",
  diabetes = "Diabetes",
  immunocompromised = "Immunocompromised",
  pad = "PAD",
  smoking = "Smoking Status",
  flu_vaccinated = "1+ influenza vaccines within 2 years prior",
  inpatient_count = "Number of inpatient encounters within last 2 years",
  outpatient_count = "Number of outpatient encounters within last 2 years",
  year_month = "Year-month of COVID infection",
  blood_panel_count = "Number of unique blood panel labs within 2 years prior",
  vaccinated_at_covid = "Vaccinated at time of COVID",
  boosted_at_covid = "Boosted at time of COVID",
  vaccine_state = "Vaccination status at time of COVID-19 infection",
  long_covid_symptoms = "Developed Long COVID symptoms",
  long_covid_diagnosis = "Diagnosed with Long COVID"
)

# Attach each label to its column in df_tab1.
for (tab1_column in names(tab1_labels)) {
  table1::label(df_tab1[[tab1_column]]) <- tab1_labels[[tab1_column]]
}
    

# Row variables of Table 1, in display order (age_group is intentionally
# left out).
tab1_vars <- c(
  "sex",
  "age_covid",
  "year_month",
  "race",
  "ethnicity",
  "anxiety",
  "cardiovascular_disease",
  "cancer",
  "cerebrovascular_disease_stroke_tia",
  "ckd",
  "copd",
  "dementia",
  "depression",
  "diabetes",
  "immunocompromised",
  "pad",
  "smoking",
  "flu_vaccinated",
  "inpatient_count",
  "outpatient_count",
  "blood_panel_count",
  "long_covid_symptoms",
  "long_covid_diagnosis"
)

# Table 1: the variables above, stratified by vaccine_state
# (formula `~ var1 + var2 + ... | vaccine_state`), with ',' as big mark.
tab1 <-
  table1::table1(
    stats::as.formula(
      paste("~", paste(tab1_vars, collapse = " + "), "| vaccine_state")
    ),
    data = df_tab1,
    big.mark = ","
  )


In [None]:
# Plain data-frame version of Table 1 with explicit column headings
# (assigned by position).
tab1_df <- as.data.frame(tab1, make.names = FALSE)
names(tab1_df) <-
  c('Feature', 'Unvaccinated', 'Vaccinated', 'Vaccinated and boosted', 'Overall')

In [None]:
# Extra LaTeX passed to print.xtable() via add.to.row, inserted at position 0
# (directly after the header row). It supplies the longtable head/foot
# declarations: first-page head, "continued" head for later pages, "continued
# on next page" foot, and the final foot.
#
# Fixes relative to the previous version of the "continued" line:
#  - `\tablename\ ` was written as "\\tablename\ ", and "\ " is not a valid
#    escape in an R string (the cell failed to parse); it is now "\\ ".
#  - the row was terminated with "\\ " (a single LaTeX backslash) instead of
#    "\\\\" (LaTeX `\\`), unlike the other rows in this block.
add.to.row <- list()
add.to.row$pos <- list(0)
add.to.row$command <- 
  paste0(
    "\\hline \n",
    "\\endfirsthead \n",
    "\\multicolumn{5}{p{\\textwidth}}{{ \\bfseries \\tablename\\ \\thetable{} -- continued from previous page}} \\\\ \n",
    "\\hline Feature & Unvaccinated & Vaccinated & Vaccinated and boosted & Overall \\\\ \\hline \n",
    "\\endhead \n",
    "\\hline \\multicolumn{5}{p{\\textwidth}}{{Continued on next page}} \\\\ \\hline \n",
    "\\endfoot \n",
    "\\hline \n",
    "\\endlastfoot \n"
  )

# Column-width alignment override, currently disabled:
#align <- c('', "p{0.2\\textwidth}", "p{0.2\\textwidth}", "p{0.6\\textwidth}")

# Caption and label emitted with the LaTeX table.
label <- 'tab:table_1'
caption <- 
  paste0(
    'Overall summary statistics of our analyzed population of patients who experienced a COVID-19 infection, ',
    'stratified by vaccination status at time of COVID-19 infection.'
  )

# Build the xtable object first, then print it.
# NOTE(review): `floating` is not a documented xtable() argument and is
# presumably absorbed by `...`; it is kept so the call is unchanged.
table_1_xtable <-
  xtable::xtable(
    tab1_df,
    caption = caption,
    label = label,
    floating = FALSE
  )

# Write Table 1 as a non-floating longtable, without row names, comment or
# timestamp, using the extra head/foot rows in add.to.row.
xtable::print.xtable(
  table_1_xtable,
  type = 'latex',
  file = file.path(results_dir, "table_1.tex"),
  tabular.environment = "longtable",
  floating = FALSE,
  add.to.row = add.to.row,
  hline.after = FALSE,
  include.rownames = FALSE,
  comment = FALSE,
  timestamp = NULL,
  format.args = list(digits = 3, big.mark = ",")
)

In [None]:
# Display the data-frame version of Table 1 in the notebook.
tab1_df

In [None]:
# Save the data-frame version of Table 1 next to the other results.
tab1_df |>
  write_table(file.path(results_dir, "table_1.csv"))

In [None]:
# Row variables of the alternative Table 1, in display order. Compared with the
# main table this omits year_month, outpatient_count, blood_panel_count and the
# two Long COVID outcome rows (age_group is left out here as well).
tab1_alt_vars <- c(
  "sex",
  "age_covid",
  "race",
  "ethnicity",
  "anxiety",
  "cardiovascular_disease",
  "cancer",
  "cerebrovascular_disease_stroke_tia",
  "ckd",
  "copd",
  "dementia",
  "depression",
  "diabetes",
  "immunocompromised",
  "pad",
  "smoking",
  "flu_vaccinated",
  "inpatient_count"
)

# Alternative Table 1, stratified by vaccine_state
# (formula `~ var1 + var2 + ... | vaccine_state`).
tab1_alt <-
  table1::table1(
    stats::as.formula(
      paste("~", paste(tab1_alt_vars, collapse = " + "), "| vaccine_state")
    ),
    data = df_tab1
  )

# Convert once and reuse for display, CSV and LaTeX output.
tab1_alt_df <- as.data.frame(tab1_alt)
tab1_alt_df

write_table(tab1_alt_df, file.path(results_dir, 'table_1_alt.csv'))

# Same caption and label as the main Table 1.
label <- 'tab:table_1'
caption <- 
  paste0(
    'Overall summary statistics of our analyzed population of patients who experienced a COVID-19 infection, ',
    'stratified by vaccination status at time of COVID-19 infection.'
  )

write_as_xtable(
  tab1_alt_df,
  file.path(results_dir, 'table_1_alt.tex'),
  caption = caption,
  label = label
)