# Visualize Data

In [None]:
library(arrow, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(ggplot2)
library(scales)
library(survival)
library(survminer, quietly = TRUE, warn.conflicts = FALSE)
library(table1, warn.conflicts = FALSE)

In [None]:
library(truveta.research)

In [None]:
source(here::here("wrangle_scripts", "R", "_.R"))

In [None]:
# Input (`data/`) and output (`results/`) directories, resolved relative to the
# project root by `here`.
results_dir <- here::here("results")
data_dir <- here::here("data")

# Every table and figure produced below is written into `results_dir`, so make
# sure it exists as well as `data_dir`. `showWarnings = FALSE` keeps this quiet
# when the directories are already present.
dir.create(data_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(results_dir, recursive = TRUE, showWarnings = FALSE)

In [None]:
# Initialise the Truveta plotting theme used by the figures below.
# NOTE(review): `figsize = c(8, 8)` is presumably width/height of the rendered
# figures -- confirm units against the helper's documentation.
initialize_theme_truveta(figsize = c(8, 8))

## Load Data

In [None]:
# Read the feature table stored in the data directory, then preview the first
# few rows.
df <-
  file.path(data_dir, "feature_table.parquet") |>
  read_parquet_table(results_dir)

head(df)

In [None]:
# Show the column names of the loaded feature table.
colnames(df)

In [None]:
# Recode every column from `ckd` through `lung`: rows where the flag equals 1
# get the column's own name (e.g. "ckd"), all other rows get NA.
# `dplyr::cur_column()` is the documented way to obtain the current column name
# inside `across()`; `deparse(substitute(.x))` only produced the name because
# of how dplyr inlines formula lambdas.
temp_df <-
  df |>
  dplyr::mutate(
    dplyr::across(
      ckd:lung,
      ~ dplyr::if_else(.x == 1, dplyr::cur_column(), NA_character_)
    ),
    # Rows where none of the four studied comorbidity flags is set.
    comorbid_free =
      dplyr::if_else(
        is.na(ckd) & is.na(diabetes) & is.na(immunocompromised) & is.na(lung),
        'None of the studied comorbidities',
        NA_character_
      )
  )


# Headline counts: rows flagged for each comorbidity, rows flagged for none of
# them, and the number of distinct `person_id` values.
key_values <-
  dplyr::tibble(
    n_ckd = sum(df$ckd),
    n_diabetes = sum(df$diabetes),
    n_immunocompromised = sum(df$immunocompromised),
    n_lung = sum(df$lung),
    n_comorbid_free = sum(!is.na(temp_df$comorbid_free)),
    n_person = dplyr::n_distinct(df$person_id)
  )


key_values

write.csv(key_values, file = file.path(results_dir, 'summary_stats.csv'))

In [None]:
#' Summarise outcomes among rows flagged with one comorbidity
#'
#' Keeps the rows of `df` where `comorbid` equals 1 and collapses them to a
#' single summary row.
#'
#' @param df Data frame with `covid` and `hospital` columns plus the
#'   comorbidity flag column.
#' @param comorbid Unquoted name of the comorbidity flag column.
#' @return A one-row data frame with columns `n` (rows kept), `breakthrough`
#'   (sum of `covid`), `breakthrough_percent` (share of `n`, as a character
#'   string), `hospital` (sum of `hospital`) and `hospital_percent` (share of
#'   `breakthrough`, as a character string).
comorbid_outcome_summarize <- function(df, comorbid) {
  with_comorbidity <- dplyr::filter(df, {{ comorbid }} == 1)

  # Later expressions refer to summary columns created earlier in this call:
  # `hospital_percent` divides the summed `hospital` by the summed `covid`.
  dplyr::summarize(
    with_comorbidity,
    n = n(),
    breakthrough = sum(covid),
    breakthrough_percent = paste0(breakthrough / n() * 100),
    hospital = sum(hospital),
    hospital_percent = paste0(hospital / breakthrough * 100)
  )
}



# One outcome summary per studied comorbidity.
outcome_summary <- list(
  ckd = comorbid_outcome_summarize(df, ckd),
  diabetes = comorbid_outcome_summarize(df, diabetes),
  immunocompromised = comorbid_outcome_summarize(df, immunocompromised),
  lung = comorbid_outcome_summarize(df, lung)
)

# The same summary for rows of `temp_df` whose `comorbid_free` marker is set,
# i.e. rows with none of the studied comorbidities.
outcome_summary$comorbid_free <-
  dplyr::summarize(
    dplyr::filter(temp_df, !is.na(comorbid_free)),
    n = n(),
    breakthrough = sum(covid),
    breakthrough_percent = paste0(breakthrough / n() * 100),
    hospital = sum(hospital),
    hospital_percent = paste0(hospital / breakthrough * 100)
  )


# Persist the list under its own name so it can be restored with `load()`.
save(outcome_summary, file = file.path(results_dir, 'outcome_summary.rdata'))

## Table 1

In [None]:
# Data frame behind the Table 1 summaries: demographic columns, comorbidity
# flags and the two outcomes, recoded to display strings.
df_tab_covid <-
  df |>
  dplyr::select(
    sex,
    race,
    ethnicity,
    vaccination_years,
    vaccination_bracket,
    ckd,
    diabetes,
    immunocompromised,
    lung,
    covid,
    hospital
  ) |>
  dplyr::mutate(
    # Replace each comorbidity flag with the column's own name when set and NA
    # otherwise. That name becomes the stratum column header which the table
    # assembly code later renames, so it must be exactly the column name.
    # `dplyr::cur_column()` is the documented way to get it inside `across()`;
    # `deparse(substitute(.x))` only worked because of how dplyr inlines
    # formula lambdas.
    dplyr::across(
      ckd:lung,
      ~ dplyr::if_else(.x == 1, dplyr::cur_column(), NA_character_)
    ),
    covid = dplyr::if_else(covid == 1, 'Breakthrough COVID', 'No breakthrough'),
    hospital = dplyr::if_else(hospital == 1, 'Hospitalized', 'Not hospitalized'),
    # Rows where none of the four comorbidity flags is set.
    comorbid_free =
      dplyr::if_else(
        is.na(ckd) & is.na(diabetes) & is.na(immunocompromised) & is.na(lung),
        'None of the studied comorbidities',
        NA_character_
      )
  )

In [None]:
# Display labels attached to the Table 1 columns, keyed by column name.
column_labels <- c(
  sex = 'Sex',
  race = 'Race',
  ethnicity = 'Ethnicity',
  vaccination_years = 'Age',
  vaccination_bracket = 'Age bracket',
  ckd = 'Chronic kidney disease',
  diabetes = 'Diabetes',
  immunocompromised = 'Immunocompromised',
  lung = 'Chronic lung disease',
  covid = 'Breakthrough COVID-19 infection',
  hospital = 'Hospitalization following breakthrough'
)

for (column_name in names(column_labels)) {
  label(df_tab_covid[[column_name]]) <- column_labels[[column_name]]
}

# Unit annotation for the one continuous column.
units(df_tab_covid$vaccination_years) <- 'years'

# The five stratified tables share the same row variables and differ only in
# the stratifying column, so the formula is assembled from one shared
# right-hand side.
covid_table_rows <- paste(
  c(
    'sex',
    'race',
    'ethnicity',
    'vaccination_years',
    'vaccination_bracket',
    'covid',
    'hospital'
  ),
  collapse = ' + '
)

# Build `table1(~ <rows> | <stratum>, data = df_tab_covid)` for one stratum
# column given by name.
build_covid_table <- function(stratum) {
  table1(
    stats::as.formula(paste('~', covid_table_rows, '|', stratum)),
    data = df_tab_covid
  )
}

tab_covid_ckd <- build_covid_table('ckd')
tab_covid_diabetes <- build_covid_table('diabetes')
tab_covid_immuno <- build_covid_table('immunocompromised')
tab_covid_lung <- build_covid_table('lung')
tab_covid_free <- build_covid_table('comorbid_free')


# Per-stratum tables, keyed by the comorbidity they describe.
tab_elements <-
  list(
    ckd = tab_covid_ckd,
    diabetes = tab_covid_diabetes,
    immunocompromised = tab_covid_immuno,
    lung = tab_covid_lung,
    free = tab_covid_free
  )

# `Reduce()` with the list first, so it can sit at the end of a pipe.
f_reduce <- \(x, ...) Reduce(..., x)

# Convert each table to a tibble, give the unnamed first column a real name so
# it can be used as a join key, then join the tables side by side on the row
# label and the shared `Overall` column.
tab_1 <-
  lapply(
    tab_elements,
    \(x) {
      x <- as_tibble(x)
      colnames(x)[1] <- 'variable'
      x
    }
  ) |>
  f_reduce(
    \(x, y) inner_join(x, y, by = c('variable', 'Overall'))
  ) |>
  dplyr::relocate(Overall, .after = everything()) |>
  dplyr::relocate(lung, .after = ckd) |>
  dplyr::rename(
    ` ` = variable,
    `Chronic kidney disease` = ckd,
    `Diabetes` = diabetes,
    `Immunocompromised` = immunocompromised,
    `Chronic lung disease` = lung
  )


write_table(tab_1, file.path(results_dir, 'table_1.csv'))


# Caption typo fixed ("there" -> "their"); this text is emitted into the
# LaTeX table.
write_table_1(
  tab_1, file.path(results_dir, 'table_1.tex'),
  caption = 'Overall summary statistics of our analyzed population of patients who have completed their primary COVID-19 vaccination sequence.',
  label = 'tab:table_1'
)

head(tab_1)

In [None]:
# Outcome-only versions of the stratified tables: just the `covid` and
# `hospital` rows, split by one stratum column given by name.
build_covid_outcome_table <- function(stratum) {
  table1(
    stats::as.formula(paste('~ covid + hospital |', stratum)),
    data = df_tab_covid
  )
}

tab_covid_ckd_short <- build_covid_outcome_table('ckd')
tab_covid_diabetes_short <- build_covid_outcome_table('diabetes')
tab_covid_immuno_short <- build_covid_outcome_table('immunocompromised')
tab_covid_lung_short <- build_covid_outcome_table('lung')
tab_covid_free_short <- build_covid_outcome_table('comorbid_free')


# Outcome-only per-stratum tables, keyed by the comorbidity they describe.
tab_elements_short <-
  list(
    ckd = tab_covid_ckd_short,
    diabetes = tab_covid_diabetes_short,
    immunocompromised = tab_covid_immuno_short,
    lung = tab_covid_lung_short,
    free = tab_covid_free_short
  )


# Convert each table to a tibble, name the first column so it can be a join
# key, then join the tables side by side on the row label and the shared
# `Overall` column.
tab_1_short <-
  lapply(
    tab_elements_short,
    \(x) {
      x <- as_tibble(x)
      colnames(x)[1] <- 'variable'
      x
    }
  ) |>
  f_reduce(
    \(x, y) inner_join(x, y, by = c('variable', 'Overall'))
  ) |>
  dplyr::relocate(Overall, .after = everything()) |>
  dplyr::relocate(lung, .after = ckd) |>
  dplyr::rename(
    ` ` = variable,
    `Chronic kidney disease` = ckd,
    `Diabetes` = diabetes,
    `Immunocompromised` = immunocompromised,
    `Chronic lung disease` = lung
  )

tab_1_short

# Caption typo fixed ("there" -> "their"); this text is emitted into the
# LaTeX table.
write_table_1(
  tab_1_short, file.path(results_dir, 'table_1_short.tex'),
  caption = 'Overall summary statistics of our analyzed population of patients who have completed their primary COVID-19 vaccination sequence.',
  label = 'tab:table_1_short'
)

# Plain CSV copy of the same table, without row names.
write.csv(
  as.data.frame(tab_1_short, make.names = FALSE),
  file = file.path(results_dir, 'tab_1_short.csv'),
  row.names = FALSE
)

## Demographics

In [None]:
# Histogram (50 bins) of `vaccination_years`, presented as age in years.
g <- ggplot(data = df, mapping = aes(x = vaccination_years)) +
  geom_histogram(bins = 50) +
  labs(
    title = "Age",
    x = "Age (years)"
  ) +
  theme_truveta()

# Save the figure to the results directory, then display it.
write_ggplot(g, file.path(results_dir, "age_continuous.png"))

g

In [None]:
# Bar chart of row counts per `vaccination_bracket`.
g <- ggplot(data = df, mapping = aes(x = vaccination_bracket)) +
  geom_bar() +
  labs(
    title = "Age Groups",
    x = "Age bracket"
  ) +
  theme_truveta()

# Save the figure to the results directory, then display it.
write_ggplot(g, file.path(results_dir, "age_groups.png"))

g

In [None]:
# Bar chart of row counts per `sex` value.
g <- ggplot(data = df, mapping = aes(x = sex)) +
  geom_bar() +
  labs(title = "Sex") +
  theme_truveta()

# Save the figure to the results directory, then display it.
write_ggplot(g, file.path(results_dir, "sex.png"))

g

In [None]:
# Wrap long race labels at 20 characters so they fit under the bars.
df_temp <- dplyr::mutate(df, race = stringr::str_wrap(race, width = 20))

# Bar chart of row counts per race, with the axis labels rotated 45 degrees.
g <- ggplot(data = df_temp, mapping = aes(x = race)) +
  geom_bar() +
  labs(title = "Race") +
  theme_truveta() +
  scale_x_discrete(guide = guide_axis(angle = 45))

# Save the figure to the results directory, then display it.
write_ggplot(g, file.path(results_dir, "race.png"))

g

In [None]:
# Bar chart of row counts per `ethnicity` value.
g <- ggplot(data = df, mapping = aes(x = ethnicity)) +
  geom_bar() +
  labs(title = "Ethnicity") +
  theme_truveta()

# NOTE(review): the output file name is spelled "ethncity.png"; it is kept
# as-is because other documents may reference the file under this name.
write_ggplot(g, file.path(results_dir, "ethncity.png"))

g

### Month-Year

In [None]:
# Number of rows per `monitoring_date`, plus the natural log of that count.
month_year_count <-
  dplyr::group_by(df, monitoring_date) |>
  dplyr::tally() |>
  dplyr::ungroup() |>
  dplyr::mutate(n_log = log(n))

In [None]:
# Sum of `covid` and of `hospital` per `monitoring_date` (displayed only, not
# stored).
dplyr::summarize(
  dplyr::group_by(df, monitoring_date),
  n_covid = sum(covid),
  n_hospital = sum(hospital)
)

In [None]:
# Line-and-point chart of the per-date counts.
month_year_line <-
  ggplot(
    data = month_year_count,
    mapping = aes(x = monitoring_date, y = n)
  ) +
  geom_line() +
  geom_point() +
  theme_truveta() +
  labs(
    x = 'Month-Year',
    y = 'Count of patients finishing primary vaccine series'
  )

# Save the figure to the results directory, then display it.
write_ggplot(month_year_line, file.path(results_dir, "month_year_count.png"))

month_year_line

In [None]:
# Same chart as the per-date counts, but plotting the log-transformed count.
month_year_log_line <-
  ggplot(
    data = month_year_count,
    mapping = aes(x = monitoring_date, y = n_log)
  ) +
  geom_line() +
  geom_point() +
  theme_truveta() +
  labs(
    x = 'Month-Year',
    y = 'Log-count of patients finishing primary vaccine series'
  )

# Save the figure to the results directory, then display it.
write_ggplot(month_year_log_line, file.path(results_dir, "month_year_log_count.png"))

month_year_log_line

### Comorbidities

In [None]:
# Count rows for every combination of the four comorbidity flags and the
# breakthrough outcome. `tidyr::complete()` adds the combinations that never
# occur with a count of 0; everything is then stored as integer.
# (`group_by()` + `count()` already keeps only the grouping columns and the
# count, so no prior `select()` is needed.)
condition_breakthrough_tab <-
  df |>
  dplyr::group_by(ckd, diabetes, immunocompromised, lung, covid) |>
  dplyr::count(name = 'count') |>
  dplyr::ungroup() |>
  tidyr::complete(
    ckd, diabetes, immunocompromised, lung, covid,
    fill = list(count = 0)
  ) |>
  dplyr::mutate(dplyr::across(everything(), ~ as.integer(.x)))

write_table(
  condition_breakthrough_tab,
  file.path(results_dir, "condition_breakthrough_counts.csv")
)

# Caption typos fixed ("present of" -> "presence of", "infeciton" ->
# "infection"); this text is emitted into the LaTeX table.
label <- 'tab:breakthrough_counts'
caption <- 'Counts of patients by comorbidity and breakthrough COVID-19 infection outcome status. 0 indicates absence of comorbidity or no breakthrough infection. 1 indicates presence of comorbidity or breakthrough infection.'

# Write the LaTeX version next to the CSV. `format.args` adds thousands
# separators so the counts are formatted the same way as in the hospitalization
# counts table.
xtable::print.xtable(
  xtable::xtable(
    as.data.frame(condition_breakthrough_tab, make.names = FALSE),
    label = label,
    caption = caption
  ),
  type = 'latex',
  file = file.path(results_dir, 'condition_breakthrough_counts.tex'),
  include.rownames = FALSE,
  comment = FALSE,
  timestamp = NULL,
  table.placement = '!htbp',
  format.args = list(big.mark = ',')
)

condition_breakthrough_tab

In [None]:
# Among rows with a breakthrough infection (`covid == 1`), count rows for every
# combination of the four comorbidity flags and the hospitalization outcome.
# `tidyr::complete()` adds the combinations that never occur with a count of 0;
# everything is then stored as integer.
# (`group_by()` + `count()` already keeps only the grouping columns and the
# count, so no prior `select()` is needed.)
condition_hospital_tab <-
  df |>
  dplyr::filter(covid == 1) |>
  dplyr::group_by(ckd, diabetes, immunocompromised, lung, hospital) |>
  dplyr::count(name = 'count') |>
  dplyr::ungroup() |>
  tidyr::complete(
    ckd, diabetes, immunocompromised, lung, hospital,
    fill = list(count = 0)
  ) |>
  dplyr::mutate(dplyr::across(everything(), ~ as.integer(.x)))

write_table(
  condition_hospital_tab,
  file.path(results_dir, "condition_hospital_counts.csv")
)

# Caption typo fixed ("present of" -> "presence of"); this text is emitted into
# the LaTeX table.
label <- 'tab:hospital_counts'
caption <- 'Counts of patients by comorbidity and hospitalization following breakthrough COVID-19 infection outcome status. 0 indicates absence of comorbidity or not hospitalized. 1 indicates presence of comorbidity or hospitalized.'

# Write the LaTeX version next to the CSV, with thousands separators.
xtable::print.xtable(
  xtable::xtable(
    as.data.frame(condition_hospital_tab, make.names = FALSE),
    label = label,
    caption = caption
  ),
  type = 'latex',
  file = file.path(results_dir, 'condition_hospital_counts.tex'),
  include.rownames = FALSE,
  comment = FALSE,
  timestamp = NULL,
  table.placement = '!htbp',
  format.args = list(big.mark = ',')
)

condition_hospital_tab

## Outcomes

In [None]:
# Row counts per race and breakthrough outcome; combinations that never occur
# are added with a count of 0.
race_breakthrough_count <-
  dplyr::group_by(df, race, covid) |>
  dplyr::tally(name = 'count') |>
  dplyr::ungroup() |>
  tidyr::complete(race, covid, fill = list(count = 0))

write_table(race_breakthrough_count, file.path(results_dir, "race_breakthrough_counts.csv"))

race_breakthrough_count

In [None]:
# Among rows with a breakthrough infection (`covid == 1`): row counts per race
# and hospitalization outcome; combinations that never occur are added with a
# count of 0.
race_hospital_count <-
  dplyr::filter(df, covid == 1) |>
  dplyr::group_by(race, hospital) |>
  dplyr::tally(name = 'count') |>
  dplyr::ungroup() |>
  tidyr::complete(race, hospital, fill = list(count = 0))

write_table(race_hospital_count, file.path(results_dir, "race_hospital_counts.csv"))

race_hospital_count

In [None]:
# Turn the 0/1 `covid` flag into a readable category for the x axis.
df_temp <- dplyr::mutate(
  df,
  covid_state = dplyr::if_else(covid == 1, 'breakthrough', 'no COVID')
)

# Bar chart of row counts per breakthrough state, axis labels rotated 45 degrees.
g <- ggplot(data = df_temp, mapping = aes(x = covid_state)) +
  geom_bar() +
  labs(title = "COVID within 180 days of vaccination") +
  theme_truveta() +
  scale_x_discrete(guide = guide_axis(angle = 45))

# Save the figure to the results directory, then display it.
write_ggplot(g, file.path(results_dir, "covid_breakthrough.png"))

g

In [None]:
# Keep only rows with a breakthrough infection and turn the 0/1 `hospital`
# flag into a readable category for the x axis.
df_temp <-
  dplyr::filter(df, covid == 1) |>
  dplyr::mutate(
    hospital_state = dplyr::if_else(hospital == 1, 'hospitalized', 'no hospital')
  )

# Bar chart of row counts per hospitalization state, axis labels rotated 45
# degrees.
g <- ggplot(data = df_temp, mapping = aes(x = hospital_state)) +
  geom_bar() +
  labs(title = "Hospitalized after breakthrough COVID") +
  theme_truveta() +
  scale_x_discrete(guide = guide_axis(angle = 45))

# Save the figure to the results directory, then display it.
write_ggplot(g, file.path(results_dir, "covid_hospitalized.png"))

g

### K-M curves (not very helpful)

In [None]:
# Survival curves for time to breakthrough infection (`outcome_time`, with
# `covid` as the event indicator), stratified by the `ckd` flag.
# The plot (with risk table) is only assigned to `g`; nothing in this cell
# prints or saves it.
km <- survfit(Surv(time = outcome_time, event = covid) ~ ckd, data = df)
g <- ggsurvplot(km, data = df, risk.table = TRUE)

In [None]:
# Same fit and plot, stratified by the `diabetes` flag.
km <- survfit(Surv(time = outcome_time, event = covid) ~ diabetes, data = df)
g <- ggsurvplot(km, data = df, risk.table = TRUE)

In [None]:
# Same fit and plot, stratified by the `immunocompromised` flag.
km <- survfit(Surv(time = outcome_time, event = covid) ~ immunocompromised, data = df)
g <- ggsurvplot(km, data = df, risk.table = TRUE)

In [None]:
# Same fit and plot, stratified by the `lung` flag.
km <- survfit(Surv(time = outcome_time, event = covid) ~ lung, data = df)
g <- ggsurvplot(km, data = df, risk.table = TRUE)

In [None]:
# Same fit and plot, stratified by every combination of the four flags.
km <- survfit(Surv(time = outcome_time, event = covid) ~ ckd + diabetes + immunocompromised + lung, data = df)
g <- ggsurvplot(km, data = df, risk.table = TRUE)