# Set Up

In [None]:
library(tidyverse)
library(fst)
library(bigrquery)
library(stringr)
library(lubridate)

In [None]:
# This snippet assumes that you run setup first

# This code copies a file from your Google Bucket into a dataframe

# replace 'test.csv' with the name of the file in your google bucket (don't delete the quotation marks)
name_of_file_in_bucket <- 'control_propensity_df_11032023_AG.fst'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket's data/ folder into the current workspace
# (TRUE is spelled out because `T` is an ordinary variable that can be reassigned)
system(paste0("gsutil cp ", my_bucket, "/data/", name_of_file_in_bucket, " ."), intern = TRUE)

# Load the file into a dataframe
control_propensity_df  <- read_fst(name_of_file_in_bucket)
head(control_propensity_df)

In [None]:
# This snippet assumes that you run setup first

# This code copies a file from your Google Bucket into a dataframe

# replace 'test.csv' with the name of the file in your google bucket (don't delete the quotation marks)
name_of_file_in_bucket <- 'cases_propensity_df_11032023_AG.fst'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket's data/ folder into the current workspace
# (TRUE is spelled out because `T` is an ordinary variable that can be reassigned)
system(paste0("gsutil cp ", my_bucket, "/data/", name_of_file_in_bucket, " ."), intern = TRUE)

# Load the file into a dataframe
cases_propensity_df  <- read_fst(name_of_file_in_bucket)
head(cases_propensity_df)

In [None]:
# This snippet assumes that you run setup first

# This code copies a file from your Google Bucket into a dataframe

# replace 'test.csv' with the name of the file in your google bucket (don't delete the quotation marks)
name_of_file_in_bucket <- 'controls_demo_df_11032023_AG.fst'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket's data/ folder into the current workspace
# (TRUE is spelled out because `T` is an ordinary variable that can be reassigned)
system(paste0("gsutil cp ", my_bucket, "/data/", name_of_file_in_bucket, " ."), intern = TRUE)

# Load the file into a dataframe
controls_demo_df  <- read_fst(name_of_file_in_bucket)
head(controls_demo_df)

In [None]:
# This snippet assumes that you run setup first

# This code copies a file from your Google Bucket into a dataframe

# replace 'test.csv' with the name of the file in your google bucket (don't delete the quotation marks)
name_of_file_in_bucket <- 'cases_demo_df_11032023_AG.fst'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket's data/ folder into the current workspace
# (TRUE is spelled out because `T` is an ordinary variable that can be reassigned)
system(paste0("gsutil cp ", my_bucket, "/data/", name_of_file_in_bucket, " ."), intern = TRUE)

# Load the file into a dataframe
cases_demo_df  <- read_fst(name_of_file_in_bucket)
head(cases_demo_df)

In [None]:
dim(control_propensity_df)

In [None]:
dim(controls_demo_df)

In [None]:
dim(cases_propensity_df)

In [None]:
dim(cases_demo_df)

In [None]:
total_propensity <- rbind(control_propensity_df, cases_propensity_df)
head(total_propensity)
dim(total_propensity)

In [None]:
total_treatment <- total_propensity[,c("person_id","Treatment")]
head(total_treatment)
dim(total_treatment)

In [None]:
total_demo <- rbind(controls_demo_df, cases_demo_df)
head(total_demo)
dim(total_demo)

In [None]:
colnames(total_demo)[1] <- "person_id"
head(total_demo)

In [None]:
merged_treat_demo <- merge(total_treatment, total_demo, by="person_id", all.x = TRUE)
head(merged_treat_demo)
dim(merged_treat_demo)

# Factor

## Sex At Birth

In [None]:
table(merged_treat_demo$Sex_at_Birth)

In [None]:
# Collapse every response that is neither Male nor Female into "Other".
# %in% never yields NA, so rows with a missing Sex_at_Birth are left untouched.
merged_treat_demo$Sex_at_Birth[
  merged_treat_demo$Sex_at_Birth %in% c("PMI_PreferNotToAnswer",
                                        "PMI_Skip",
                                        "SexAtBirth_Intersex",
                                        "SexAtBirth_SexAtBirthNoneOfThese",
                                        "Unspecified")
] <- "Other"

table(merged_treat_demo$Sex_at_Birth)

In [None]:
# Shorten the survey codes to plain labels
merged_treat_demo$Sex_at_Birth[
  merged_treat_demo$Sex_at_Birth %in% "SexAtBirth_Female"
] <- "Female"
merged_treat_demo$Sex_at_Birth[
  merged_treat_demo$Sex_at_Birth %in% "SexAtBirth_Male"
] <- "Male"

table(merged_treat_demo$Sex_at_Birth)

In [None]:
merged_treat_demo$f.sex <- factor(merged_treat_demo$Sex_at_Birth,
                levels = c("Male", "Female", "Other"))

## Race

In [None]:
table(merged_treat_demo$Race)

In [None]:
# Collapse every response other than Asian/Black/White into "Other".
# Each code is listed once; %in% never yields NA, so rows with a missing
# Race are left untouched.
merged_treat_demo$Race[
  merged_treat_demo$Race %in% c("AoUDRC_NoneIndicated",
                                "PMI_PreferNotToAnswer",
                                "WhatRaceEthnicity_GeneralizedMultPopulations",
                                "WhatRaceEthnicity_MENA",
                                "WhatRaceEthnicity_NHPI",
                                "PMI_Skip",
                                "WhatRaceEthnicity_RaceEthnicityNoneOfThese")
] <- "Other"

table(merged_treat_demo$Race)

In [None]:
# Shorten the survey codes to plain labels
merged_treat_demo$Race[
  merged_treat_demo$Race %in% "WhatRaceEthnicity_Asian"
] <- "Asian"
merged_treat_demo$Race[
  merged_treat_demo$Race %in% "WhatRaceEthnicity_Black"
] <- "Black"
merged_treat_demo$Race[
  merged_treat_demo$Race %in% "WhatRaceEthnicity_White"
] <- "White"

table(merged_treat_demo$Race)

In [None]:
merged_treat_demo$f.Race <- factor(merged_treat_demo$Race,
                levels = c("White", "Black", "Asian", "Other"))

## Ethnicity

In [None]:
table(merged_treat_demo$Hispanic)

In [None]:
# Collapse skipped / declined / none-of-these ethnicity responses into "Other"
merged_treat_demo$Hispanic[
  merged_treat_demo$Hispanic %in% c("PMI_PreferNotToAnswer",
                                    "PMI_Skip",
                                    "WhatRaceEthnicity_RaceEthnicityNoneOfThese")
] <- "Other"

table(merged_treat_demo$Hispanic)

In [None]:
merged_treat_demo$f.Ethnicity <- factor(merged_treat_demo$Hispanic,
                levels = c("Not Hispanic", "Hispanic", "Other"))

In [None]:
####### TAKE OUT NON-NECESSARY COLUMNS FROM MERGED_TREAT_DEMO

In [None]:
regression <- merged_treat_demo
head(regression)

In [None]:
# Remove the columns that are not needed downstream (raw concept ids, Gender,
# Age, Age_Group). Names that are not present in `regression` are ignored.
regression <- regression[
  ,
  !(names(regression) %in% c("gender_concept_id",
                             "Gender",
                             "gender_source_concept_id",
                             "sex_at_birth_concept_id",
                             "sex_at_birth_source_concept_id",
                             "race_concept_id",
                             "race_source_concept_id",
                             "ethnicity_concept_id",
                             "ethnicity_source_concept_id",
                             "Age",
                             "Age_Group")),
  drop = FALSE
]

head(regression)
dim(regression)

## Age

In [None]:
# This snippet assumes that you run setup first

# This code copies a file from your Google Bucket into a dataframe

# replace 'test.csv' with the name of the file in your google bucket (don't delete the quotation marks)
name_of_file_in_bucket <- 'acidosis_emergent_conditions_AG_10172023.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket's data/ folder into the current workspace
# (TRUE is spelled out because `T` is an ordinary variable that can be reassigned)
system(paste0("gsutil cp ", my_bucket, "/data/", name_of_file_in_bucket, " ."), intern = TRUE)

# Load the file into a dataframe
acidosis.emergent  <- read_csv(name_of_file_in_bucket)
head(acidosis.emergent)

In [None]:
acidosis <- acidosis.emergent %>%
  select(PERSON_ID, VISIT_START_DATETIME)
head(acidosis)
dim(acidosis)

In [None]:
# This snippet assumes that you run setup first

# This code copies a file from your Google Bucket into a dataframe

# replace 'test.csv' with the name of the file in your google bucket (don't delete the quotation marks)
name_of_file_in_bucket <- 'demographic_all.csv'


########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

 
# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')


# Copy the file from the bucket's data/ folder into the current workspace
# (TRUE is spelled out because `T` is an ordinary variable that can be reassigned)
system(paste0("gsutil cp ", my_bucket, "/data/", name_of_file_in_bucket, " ."), intern = TRUE)


# Load the file into a dataframe
demo  <- read_csv(name_of_file_in_bucket)
head(demo)

In [None]:
acidosis <- acidosis %>% 
       rename(person_id = PERSON_ID)
head(acidosis)

In [None]:
acid_demo <- merge(acidosis, demo, by="person_id", all.x = TRUE)
head(acid_demo)
dim(acid_demo)

In [None]:
acid_demo <- acid_demo %>%
  select(person_id, VISIT_START_DATETIME, year_of_birth)
head(acid_demo)
dim(acid_demo)

In [None]:
acid_demo$Index_Age <- year(acid_demo$VISIT_START_DATETIME) - as.numeric(acid_demo$year_of_birth)
head(acid_demo)
# Index age: age at the initial acidosis date

In [None]:
acid_demo$Index_Date <- as.Date(acid_demo$VISIT_START_DATETIME)
head(acid_demo)
dim(acid_demo)

In [None]:
summary(acid_demo$Index_Date)

In [None]:
# This snippet assumes that you run setup first

# This code copies a file from your Google Bucket into a dataframe

# replace 'test.csv' with the name of the file in your google bucket (don't delete the quotation marks)
name_of_file_in_bucket <- 'case_with_4controls_df_11032023_AG.fst'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket to the current workspace (currently disabled: read_fst below expects the file to already be present locally)
# system(paste0("gsutil cp ", my_bucket, "/data/", name_of_file_in_bucket, " ."), intern=T)

# Load the file into a dataframe
cases_with_4controls_df  <- read_fst(name_of_file_in_bucket)
head(cases_with_4controls_df)

In [None]:
dim(cases_with_4controls_df)

In [None]:
cases_with_4controls_df$Cases1 <- cases_with_4controls_df$Cases
head(cases_with_4controls_df)

In [None]:
library(tidyr)
library(dplyr)
library(readr)

In [None]:
cases_4controls_df <- cases_with_4controls_df %>% 
  pivot_longer(
    cols = !Cases)
head(cases_4controls_df)
dim(cases_4controls_df)

In [None]:
# Goal: find each patient's age at the index date.
# To do that, merge the long table (Cases, name, value) with the table that holds the index date,
## joining on the case's patient ID (column "Cases").

In [None]:
# Rename the join key person_id -> Cases by name instead of by position, so the
# rename cannot hit the wrong column if the column order changes, and is a
# harmless no-op if this cell is run a second time.
names(acid_demo)[names(acid_demo) == "person_id"] <- "Cases"

In [None]:
head(acid_demo)
dim(acid_demo)

In [None]:
test <- merge(cases_4controls_df, acid_demo, by="Cases", all.x = TRUE)
head(test)
dim(test)

In [None]:
test$year_of_birth <- NULL
test$Index_Age <- NULL
head(test)

In [None]:
length(unique(test$Cases))

In [None]:
length(unique(test$value))

In [None]:
test$Cases <- NULL
test$name <- NULL
head(test)

In [None]:
# `value` (created by pivot_longer) holds the case/control patient IDs; rename it
# by name rather than by position so the rename cannot hit the wrong column.
names(test)[names(test) == "value"] <- "person_id"
head(test)

In [None]:
# Now need to merge with demo data for age at index date
index_date_age <- merge(test, demo[,c("person_id", "year_of_birth")], by="person_id", all.x = TRUE)
head(index_date_age)
dim(index_date_age)

In [None]:
index_date_age$Index_Age <- year(index_date_age$Index_Date) - as.numeric(index_date_age$year_of_birth)
head(index_date_age)

In [None]:
# This snippet assumes that you run setup first

# This code saves your dataframe into a file (here an .fst file) in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
my_dataframe <- index_date_age

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename <- 'index_date_age_11032023_AG.fst'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# store the dataframe in current workspace
write_fst(my_dataframe, destination_filename)

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from current workspace to the bucket
# (TRUE is spelled out because `T` is an ordinary variable that can be reassigned)
system(paste0("gsutil cp ./", destination_filename, " ", my_bucket, "/data/"), intern = TRUE)

# Check if file is in the bucket
system(paste0("gsutil ls ", my_bucket, "/data/*.fst"), intern = TRUE)

In [None]:
# Create a factor
test_case <- case_when(
  index_date_age$Index_Age < 45 ~ "<45",
  index_date_age$Index_Age >= 45 & index_date_age$Index_Age < 55 ~ "45-54",
  index_date_age$Index_Age >= 55 & index_date_age$Index_Age < 65 ~ "55-64",
  index_date_age$Index_Age >= 65 ~ "65+")

In [None]:
table(test_case)

In [None]:
# Bin the age at index date into four groups. case_when() takes the first
# matching branch, so each later branch only sees ages not already binned;
# a missing Index_Age matches no branch and stays NA.
index_date_age <- index_date_age %>%
  mutate(Index_Age_Group = case_when(
    Index_Age < 45 ~ "<45",
    Index_Age < 55 ~ "45-54",
    Index_Age < 65 ~ "55-64",
    Index_Age >= 65 ~ "65+"))
head(index_date_age)

In [None]:
table(index_date_age$Index_Age_Group[index_date_age$person_id %in% 
                                     regression$person_id[regression$Treatment == 1]])

In [None]:
head(regression)

In [None]:
regression <- merge(regression, index_date_age[,c("person_id", "Index_Age", "Index_Age_Group")], by="person_id", all.x = TRUE)
head(regression)
dim(regression)

In [None]:
regression$f.age <- factor(regression$Index_Age_Group,
                levels = c("<45", "45-54", "55-64", "65+"))
head(regression)

## Comorbidities

### Load Dataset

In [None]:
# Upload the comorbidities csv file

# This snippet assumes that you run setup first

# This code copies a file from your Google Bucket into a dataframe

# replace 'test.csv' with the name of the file in your google bucket (don't delete the quotation marks)
name_of_file_in_bucket <- 'AUD_Summary_Comorbidity_v2.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket's data/ folder into the current workspace
# (TRUE is spelled out because `T` is an ordinary variable that can be reassigned)
system(paste0("gsutil cp ", my_bucket, "/data/", name_of_file_in_bucket, " ."), intern = TRUE)

# Load the file into a dataframe
como  <- read_csv(name_of_file_in_bucket)
head(como)

In [None]:
colnames(como)

In [None]:
como_regression <- como[,c("person_id", "Myocardial_Infarction", "Congestive_Heart_Failure", 
                           "Peripheral_Vascular_Disease", "Cerebrovascular_Disease",
                           "Dementia", "Chronic_Pulmonary_Disease",
                           "Rheumatic_Disease", "Peptic_Ulcer_Disease",
                           "Liver_Disease_Mild", "Liver_Disease_Moderate_Severe",
                           "Diabetes_without_Chronic_Complications", "Diabetes_with_Chronic_Complications",
                           "Hemiplegia_Paraplegia", "Renal_Disease_Mild_Moderate",
                           "Renal_Disease_Severe", "HIV", 
                           "Metastatic_Solid_Tumor", "Malignancy", "AIDS")]
head(como_regression)

### Trump Rules

In [None]:
# Hemiplegia/paraplegia trumps cerebrovascular disease:
# clear the cerebrovascular flag wherever the hemiplegia/paraplegia flag is 1
como_regression$Cerebrovascular_Disease[como_regression$Hemiplegia_Paraplegia %in% 1] <- 0

In [None]:
table(como_regression$Cerebrovascular_Disease)

In [None]:
table(como$Cerebrovascular_Disease)

In [None]:
# Liver disease, moderate-severe trumps liver disease, mild:
# clear the mild flag wherever the moderate-severe flag is 1
como_regression$Liver_Disease_Mild[como_regression$Liver_Disease_Moderate_Severe %in% 1] <- 0

In [None]:
table(como_regression$Liver_Disease_Mild)

In [None]:
table(como$Liver_Disease_Mild)

In [None]:
# Diabetes with complications trumps Diabetes, uncomplicated:
# clear the uncomplicated flag wherever the with-complications flag is 1
como_regression$Diabetes_without_Chronic_Complications[como_regression$Diabetes_with_Chronic_Complications %in% 1] <- 0

In [None]:
table(como_regression$Diabetes_without_Chronic_Complications)

In [None]:
table(como$Diabetes_without_Chronic_Complications)

In [None]:
# Renal disease severe trumps Renal disease, mild moderate:
# clear the mild-moderate flag wherever the severe flag is 1
como_regression$Renal_Disease_Mild_Moderate[como_regression$Renal_Disease_Severe %in% 1] <- 0

In [None]:
table(como_regression$Renal_Disease_Mild_Moderate)

In [None]:
table(como$Renal_Disease_Mild_Moderate)

In [None]:
# Metastatic Solid Tumor trumps malignancy:
# clear the malignancy flag wherever the metastatic flag is 1
como_regression$Malignancy[como_regression$Metastatic_Solid_Tumor %in% 1] <- 0

In [None]:
table(como_regression$Malignancy)

In [None]:
table(como$Malignancy)

In [None]:
# AIDS Trumps HIV:
# clear the HIV flag wherever the AIDS flag is 1
como_regression$HIV[como_regression$AIDS %in% 1] <- 0

In [None]:
table(como_regression$HIV)

In [None]:
table(como$HIV)

In [None]:
# merge this table with regression
regression <- merge(regression, como_regression, by="person_id", all.x = TRUE)
head(regression)
dim(regression)

## Metformin Use

In [None]:
#' Run a BigQuery SQL query and download its result.
#'
#' @param query SQL text to execute.
#' @return Whatever `bq_table_download()` returns for the query's result table.
download_data <- function(query) {
    # The project is taken from the GOOGLE_PROJECT environment variable
    project_id <- Sys.getenv('GOOGLE_PROJECT')
    result_table <- bq_project_query(project_id, query)
    bq_table_download(result_table)
}

In [None]:
dataset <- Sys.getenv("WORKSPACE_CDR")
drug_pid <- download_data(str_glue("SELECT distinct person_id
                                FROM {dataset}.drug_exposure ORDER BY person_id"))

In [None]:
dim(drug_pid)
head(drug_pid)

In [None]:
length(intersect(regression$person_id, unlist(drug_pid)))

In [None]:
class(regression$person_id[1])

In [None]:
metformin_regression <- regression[regression$person_id %in% unlist(drug_pid),]

In [None]:
dim(metformin_regression)

In [None]:
# This snippet assumes that you run setup first

# This code copies a file from your Google Bucket into a dataframe

# replace 'test.csv' with the name of the file in your google bucket (don't delete the quotation marks)
name_of_file_in_bucket <- 'Metformin_Medication_AG_10122023.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket's data/ folder into the current workspace
# (TRUE is spelled out because `T` is an ordinary variable that can be reassigned)
system(paste0("gsutil cp ", my_bucket, "/data/", name_of_file_in_bucket, " ."), intern = TRUE)

# Load the file into a dataframe
metformin_df  <- read_csv(name_of_file_in_bucket)
head(metformin_df)

In [None]:
dim(metformin_df)

In [None]:
metformin_cohort <- metformin_df[metformin_df$person_id %in% regression$person_id,]
head(metformin_cohort)
dim(metformin_cohort)

In [None]:
length(unique(metformin_cohort$person_id))

In [None]:
head(index_date_age)

In [None]:
merged_metformin_index_date <- merge(metformin_cohort[,c("person_id", "drug_exposure_start_datetime")],
                                     index_date_age[,c("person_id", "Index_Date")],
                                     by="person_id", all.x = TRUE)
head(merged_metformin_index_date)
dim(merged_metformin_index_date)

In [None]:
sum(is.na(merged_metformin_index_date$drug_exposure_start_datetime))

In [None]:
length(unique(merged_metformin_index_date$person_id))

In [None]:
merged_metformin_index_date$Drug_Exposure_Date <- as.Date(merged_metformin_index_date$drug_exposure_start_datetime)
head(merged_metformin_index_date)
dim(merged_metformin_index_date)

In [None]:
merged_metformin_index_date$Time_Diff <- difftime(merged_metformin_index_date$Drug_Exposure_Date,
                                                  merged_metformin_index_date$Index_Date,
                                                 units = c("days"))
head(merged_metformin_index_date)

In [None]:
# unit = days, convert to as.numeric 
merged_metformin_index_date$Time_Diff <- as.numeric(merged_metformin_index_date$Time_Diff)
head(merged_metformin_index_date)

In [None]:
summary(merged_metformin_index_date$Time_Diff)

In [None]:
# Keep exposures whose start date falls from 180 days before to 5 days after the
# index date (Time_Diff = exposure date - index date, in days). which() discards
# rows where Time_Diff is NA, matching what subset() does.
metformin_evidence <- merged_metformin_index_date[
  which(merged_metformin_index_date$Time_Diff >= -180 &
        merged_metformin_index_date$Time_Diff <= 5), ]
head(metformin_evidence)
dim(metformin_evidence)

In [None]:
metformin_evidence <- metformin_evidence[!duplicated(metformin_evidence$person_id),]
head(metformin_evidence)
dim(metformin_evidence)

In [None]:
# Create new column in which Metformin = 1
metformin_evidence <- metformin_evidence %>%
  mutate(Metformin = 1)
head(metformin_evidence)

In [None]:
# This snippet assumes that you run setup first

# This code saves your dataframe into a file (here an .fst file) in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
my_dataframe <- metformin_evidence

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename <- 'metformin_evidence_11032023_AG.fst'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# store the dataframe in current workspace
write_fst(my_dataframe, destination_filename)

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from current workspace to the bucket
# (TRUE is spelled out because `T` is an ordinary variable that can be reassigned)
system(paste0("gsutil cp ./", destination_filename, " ", my_bucket, "/data/"), intern = TRUE)

# Check if file is in the bucket
system(paste0("gsutil ls ", my_bucket, "/data/*.fst"), intern = TRUE)

In [None]:
# Find the patient IDs that do not have metformin use
# Upload the propensity dataframe

# This snippet assumes that you run setup first

# This code copies a file from your Google Bucket into a dataframe

# replace 'test.csv' with the name of the file in your google bucket (don't delete the quotation marks)
name_of_file_in_bucket <- 'propensity_matching_df_11032023AG.fst'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket's data/ folder into the current workspace
# (TRUE is spelled out because `T` is an ordinary variable that can be reassigned)
system(paste0("gsutil cp ", my_bucket, "/data/", name_of_file_in_bucket, " ."), intern = TRUE)

# Load the file into a dataframe
propensity_complete  <- read_fst(name_of_file_in_bucket)
head(propensity_complete)

In [None]:
dim(propensity_complete)

In [None]:
no_metformin <- setdiff(propensity_complete$person_id, metformin_evidence$person_id)
head(no_metformin)

In [None]:
length(intersect(unlist(no_metformin), metformin_evidence$person_id))

In [None]:
no_metformin <- data.frame(no_metformin)
colnames(no_metformin)[1] <- "person_id"
head(no_metformin)

In [None]:
no_metformin <- no_metformin %>%
  mutate(Metformin = 0)
head(no_metformin)
dim(no_metformin)

In [None]:
head(metformin_evidence)

In [None]:
metformin_evidence_pid <- metformin_evidence[,c("person_id","Metformin")]
head(metformin_evidence_pid)
dim(metformin_evidence_pid)

In [None]:
combined_metformin_usage <- rbind(no_metformin, metformin_evidence_pid)
head(combined_metformin_usage)
dim(combined_metformin_usage)

In [None]:
head(regression)
dim(regression)

In [None]:
regression <- merge(regression, combined_metformin_usage, by="person_id", all.x = TRUE)
head(regression)
dim(regression)

In [None]:
table(regression$Metformin)

# Regression Base Model

In [None]:
# THIS DATAFRAME HAS NA VALUES: DO NOT USE IN FINAL ANALYSIS. SEE C-LOGIT BM FOR OFFICIAL DF.

# This snippet assumes that you run setup first

# This code saves your dataframe into a file (here an .fst file) in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
my_dataframe <- regression

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename <- 'regression_df_11032023_AG.fst'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# store the dataframe in current workspace
write_fst(my_dataframe, destination_filename)

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from current workspace to the bucket
# (TRUE is spelled out because `T` is an ordinary variable that can be reassigned)
system(paste0("gsutil cp ./", destination_filename, " ", my_bucket, "/data/"), intern = TRUE)

# Check if file is in the bucket
system(paste0("gsutil ls ", my_bucket, "/data/*.fst"), intern = TRUE)

In [None]:
# This snippet assumes that you run setup first

# This code copies a file from your Google Bucket into a dataframe

# replace 'test.csv' with the name of the file in your google bucket (don't delete the quotation marks)
name_of_file_in_bucket <- 'regression_df_11032023_AG.fst'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket's data/ folder into the current workspace
# (TRUE is spelled out because `T` is an ordinary variable that can be reassigned)
system(paste0("gsutil cp ", my_bucket, "/data/", name_of_file_in_bucket, " ."), intern = TRUE)

# Load the file into a dataframe
regression  <- read_fst(name_of_file_in_bucket)
head(regression)

In [None]:
# For chenxi: Create a table that just has the PIDs
# `[[` pulls person_id out as a plain vector (what `regression[, "person_id"]`
# also yields on a data.frame, but explicit about it).
complete_cohort_pid_only <- regression[["person_id"]]
# A vector has no dim attribute (dim() would print NULL), so report its length.
length(complete_cohort_pid_only)
head(complete_cohort_pid_only)
# Number of distinct patients; equal to the length above when there are no duplicates
length(unique(complete_cohort_pid_only))

In [None]:
complete_cohort_pid_only <- data.frame(complete_cohort_pid_only)
head(complete_cohort_pid_only)
dim(complete_cohort_pid_only)

In [None]:
# This snippet assumes that you run setup first

# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
my_dataframe <- complete_cohort_pid_only

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename <- 'complete_cohort_pid_only_11032023_AG.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# store the dataframe in current workspace
write_csv(my_dataframe, destination_filename)

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from current workspace to the bucket
# (TRUE is spelled out because `T` is an ordinary variable that can be reassigned)
system(paste0("gsutil cp ./", destination_filename, " ", my_bucket, "/data/"), intern = TRUE)

# Check if file is in the bucket
system(paste0("gsutil ls ", my_bucket, "/data/*.csv"), intern = TRUE)

# Logistic Regression: BM

In [None]:
head(regression)
dim(regression)

In [None]:
colnames(regression)

In [None]:
# Model frame for the base logistic regression: outcome (Treatment), the
# demographic factors, the metformin flag and the comorbidity indicators.
# Every column is listed exactly once: selecting a name twice from a data.frame
# creates a redundant ".1" copy of that column.
input <- regression[, c("Treatment", "f.sex", "f.Race", "f.Ethnicity", "f.age", "Metformin",
                        "Myocardial_Infarction",
                        "Congestive_Heart_Failure", "Peripheral_Vascular_Disease",
                        "Cerebrovascular_Disease", "Dementia",
                        "Chronic_Pulmonary_Disease", "Rheumatic_Disease", "Peptic_Ulcer_Disease",
                        "Liver_Disease_Mild", "Liver_Disease_Moderate_Severe",
                        "Diabetes_without_Chronic_Complications",
                        "Diabetes_with_Chronic_Complications",
                        "Hemiplegia_Paraplegia", "Renal_Disease_Mild_Moderate",
                        "Renal_Disease_Severe", "HIV", "Metastatic_Solid_Tumor", "Malignancy",
                        "AIDS")]

# Logistic regression of Treatment on all predictors above. A repeated term in a
# formula is collapsed by R, so listing each predictor once fits the same model.
# NOTE(review): `regression` is flagged earlier as containing NA values; under
# R's default na.action, rows with an NA in any model variable are dropped.
am.data <- glm(formula = Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
                 Myocardial_Infarction +
                 Congestive_Heart_Failure + Peripheral_Vascular_Disease +
                 Cerebrovascular_Disease + Dementia +
                 Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
                 Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
                 Diabetes_without_Chronic_Complications + Diabetes_with_Chronic_Complications +
                 Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
                 Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS,
               data = input, family = binomial)

print(summary(am.data))

# C-Logit: BM

In [None]:
library(survival)

In [None]:
# This snippet assumes that you run setup first

# This code copies a file from your Google Bucket into a dataframe

# replace 'test.csv' with the name of the file in your google bucket (don't delete the quotation marks)
name_of_file_in_bucket <- 'case_with_4controls_df_11032023_AG.fst'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket to the current workspace (currently disabled: read_fst below expects the file to already be present locally)
# system(paste0("gsutil cp ", my_bucket, "/data/", name_of_file_in_bucket, " ."), intern=T)

# Load the file into a dataframe
cases_with_4controls_wide  <- read_fst(name_of_file_in_bucket)
head(cases_with_4controls_wide)

In [None]:
# Give every row of the wide table its own stratum id (1, 2, ...).
# seq_len() yields an empty sequence for a 0-row table, whereas 1:nrow() would
# yield c(1, 0) and make the assignment fail.
cases_with_4controls_wide$stratum <- seq_len(nrow(cases_with_4controls_wide))
head(cases_with_4controls_wide)

In [None]:
head(cases_4controls_df)

In [None]:
dim(cases_4controls_df)

In [None]:
merged_stratum_cases_4controls <- merge(cases_4controls_df,
                                        cases_with_4controls_wide[,c("Cases", "stratum")],
                                        by="Cases", all.x = TRUE)
head(merged_stratum_cases_4controls)
dim(merged_stratum_cases_4controls)

In [None]:
# `value` (created by pivot_longer) holds the case/control patient IDs; rename it
# by name rather than by position so the rename cannot hit the wrong column.
names(merged_stratum_cases_4controls)[names(merged_stratum_cases_4controls) == "value"] <- "person_id"
head(merged_stratum_cases_4controls)
dim(merged_stratum_cases_4controls)

In [None]:
# merge the stratum back into the regression table 
regression <- merge(regression, merged_stratum_cases_4controls[,c("person_id","stratum")],
                    by="person_id", all.x = TRUE)
head(regression)
dim(regression)

In [None]:
length(unique(regression$person_id))
length(unique(regression$stratum))

In [None]:
sum(is.na(regression))

In [None]:
# Replace NA with 0 only in the indicator columns (metformin use and the
# comorbidity flags), where NA presumably means the person had no row in the
# table that was left-joined in. A blanket `regression[is.na(regression)] <- 0`
# would also write 0 into character columns, fail on factor columns (0 is not a
# level, so the value stays NA with a warning), set a missing Index_Age to 0, and
# put everyone without a stratum into an artificial stratum 0.
indicator_cols <- intersect(
  c("Metformin", "Myocardial_Infarction", "Congestive_Heart_Failure",
    "Peripheral_Vascular_Disease", "Cerebrovascular_Disease",
    "Dementia", "Chronic_Pulmonary_Disease",
    "Rheumatic_Disease", "Peptic_Ulcer_Disease",
    "Liver_Disease_Mild", "Liver_Disease_Moderate_Severe",
    "Diabetes_without_Chronic_Complications", "Diabetes_with_Chronic_Complications",
    "Hemiplegia_Paraplegia", "Renal_Disease_Mild_Moderate",
    "Renal_Disease_Severe", "HIV",
    "Metastatic_Solid_Tumor", "Malignancy", "AIDS"),
  names(regression))
for (indicator in indicator_cols) {
  regression[[indicator]][is.na(regression[[indicator]])] <- 0
}
# Any NA still counted here sits in a non-indicator column (demographics, age,
# stratum) and needs an explicit decision rather than a silent 0.
sum(is.na(regression))

In [None]:
# Save base model for regression analysis: no NA values
## THIS IS THE DF TO USE IN YOUR ANALYSIS

# This snippet assumes that you run setup first

# This code saves your dataframe into a file (here an .fst file) in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
my_dataframe <- regression

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename <- 'regression_bm_11032023_AG.fst'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# store the dataframe in current workspace
write_fst(my_dataframe, destination_filename)

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from current workspace to the bucket
system(paste0("gsutil cp ./", destination_filename, " ", my_bucket, "/data/"), intern=T)

# Check if file is in the bucket
system(paste0("gsutil ls ", my_bucket, "/data/*.fst"), intern=T)

In [None]:
# This snippet assumes that you run setup first

# This code loads the base-model regression table into a dataframe

# Name of the .fst file to load (stored under data/ in the workspace bucket)
name_of_file_in_bucket <- 'regression_bm_11032023_AG.fst'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket into the current working directory.
# The copy below is commented out, so read_fst() relies on the file already
# being present in the working directory.
# system(paste0("gsutil cp ", my_bucket, "/data/", name_of_file_in_bucket, " ."), intern=T)

# Load the file into a dataframe and preview it
regression_bm  <- read_fst(name_of_file_in_bucket)
head(regression_bm)

In [None]:
# BASE CLOGIT MODEL:

In [None]:
# Base conditional logistic regression of Treatment on the base covariates,
# stratified by `stratum`. The fit is printed, not stored.
# Cerebrovascular_Disease is listed once; repeating a term in an R formula
# does not change the model.
# NOTE(review): clogit() comes from the survival package — confirm it is
# attached before this cell runs.
clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    strata(stratum),
  data = regression_bm
)

In [None]:
# Fit and store the base conditional logistic regression (base covariates,
# stratified by `stratum`) on regression_bm.
# Cerebrovascular_Disease is listed once; repeating a term in an R formula
# does not change the model.
regression_bm_clogit <- clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    strata(stratum),
  data = regression_bm
)

In [None]:
save(regression_bm_clogit, file = "regression_bm_clogit_11042023_AG.RData")

In [None]:
(summary(regression_bm_clogit))$coefficients

In [None]:
write.csv((summary(regression_bm_clogit))$coefficients)

# C-Logit: SDOH

In [None]:
library(survival)

In [None]:
# This snippet assumes that you run setup first

# This code copies the base-model regression table from the Google Bucket and
# loads it into a dataframe

# Name of the .fst file stored under data/ in the workspace bucket
name_of_file_in_bucket <- 'regression_bm_11032023_AG.fst'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket into the current working directory
# (intern = TRUE spelled out: `T` is an ordinary variable that can be reassigned)
system(paste0("gsutil cp ", my_bucket, "/data/", name_of_file_in_bucket, " ."), intern = TRUE)

# Load the file into a dataframe and preview it
regression_bm <- read_fst(name_of_file_in_bucket)
head(regression_bm)

In [None]:
# Load SDOH data:

In [None]:
# This snippet assumes that you run setup first

# This code copies the survey (SDOH) CSV from the Google Bucket and loads it
# into a dataframe

# Name of the .csv file stored under data/ in the workspace bucket
name_of_file_in_bucket <- 'Survey_Basics_Co_Lifestyle_shihui_1031.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket into the current working directory
# (intern = TRUE spelled out: `T` is an ordinary variable that can be reassigned)
system(paste0("gsutil cp ", my_bucket, "/data/", name_of_file_in_bucket, " ."), intern = TRUE)

# Load the file into a dataframe and preview it
survey_basics <- read_csv(name_of_file_in_bucket)
head(survey_basics)

In [None]:
# all of them and then lumping all the disability together 
# factor for disability - yes, no, or missing

### lumped disability 
# any disability? - general disability would be yes, if all answers are no, then no disability
#### missing if they have any missing data
## generate a column for disability - put default value as missing then - for pts who have
# yes in the first disability or second or third, then set that column as yes
# if pt has no in first AND second AND third, then set that column as no
# indexing 

## via email - he said what the reference one is 

## insurance - do in one model 



### include the base model and then add the extra thing

 

## Insurance

In [None]:
head(regression_bm)
dim(regression_bm)

In [None]:
# Load the insurance data:

# This snippet assumes that you run setup first

# This code copies the insurance table from the Google Bucket and loads it
# into a dataframe

# Name of the .fst file stored under data/ in the workspace bucket
name_of_file_in_bucket <- 'insu.fst'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket into the current working directory
# (intern = TRUE spelled out: `T` is an ordinary variable that can be reassigned)
system(paste0("gsutil cp ", my_bucket, "/data/", name_of_file_in_bucket, " ."), intern = TRUE)

# Load the file into a dataframe and preview it
insu <- read_fst(name_of_file_in_bucket)
head(insu)

In [None]:
dim(insu)

In [None]:
# Keep only the participant id and the three merged insurance variables
# (employer, Medicare, Medicaid) that enter the model.
insu <- insu[c("person_id", "empl.merge", "medcare.merge", "medicaid.merge")]
head(insu)

In [None]:
# Add the insurance variables to the base regression table by person_id;
# all.x = TRUE keeps every row of regression_bm.
regression_insu <- merge(
  x = regression_bm,
  y = insu,
  by = "person_id",
  all.x = TRUE
)
head(regression_insu)
dim(regression_insu)

In [None]:
# Conditional logistic regression: base covariates plus the three insurance
# variables, stratified by `stratum`. The fit is printed, not stored.
# Cerebrovascular_Disease is listed once; repeating a term in an R formula
# does not change the model.
clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    empl.merge + medcare.merge + medicaid.merge +
    strata(stratum),
  data = regression_insu
)

In [None]:
# Fit and store the insurance model: base covariates plus the three insurance
# variables, stratified by `stratum`, on regression_insu.
# Cerebrovascular_Disease is listed once; repeating a term in an R formula
# does not change the model.
insu_clogit <- clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    empl.merge + medcare.merge + medicaid.merge +
    strata(stratum),
  data = regression_insu
)

In [None]:
save(insu_clogit, file = "insu_clogit_11052023_AG.RData")

In [None]:
(summary(insu_clogit))$coefficients

In [None]:
write.csv((summary(insu_clogit))$coefficients)

## Lumped Disability

In [None]:
library(survival)

In [None]:
# Build one lumped disability flag per survey respondent:
#   "Yes"     - at least one of the six disability items is "Yes"
#   "No"      - all six disability items are "No"
#   "Missing" - anything else (the default)
disa <- data.frame(person_id = survey_basics[["person_id"]])
disa$disa <- "Missing"

# Any single "Yes" is enough
disa$disa[with(
  survey_basics,
  disability_deaf %in% "Yes" |
    disability_blind %in% "Yes" |
    disability_walking_climbing %in% "Yes" |
    disability_dressing_bathing %in% "Yes" |
    disability_errands_alone %in% "Yes" |
    disability_concentrating %in% "Yes"
)] <- "Yes"

# "No" requires an explicit "No" on every item
disa$disa[with(
  survey_basics,
  disability_deaf %in% "No" &
    disability_blind %in% "No" &
    disability_walking_climbing %in% "No" &
    disability_dressing_bathing %in% "No" &
    disability_errands_alone %in% "No" &
    disability_concentrating %in% "No"
)] <- "No"
head(disa)

In [None]:
head(regression_bm)
dim(regression_bm)

In [None]:
table(disa$disa)

In [None]:
# Add the lumped disability flag to the base regression table by person_id;
# all.x = TRUE keeps every row of regression_bm.
disa_regression <- merge(
  x = regression_bm,
  y = disa,
  by = "person_id",
  all.x = TRUE
)
head(disa_regression)
dim(disa_regression)

In [None]:
table(disa_regression$disa)

In [None]:
# Encode the lumped disability flag as a factor; "No" is the first level.
disa_regression[["f.disa"]] <- factor(
  disa_regression[["disa"]],
  levels = c("No", "Yes", "Missing")
)

In [None]:
# Save the lumped-disability regression table (disa_regression)
# This snippet assumes that you run setup first

# This code writes the dataframe to an .fst file and copies it into the
# "data" folder of the workspace Google Bucket

# Dataframe to store
my_dataframe <- disa_regression

# Name of the file to create in the bucket
destination_filename <- 'lumped_disa_regression_11122023_AG.fst'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# store the dataframe in current workspace
write_fst(my_dataframe, destination_filename)

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from current workspace to the bucket
# (intern = TRUE spelled out: `T` is an ordinary variable that can be reassigned)
system(paste0("gsutil cp ./", destination_filename, " ", my_bucket, "/data/"), intern = TRUE)

# Check if file is in the bucket
system(paste0("gsutil ls ", my_bucket, "/data/*.fst"), intern = TRUE)

In [None]:
# Conditional logistic regression: base covariates plus the lumped disability
# factor, stratified by `stratum`. The fit is printed, not stored.
# Cerebrovascular_Disease is listed once; repeating a term in an R formula
# does not change the model.
clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.disa +
    strata(stratum),
  data = disa_regression
)

In [None]:
# Save as R.Data for exportation for Forest Plot

In [None]:
# Fit and store the lumped-disability model: base covariates plus f.disa,
# stratified by `stratum`, on disa_regression.
# Cerebrovascular_Disease is listed once; repeating a term in an R formula
# does not change the model.
disa_regression_clogit <- clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.disa +
    strata(stratum),
  data = disa_regression
)

In [None]:
save(disa_regression_clogit, file = "disa_regression_clogit_11042023_AG.RData")

In [None]:
(summary(disa_regression_clogit))$coefficients

In [None]:
write.csv((summary(disa_regression_clogit))$coefficients)

## Education

In [None]:
head(survey_basics)

In [None]:
#Education: college graduate

In [None]:
table(survey_basics$education)

In [None]:
head(regression_bm)
dim(regression_bm)

In [None]:
# Join the survey answers onto the base regression table by person_id;
# all.x = TRUE keeps every row of regression_bm.
regression_sdoh2 <- merge(
  x = regression_bm,
  y = survey_basics,
  by = "person_id",
  all.x = TRUE
)
head(regression_sdoh2)
dim(regression_sdoh2)

In [None]:
table(regression_sdoh2$education)

In [None]:
colnames(regression_sdoh2)

In [None]:
# Collapse education categories:
#   GED, College                      -> "GED and College"
#   Advanced Degree, College Graduate -> "Advanced"
regression_sdoh2$education[
  regression_sdoh2$education %in% c("GED", "College")
] <- "GED and College"
regression_sdoh2$education[
  regression_sdoh2$education %in% c("Advanced Degree", "College Graduate")
] <- "Advanced"
table(regression_sdoh2$education)

In [None]:
# From original table: 2615 (ged) + 3660 (college) = 6275 (ged and college in new table)
## success!

In [None]:
# Encode the collapsed education variable as a factor; "Advanced" is the
# first level.
regression_sdoh2[["f.education"]] <- factor(
  regression_sdoh2[["education"]],
  levels = c(
    "Advanced", "Never Attended", "Below GED",
    "GED and College", "Missing"
  )
)

In [None]:
# Conditional logistic regression: base covariates plus the education factor,
# stratified by `stratum`. The fit is printed, not stored.
# Cerebrovascular_Disease is listed once; repeating a term in an R formula
# does not change the model.
clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.education +
    strata(stratum),
  data = regression_sdoh2
)

In [None]:
# Fit and store the education model: base covariates plus f.education,
# stratified by `stratum`, on regression_sdoh2.
# Cerebrovascular_Disease is listed once; repeating a term in an R formula
# does not change the model.
education_clogit3 <- clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.education +
    strata(stratum),
  data = regression_sdoh2
)

In [None]:
# Persist the fitted education model. The object saved must be
# education_clogit3 (the model fitted for this section) so that the file
# content matches its name.
save(education_clogit3, file = "education_clogit3_11052023_AG.RData")

In [None]:
(summary(education_clogit3))$coefficients

In [None]:
write.csv((summary(education_clogit3))$coefficients)

## Disability: Deaf

In [None]:
# Preview the merged regression/survey table used in this section
# (regression_sdoh2 is the table that receives the f.disability_* factors).
head(regression_sdoh2)
dim(regression_sdoh2)

In [None]:
# List the columns of the merged regression/survey table used in this section.
colnames(regression_sdoh2)

In [None]:
table(regression_sdoh2$disability_deaf)

In [None]:
# Encode the deaf-disability answer as a factor; "No" is the first level.
regression_sdoh2[["f.disability_deaf"]] <- factor(
  regression_sdoh2[["disability_deaf"]],
  levels = c("No", "Yes", "Missing")
)

In [None]:
colnames(regression_sdoh2)

In [None]:
# Conditional logistic regression: base covariates plus the deaf-disability
# factor, stratified by `stratum`. The fit is printed, not stored.
# Fitted on regression_sdoh2, the table on which f.disability_deaf is created.
# Cerebrovascular_Disease is listed once; a repeated formula term has no effect.
clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.disability_deaf +
    strata(stratum),
  data = regression_sdoh2
)

In [None]:
# Fit and store the deaf-disability model: base covariates plus
# f.disability_deaf, stratified by `stratum`.
# Fitted on regression_sdoh2, the table on which f.disability_deaf is created.
# Cerebrovascular_Disease is listed once; a repeated formula term has no effect.
disability_deaf_clogit <- clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.disability_deaf +
    strata(stratum),
  data = regression_sdoh2
)

In [None]:
save(disability_deaf_clogit, file = "disability_deaf_clogit_11042023_AG.RData")

In [None]:
(summary(disability_deaf_clogit))$coefficients

In [None]:
write.csv((summary(disability_deaf_clogit))$coefficients)

## Disability: Blind

In [None]:
# Preview the merged regression/survey table used in this section
# (regression_sdoh2 is the table that receives the f.disability_* factors).
head(regression_sdoh2)
dim(regression_sdoh2)
colnames(regression_sdoh2)

In [None]:
table(regression_sdoh2$disability_blind)

In [None]:
# Encode the blind-disability answer as a factor; "No" is the first level.
regression_sdoh2[["f.disability_blind"]] <- factor(
  regression_sdoh2[["disability_blind"]],
  levels = c("No", "Yes", "Missing")
)

In [None]:
# Conditional logistic regression: base covariates plus the blind-disability
# factor, stratified by `stratum`. The fit is printed, not stored.
# Fitted on regression_sdoh2, the table on which f.disability_blind is created.
# Cerebrovascular_Disease is listed once; a repeated formula term has no effect.
clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.disability_blind +
    strata(stratum),
  data = regression_sdoh2
)

In [None]:
# Fit and store the blind-disability model: base covariates plus
# f.disability_blind, stratified by `stratum`.
# Fitted on regression_sdoh2, the table on which f.disability_blind is created.
# Cerebrovascular_Disease is listed once; a repeated formula term has no effect.
disability_blind_clogit <- clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.disability_blind +
    strata(stratum),
  data = regression_sdoh2
)

In [None]:
save(disability_blind_clogit, file = "disability_blind_clogit_11042023_AG.RData")

In [None]:
(summary(disability_blind_clogit))$coefficients

In [None]:
write.csv((summary(disability_blind_clogit))$coefficients)

## Disability: Walking, Climbing

In [None]:
# Preview the merged regression/survey table used in this section
# (regression_sdoh2 is the table that receives the f.disability_* factors).
head(regression_sdoh2)
dim(regression_sdoh2)
colnames(regression_sdoh2)

In [None]:
# Tabulate the walking/climbing answers in regression_sdoh2, the table the
# factor for this section is built from.
table(regression_sdoh2$disability_walking_climbing)

In [None]:
# Encode the walking/climbing answer as a factor; "No" is the first level.
regression_sdoh2[["f.disability_walking_climbing"]] <- factor(
  regression_sdoh2[["disability_walking_climbing"]],
  levels = c("No", "Yes", "Missing")
)

In [None]:
# Conditional logistic regression: base covariates plus the walking/climbing
# disability factor, stratified by `stratum`. The fit is printed, not stored.
# Fitted on regression_sdoh2, the table on which the factor is created.
# Cerebrovascular_Disease is listed once; a repeated formula term has no effect.
clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.disability_walking_climbing +
    strata(stratum),
  data = regression_sdoh2
)

In [None]:
# Fit and store the walking/climbing model: base covariates plus
# f.disability_walking_climbing, stratified by `stratum`.
# Fitted on regression_sdoh2, the table on which the factor is created.
# Cerebrovascular_Disease is listed once; a repeated formula term has no effect.
disability_walking_climbing_clogit <- clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.disability_walking_climbing +
    strata(stratum),
  data = regression_sdoh2
)

In [None]:
save(disability_walking_climbing_clogit, file = "disability_walking_climbing_clogit_11042023_AG.RData")

In [None]:
(summary(disability_walking_climbing_clogit))$coefficients

In [None]:
write.csv((summary(disability_walking_climbing_clogit))$coefficients)

## Disability: Dressing, Bathing

In [None]:
# Preview the merged regression/survey table used in this section
# (regression_sdoh2 is the table that receives the f.disability_* factors).
head(regression_sdoh2)
dim(regression_sdoh2)
colnames(regression_sdoh2)

In [None]:
# Tabulate the dressing/bathing answers in regression_sdoh2 (the rows that
# enter the model), matching how the other disability sections tabulate.
table(regression_sdoh2$disability_dressing_bathing)

In [None]:
# Encode the dressing/bathing answer as a factor; "No" is the first level.
regression_sdoh2[["f.disability_dressing_bathing"]] <- factor(
  regression_sdoh2[["disability_dressing_bathing"]],
  levels = c("No", "Yes", "Missing")
)

In [None]:
# Conditional logistic regression: base covariates plus the dressing/bathing
# disability factor, stratified by `stratum`. The fit is printed, not stored.
# Fitted on regression_sdoh2, the table on which the factor is created.
# Cerebrovascular_Disease is listed once; a repeated formula term has no effect.
clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.disability_dressing_bathing +
    strata(stratum),
  data = regression_sdoh2
)

In [None]:
# Fit and store the dressing/bathing model: base covariates plus
# f.disability_dressing_bathing, stratified by `stratum`.
# Fitted on regression_sdoh2, the table on which the factor is created.
# Cerebrovascular_Disease is listed once; a repeated formula term has no effect.
disability_dressing_bathing_clogit <- clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.disability_dressing_bathing +
    strata(stratum),
  data = regression_sdoh2
)

In [None]:
save(disability_dressing_bathing_clogit, file = "disability_dressing_bathing_clogit_11042023_AG.RData")

In [None]:
(summary(disability_dressing_bathing_clogit))$coefficients

In [None]:
write.csv((summary(disability_dressing_bathing_clogit))$coefficients)

## Disability: Errands Alone

In [None]:
head(regression_sdoh2)
dim(regression_sdoh2)
colnames(regression_sdoh2)

In [None]:
table(regression_sdoh2$disability_errands_alone)

In [None]:
# Encode the errands-alone answer as a factor; "No" is the first level.
regression_sdoh2[["f.disability_errands_alone"]] <- factor(
  regression_sdoh2[["disability_errands_alone"]],
  levels = c("No", "Yes", "Missing")
)

In [None]:
# Conditional logistic regression: base covariates plus the errands-alone
# disability factor, stratified by `stratum`. The fit is printed, not stored.
# Fitted on regression_sdoh2, the table on which the factor is created.
# Cerebrovascular_Disease is listed once; a repeated formula term has no effect.
clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.disability_errands_alone +
    strata(stratum),
  data = regression_sdoh2
)

In [None]:
# Fit and store the errands-alone model: base covariates plus
# f.disability_errands_alone, stratified by `stratum`.
# Fitted on regression_sdoh2, the table on which the factor is created.
# Cerebrovascular_Disease is listed once; a repeated formula term has no effect.
disability_errands_alone_clogit <- clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.disability_errands_alone +
    strata(stratum),
  data = regression_sdoh2
)

In [None]:
save(disability_errands_alone_clogit, file = "disability_errands_alone_clogit_11042023_AG.RData")

In [None]:
(summary(disability_errands_alone_clogit))$coefficients

In [None]:
write.csv((summary(disability_errands_alone_clogit))$coefficients)

## Disability: Concentrating

In [None]:
# Preview the merged regression/survey table used in this section
# (regression_sdoh2 is the table that receives the f.disability_* factors).
head(regression_sdoh2)
dim(regression_sdoh2)
colnames(regression_sdoh2)

In [None]:
table(regression_sdoh2$disability_concentrating)

In [None]:
# Encode the concentrating answer as a factor; "No" is the first level.
regression_sdoh2[["f.disability_concentrating"]] <- factor(
  regression_sdoh2[["disability_concentrating"]],
  levels = c("No", "Yes", "Missing")
)

In [None]:
# Conditional logistic regression: base covariates plus the concentrating
# disability factor, stratified by `stratum`. The fit is printed, not stored.
# Fitted on regression_sdoh2, the table on which the factor is created.
# Cerebrovascular_Disease is listed once; a repeated formula term has no effect.
clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.disability_concentrating +
    strata(stratum),
  data = regression_sdoh2
)

In [None]:
# Fit and store the concentrating model: base covariates plus
# f.disability_concentrating, stratified by `stratum`.
# Fitted on regression_sdoh2, the table on which the factor is created.
# Cerebrovascular_Disease is listed once; a repeated formula term has no effect.
disability_concentrating_clogit <- clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.disability_concentrating +
    strata(stratum),
  data = regression_sdoh2
)

In [None]:
save(disability_concentrating_clogit, file = "disability_concentrating_clogit_11042023_AG.RData")

In [None]:
(summary(disability_concentrating_clogit))$coefficients

In [None]:
write.csv((summary(disability_concentrating_clogit))$coefficients)

## Employment

In [None]:
#Employment: employed

In [None]:
head(regression_sdoh2)
dim(regression_sdoh2)
colnames(regression_sdoh2)

In [None]:
table(regression_sdoh2$employment)

In [None]:
# Collapse Homemaker, Retired and Self-employed into a single "Others"
# employment category, then re-tabulate.
regression_sdoh2$employment[
  regression_sdoh2$employment %in% c("Homemaker", "Retired", "Self-employed")
] <- "Others"
table(regression_sdoh2$employment)

In [None]:
# 470 + 4147 + 656 = 5273, the number in others = success!

In [None]:
# Encode the collapsed employment variable as a factor; "Employed" is the
# first level.
regression_sdoh2[["f.employment"]] <- factor(
  regression_sdoh2[["employment"]],
  levels = c(
    "Employed", "Student", "Unemployed",
    "Others", "Missing"
  )
)

In [None]:
# Conditional logistic regression: base covariates plus the employment factor,
# stratified by `stratum`. The fit is printed, not stored.
# Cerebrovascular_Disease is listed once; repeating a term in an R formula
# does not change the model.
clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.employment +
    strata(stratum),
  data = regression_sdoh2
)

In [None]:
# Fit and store the employment model: base covariates plus f.employment,
# stratified by `stratum`, on regression_sdoh2.
# Cerebrovascular_Disease is listed once; repeating a term in an R formula
# does not change the model.
employment_clogit2 <- clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.employment +
    strata(stratum),
  data = regression_sdoh2
)

In [None]:
save(employment_clogit2, file = "employment_clogit2_11052023_AG.RData")

In [None]:
(summary(employment_clogit2))$coefficients

In [None]:
write.csv((summary(employment_clogit2))$coefficients)

## Income

In [None]:
# Income: baseline was 50k-75k (the national median) ---> changing it to the merged 35k-100k bracket

In [None]:
head(regression_sdoh2)
dim(regression_sdoh2)
colnames(regression_sdoh2)

In [None]:
table(regression_sdoh2$income)

In [None]:
# Collapse the three middle income brackets into one "35k-100k" category,
# then tabulate the column to check the recode.
regression_sdoh2$income[
  regression_sdoh2$income %in% c('35k-50k', '50k-75k', '75k-100k')
] <- "35k-100k"
table(regression_sdoh2$income)

In [None]:
# Factor income
regression_sdoh2$f.income <- factor(regression_sdoh2$income,
                levels = c("35k-100k", "less 10k", "10k-25k", "25k-35k",
                          "100k-150k", "150k-200k",
                          "more than 200k", "Missing"))

In [None]:
# Exploratory fit (printed, not stored): conditional logistic regression of
# Treatment on demographics, metformin use, comorbidity indicators and
# income bracket, stratified by `stratum`.
# Cerebrovascular_Disease is listed once; R collapses duplicated formula
# terms, so the fitted model is the same as with the term repeated.
clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.income +
    strata(stratum),
  data = regression_sdoh2
)

In [None]:
# Stored fit of the income model: conditional logistic regression of
# Treatment on demographics, metformin use, comorbidity indicators and
# income bracket, stratified by `stratum`.
# Cerebrovascular_Disease is listed once; R collapses duplicated formula
# terms, so the fitted model is the same as with the term repeated.
income_clogit2 <- clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.income +
    strata(stratum),
  data = regression_sdoh2
)

In [None]:
save(income_clogit2, file = "income_clogit2_11052023_AG.RData")

In [None]:
(summary(income_clogit2))$coefficients

In [None]:
write.csv((summary(income_clogit2))$coefficients)

## Housing

In [None]:
#Housing: owner

In [None]:
head(regression_sdoh2)
dim(regression_sdoh2)
colnames(regression_sdoh2)

In [None]:
table(regression_sdoh2$housing)

In [None]:
# Factor housing status. "Own" is listed first, so it is the reference level
# under R's default treatment contrasts; any value of `housing` not listed in
# `levels` becomes NA.
regression_sdoh2$f.housing <- factor(regression_sdoh2$housing,
                levels = c("Own", "Rent", "Others", "Missing"))

In [None]:
# Exploratory fit (printed, not stored): conditional logistic regression of
# Treatment on demographics, metformin use, comorbidity indicators and
# housing status, stratified by `stratum`.
# Cerebrovascular_Disease is listed once; R collapses duplicated formula
# terms, so the fitted model is the same as with the term repeated.
clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.housing +
    strata(stratum),
  data = regression_sdoh2
)

In [None]:
# Stored fit of the housing model: conditional logistic regression of
# Treatment on demographics, metformin use, comorbidity indicators and
# housing status, stratified by `stratum`.
# Fit on regression_sdoh2: that is the data frame on which f.housing is
# created in this section and the one the exploratory fit uses. The original
# cell pointed at regression_sdoh, which only works if an f.housing column
# happens to exist there from an earlier session.
# Cerebrovascular_Disease is listed once; R collapses duplicated formula
# terms, so dropping the repeat does not change the model.
housing_clogit <- clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.housing +
    strata(stratum),
  data = regression_sdoh2
)

In [None]:
save(housing_clogit, file = "housing_clogit_11042023_AG.RData")

In [None]:
(summary(housing_clogit))$coefficients

In [None]:
write.csv((summary(housing_clogit))$coefficients)

## Stable House

In [None]:
#Stable housing: no = baseline (question was whether or not you have to worry about housing
# stability)

In [None]:
head(regression_sdoh2)
dim(regression_sdoh2)
colnames(regression_sdoh2)

In [None]:
table(regression_sdoh2$stable_house)

In [None]:
# Factor housing stability. "No" is listed first, so it is the reference level
# under R's default treatment contrasts; any value of `stable_house` not
# listed in `levels` becomes NA.
regression_sdoh2$f.stable_house <- factor(regression_sdoh2$stable_house,
                levels = c("No", "Yes", "Missing"))

In [None]:
# Exploratory fit (printed, not stored): conditional logistic regression of
# Treatment on demographics, metformin use, comorbidity indicators and
# housing stability, stratified by `stratum`.
# Cerebrovascular_Disease is listed once; R collapses duplicated formula
# terms, so the fitted model is the same as with the term repeated.
clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.stable_house +
    strata(stratum),
  data = regression_sdoh2
)

In [None]:
# Stored fit of the housing-stability model: conditional logistic regression
# of Treatment on demographics, metformin use, comorbidity indicators and
# housing stability, stratified by `stratum`.
# Cerebrovascular_Disease is listed once; R collapses duplicated formula
# terms, so the fitted model is the same as with the term repeated.
stable_house_clogit2 <- clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.stable_house +
    strata(stratum),
  data = regression_sdoh2
)

In [None]:
save(stable_house_clogit2, file = "stable_house_clogit2_11212023_AG.RData")

In [None]:
(summary(stable_house_clogit2))$coefficients

In [None]:
write.csv((summary(stable_house_clogit2))$coefficients)

## Regression files

In [None]:
# Load the BASE MODEL regression table into `regression_bm`.
# Assumes the setup cell has been run (fst loaded, WORKSPACE_BUCKET set).

# Name of the .fst file, both locally and under <bucket>/data/
name_of_file_in_bucket <- 'regression_bm_11032023_AG.fst'

# ---- Template code below: normally no need to edit ----

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# The download step was commented out in the original cell, so the read below
# failed whenever the file had not already been copied into the working
# directory. Fetch it from the bucket only when no local copy exists.
if (!file.exists(name_of_file_in_bucket)) {
  system(
    paste0("gsutil cp ", my_bucket, "/data/", name_of_file_in_bucket, " ."),
    intern = TRUE
  )
}

# Load the local file into a dataframe and preview it
regression_bm <- read_fst(name_of_file_in_bucket)
head(regression_bm)

In [None]:
dim(regression_bm)

In [None]:
# Save regression_sdoh file round 1 --> NO MERGING OF FACTORS
# Writes the dataframe to an .fst file and copies it into the "data" folder
# of the workspace bucket.
# Assumes the setup cell has been run (fst loaded, WORKSPACE_BUCKET set).

# Dataframe to save
my_dataframe <- regression_sdoh

# File name to use locally and in the bucket
destination_filename <- 'regression_sdoh_11042023_AG.fst'

# ---- Template code below: normally no need to edit ----

# Store the dataframe in the current working directory
write_fst(my_dataframe, destination_filename)

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the local file into <bucket>/data/
system(
  paste0("gsutil cp ./", destination_filename, " ", my_bucket, "/data/"),
  intern = TRUE
)

# List the .fst files in the bucket to confirm the upload
system(paste0("gsutil ls ", my_bucket, "/data/*.fst"), intern = TRUE)

In [None]:
# Load the round-1 SDOH regression table into `regression_sdoh`.
# Assumes the setup cell has been run (fst loaded, WORKSPACE_BUCKET set).

# Name of the .fst file stored under <bucket>/data/
name_of_file_in_bucket <- 'regression_sdoh_11042023_AG.fst'

# ---- Template code below: normally no need to edit ----

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket into the current working directory
system(
  paste0("gsutil cp ", my_bucket, "/data/", name_of_file_in_bucket, " ."),
  intern = TRUE
)

# Load the local file into a dataframe and preview it
regression_sdoh <- read_fst(name_of_file_in_bucket)
head(regression_sdoh)

In [None]:
colnames(regression_sdoh2)
head(regression_sdoh2)
dim(regression_sdoh2)

In [None]:
# THIS IS THE MERGING OF FACTORS DATAFRAME, regression_sdoh2.
# Writes the dataframe to an .fst file and copies it into the "data" folder
# of the workspace bucket.
# Assumes the setup cell has been run (fst loaded, WORKSPACE_BUCKET set).

# Dataframe to save
my_dataframe <- regression_sdoh2

# File name to use locally and in the bucket
destination_filename <- 'regression_sdoh2_11122023_AG.fst'

# ---- Template code below: normally no need to edit ----

# Store the dataframe in the current working directory
write_fst(my_dataframe, destination_filename)

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the local file into <bucket>/data/
system(
  paste0("gsutil cp ./", destination_filename, " ", my_bucket, "/data/"),
  intern = TRUE
)

# List the .fst files in the bucket to confirm the upload
system(paste0("gsutil ls ", my_bucket, "/data/*.fst"), intern = TRUE)


In [None]:
# Load the merged-factor SDOH regression table (11122023 version) into
# `regression_sdoh2`.
# Assumes the setup cell has been run (fst loaded, WORKSPACE_BUCKET set).

# Name of the .fst file stored under <bucket>/data/
name_of_file_in_bucket <- 'regression_sdoh2_11122023_AG.fst'

# ---- Template code below: normally no need to edit ----

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket into the current working directory
system(
  paste0("gsutil cp ", my_bucket, "/data/", name_of_file_in_bucket, " ."),
  intern = TRUE
)

# Load the local file into a dataframe and preview it
regression_sdoh2 <- read_fst(name_of_file_in_bucket)
head(regression_sdoh2)

In [None]:
colnames(regression_sdoh2)

In [None]:
regression_sdoh2$f.stable_house <- NULL

In [None]:
colnames(regression_sdoh2)

In [None]:
# Third version of regression sdoh data (this one has the fixed stable housing)
# Writes the dataframe to an .fst file and copies it into the "data" folder
# of the workspace bucket.
# Assumes the setup cell has been run (fst loaded, WORKSPACE_BUCKET set).

# Dataframe to save
my_dataframe <- regression_sdoh2

# File name to use locally and in the bucket
destination_filename <- 'regression_sdoh2_11212023_AG.fst'

# ---- Template code below: normally no need to edit ----

# Store the dataframe in the current working directory
write_fst(my_dataframe, destination_filename)

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the local file into <bucket>/data/
system(
  paste0("gsutil cp ./", destination_filename, " ", my_bucket, "/data/"),
  intern = TRUE
)

# List the .fst files in the bucket to confirm the upload
system(paste0("gsutil ls ", my_bucket, "/data/*.fst"), intern = TRUE)


In [None]:
# Load the third version of the SDOH regression table (11212023) into
# `regression_sdoh2`.
# Assumes the setup cell has been run (fst loaded, WORKSPACE_BUCKET set).

# Name of the .fst file stored under <bucket>/data/
name_of_file_in_bucket <- 'regression_sdoh2_11212023_AG.fst'

# ---- Template code below: normally no need to edit ----

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket into the current working directory
system(
  paste0("gsutil cp ", my_bucket, "/data/", name_of_file_in_bucket, " ."),
  intern = TRUE
)

# Load the local file into a dataframe and preview it
regression_sdoh2 <- read_fst(name_of_file_in_bucket)
head(regression_sdoh2)

In [None]:
dim(regression_sdoh)

In [None]:
colnames(regression_sdoh)

# Genotypes

## Clean Dataset

In [None]:
# Load the genotype table into `genotype_og`.
# Assumes the setup cell has been run (tidyverse loaded, WORKSPACE_BUCKET set).

# Name of the .csv file stored under <bucket>/data/
name_of_file_in_bucket <- 'Acidosis_genotype_CX_11292023.csv'

# ---- Template code below: normally no need to edit ----

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket into the current working directory
system(
  paste0("gsutil cp ", my_bucket, "/data/", name_of_file_in_bucket, " ."),
  intern = TRUE
)

# Load the local file into a dataframe and preview it
genotype_og <- read_csv(name_of_file_in_bucket)
head(genotype_og)

In [None]:
library(survival)

In [None]:
dim(genotype_og)

## MATE1: rs2289669 

In [None]:
rs2289669 <- subset(genotype_og, (locus == 'chr17:19560030'))
head(rs2289669)
dim(rs2289669)

In [None]:
# Count distinct participants with a row at this locus (compare with the row
# count from dim(rs2289669) to spot duplicates).
# other data may have duplicates - will need to remove duplicates and choose what one is better
# can do this manually
length(unique(rs2289669$person_id))

In [None]:
# Make the df smaller so that it only includes participants in the acidosis cohort 

In [None]:
# Load the list of cohort participant ids into `cohort_pid`.
# Assumes the setup cell has been run (tidyverse loaded, WORKSPACE_BUCKET set).

# Name of the .csv file stored under <bucket>/data/
name_of_file_in_bucket <- 'complete_cohort_pid_only_11032023_AG.csv'

# ---- Template code below: normally no need to edit ----

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket into the current working directory
system(
  paste0("gsutil cp ", my_bucket, "/data/", name_of_file_in_bucket, " ."),
  intern = TRUE
)

# Load the local file into a dataframe and preview it
cohort_pid <- read_csv(name_of_file_in_bucket)
head(cohort_pid)

In [None]:
dim(cohort_pid)

In [None]:
colnames(cohort_pid)[1] <- "person_id"
head(cohort_pid)

In [None]:
rs2289669 <- merge(cohort_pid, rs2289669, by="person_id", all.x = TRUE)
head(rs2289669)
dim(rs2289669)

In [None]:
table(rs2289669$GT)

In [None]:
table(is.na(rs2289669$GT))

In [None]:
# Build a (person_id, GT) table for rs2289669. Only the calls "0/0", "0/1"
# and "1/1" are kept; every other value, including NA introduced by the left
# join, is labelled "Missing".
rs2289669_factor <- data.frame(person_id = rs2289669[["person_id"]])
rs2289669_factor$GT <- ifelse(
  rs2289669$GT %in% c("0/0", "0/1", "1/1"),
  rs2289669$GT,
  "Missing"
)

head(rs2289669_factor)

In [None]:
table(rs2289669_factor$GT)

In [None]:
rs2289669_factor$f.rs2289669 <- factor(rs2289669_factor$GT,
                levels = c("0/0", "0/1", "1/1", "Missing"))

In [None]:
# Load the BASE MODEL regression table into `regression_bm`.
# Assumes the setup cell has been run (fst loaded, WORKSPACE_BUCKET set).

# Name of the .fst file, both locally and under <bucket>/data/
name_of_file_in_bucket <- 'regression_bm_11032023_AG.fst'

# ---- Template code below: normally no need to edit ----

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# The download step was commented out in the original cell, so the read below
# failed whenever the file had not already been copied into the working
# directory. Fetch it from the bucket only when no local copy exists.
if (!file.exists(name_of_file_in_bucket)) {
  system(
    paste0("gsutil cp ", my_bucket, "/data/", name_of_file_in_bucket, " ."),
    intern = TRUE
  )
}

# Load the local file into a dataframe and preview it
regression_bm <- read_fst(name_of_file_in_bucket)
head(regression_bm)

In [None]:
rs2289669_factor <- merge(regression_bm, rs2289669_factor, by="person_id", all.x = TRUE)
head(rs2289669_factor)
dim(rs2289669_factor)

In [None]:
# Exploratory fit (printed, not stored): conditional logistic regression of
# Treatment on demographics, metformin use, comorbidity indicators and the
# rs2289669 genotype, stratified by `stratum`.
# Cerebrovascular_Disease is listed once; R collapses duplicated formula
# terms, so the fitted model is the same as with the term repeated.
clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.rs2289669 +
    strata(stratum),
  data = rs2289669_factor
)

In [None]:
# Stored fit of the rs2289669 model: conditional logistic regression of
# Treatment on demographics, metformin use, comorbidity indicators and the
# rs2289669 genotype, stratified by `stratum`.
# NOTE(review): the object name carries an extra trailing 0 (rs22896690 vs
# rs2289669). It is kept because later cells and the saved .RData file use it.
# Cerebrovascular_Disease is listed once; R collapses duplicated formula
# terms, so the fitted model is the same as with the term repeated.
rs22896690_clogit <- clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.rs2289669 +
    strata(stratum),
  data = rs2289669_factor
)

In [None]:
save(rs22896690_clogit, file = "rs22896690_clogit_12022023_AG.RData")

In [None]:
(summary(rs22896690_clogit))$coefficients

In [None]:
write.csv((summary(rs22896690_clogit))$coefficients)

## MATE1: rs8065082  

In [None]:
# rs8065082 locus: chr17:19561878
# (Kept as a comment: a bare `17:19561878` is evaluated by R as an integer
# sequence from 17 to 19561878 and printed.)

In [None]:
rs8065082 <- subset(genotype_og, (locus == 'chr17:19561878'))
head(rs8065082)
dim(rs8065082)

In [None]:
length(unique(rs8065082$person_id))

In [None]:
# Left-join the genotype rows onto the cohort id list so every cohort
# participant is present (GT is NA when no call exists). The id list is used
# here, as for the other SNPs, rather than regression_bm: regression_bm is
# joined again after the genotype factor is built, and joining it twice would
# multiply rows for any person_id that appears more than once in it.
rs8065082 <- merge(cohort_pid, rs8065082, by = "person_id", all.x = TRUE)
head(rs8065082)
dim(rs8065082)

In [None]:
table(rs8065082$GT)

In [None]:
table(is.na(rs8065082$GT))

In [None]:
# Build a (person_id, GT) table for rs8065082. Only the calls "0/0", "0/1"
# and "1/1" are kept; every other value, including NA introduced by the left
# join, is labelled "Missing".
rs8065082_factor <- data.frame(person_id = rs8065082[["person_id"]])
rs8065082_factor$GT <- ifelse(
  rs8065082$GT %in% c("0/0", "0/1", "1/1"),
  rs8065082$GT,
  "Missing"
)

head(rs8065082_factor)

In [None]:
table(rs8065082_factor$GT)

In [None]:
rs8065082_factor$f.rs8065082 <- factor(rs8065082_factor$GT,
                levels = c("0/0", "0/1", "1/1", "Missing"))

In [None]:
rs8065082_factor <- merge(regression_bm, rs8065082_factor, by="person_id", all.x = TRUE)
head(rs8065082_factor)
dim(rs8065082_factor)

In [None]:
# Exploratory fit (printed, not stored): conditional logistic regression of
# Treatment on demographics, metformin use, comorbidity indicators and the
# rs8065082 genotype, stratified by `stratum`.
# Cerebrovascular_Disease is listed once; R collapses duplicated formula
# terms, so the fitted model is the same as with the term repeated.
clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.rs8065082 +
    strata(stratum),
  data = rs8065082_factor
)

In [None]:
# Stored fit of the rs8065082 model: conditional logistic regression of
# Treatment on demographics, metformin use, comorbidity indicators and the
# rs8065082 genotype, stratified by `stratum`.
# Cerebrovascular_Disease is listed once; R collapses duplicated formula
# terms, so the fitted model is the same as with the term repeated.
rs8065082_clogit <- clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.rs8065082 +
    strata(stratum),
  data = rs8065082_factor
)

In [None]:
save(rs8065082_clogit, file = "rs8065082_clogit_12022023_AG.RData")

In [None]:
(summary(rs8065082_clogit))$coefficients

In [None]:
write.csv((summary(rs8065082_clogit))$coefficients)

## MATE2K: rs12943590   

In [None]:
rs12943590 <- subset(genotype_og, (locus == 'chr17:19716685'))
head(rs12943590)
dim(rs12943590)

In [None]:
length(unique(rs12943590$person_id))

In [None]:
rs12943590 <- merge(cohort_pid, rs12943590, by="person_id", all.x = TRUE)
head(rs12943590)
dim(rs12943590)

In [None]:
table(rs12943590$GT)

In [None]:
table(is.na(rs12943590$GT))

In [None]:
# Build a (person_id, GT) table for rs12943590. Only the calls "0/0", "0/1"
# and "1/1" are kept; every other value, including NA introduced by the left
# join, is labelled "Missing".
rs12943590_factor <- data.frame(person_id = rs12943590[["person_id"]])
rs12943590_factor$GT <- ifelse(
  rs12943590$GT %in% c("0/0", "0/1", "1/1"),
  rs12943590$GT,
  "Missing"
)

head(rs12943590_factor)

In [None]:
table(rs12943590_factor$GT)

In [None]:
rs12943590_factor$f.rs12943590 <- factor(rs12943590_factor$GT,
                levels = c("0/0", "0/1", "1/1", "Missing"))

In [None]:
rs12943590_factor <- merge(regression_bm, rs12943590_factor, by="person_id", all.x = TRUE)
head(rs12943590_factor)
dim(rs12943590_factor)

In [None]:
# Exploratory fit (printed, not stored): conditional logistic regression of
# Treatment on demographics, metformin use, comorbidity indicators and the
# rs12943590 genotype, stratified by `stratum`.
# Cerebrovascular_Disease is listed once; R collapses duplicated formula
# terms, so the fitted model is the same as with the term repeated.
clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.rs12943590 +
    strata(stratum),
  data = rs12943590_factor
)

In [None]:
# Stored fit of the rs12943590 model: conditional logistic regression of
# Treatment on demographics, metformin use, comorbidity indicators and the
# rs12943590 genotype, stratified by `stratum`.
# Cerebrovascular_Disease is listed once; R collapses duplicated formula
# terms, so the fitted model is the same as with the term repeated.
rs12943590_clogit <- clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.rs12943590 +
    strata(stratum),
  data = rs12943590_factor
)

In [None]:
save(rs12943590_clogit, file = "rs12943590_clogit_12022023_AG.RData")

In [None]:
(summary(rs12943590_clogit))$coefficients

In [None]:
write.csv((summary(rs12943590_clogit))$coefficients)

## OCTN1: rs272893   

In [None]:
rs272893 <- subset(genotype_og, (locus == 'chr5:132327369'))
head(rs272893)
dim(rs272893)

In [None]:
length(unique(rs272893$person_id))

In [None]:
rs272893 <- merge(cohort_pid, rs272893, by="person_id", all.x = TRUE)
head(rs272893)
dim(rs272893)

In [None]:
table(rs272893$GT)

In [None]:
table(is.na(rs272893$GT))

In [None]:
# Build a (person_id, GT) table for rs272893. Only the calls "0/0", "0/1"
# and "1/1" are kept; every other value, including NA introduced by the left
# join, is labelled "Missing".
rs272893_factor <- data.frame(person_id = rs272893[["person_id"]])
rs272893_factor$GT <- ifelse(
  rs272893$GT %in% c("0/0", "0/1", "1/1"),
  rs272893$GT,
  "Missing"
)

head(rs272893_factor)

In [None]:
table(rs272893_factor$GT)

In [None]:
rs272893_factor$f.rs272893 <- factor(rs272893_factor$GT,
                levels = c("0/0", "0/1", "1/1", "Missing"))

In [None]:
rs272893_factor <- merge(regression_bm, rs272893_factor, by="person_id", all.x = TRUE)
head(rs272893_factor)
dim(rs272893_factor)

In [None]:
# Exploratory fit (printed, not stored): conditional logistic regression of
# Treatment on demographics, metformin use, comorbidity indicators and the
# rs272893 genotype, stratified by `stratum`.
# Cerebrovascular_Disease is listed once; R collapses duplicated formula
# terms, so the fitted model is the same as with the term repeated.
clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.rs272893 +
    strata(stratum),
  data = rs272893_factor
)

In [None]:
# Stored fit of the rs272893 model: conditional logistic regression of
# Treatment on demographics, metformin use, comorbidity indicators and the
# rs272893 genotype, stratified by `stratum`.
# Cerebrovascular_Disease is listed once; R collapses duplicated formula
# terms, so the fitted model is the same as with the term repeated.
rs272893_clogit <- clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.rs272893 +
    strata(stratum),
  data = rs272893_factor
)

In [None]:
save(rs272893_clogit, file = "rs272893_clogit_12022023_AG.RData")

In [None]:
(summary(rs272893_clogit))$coefficients

In [None]:
write.csv((summary(rs272893_clogit))$coefficients)

## OCT1: rs12208357   

In [None]:
rs12208357 <- subset(genotype_og, (locus == 'chr6:160122116'))
head(rs12208357)
dim(rs12208357)

In [None]:
length(unique(rs12208357$person_id))

In [None]:
rs12208357 <- merge(cohort_pid, rs12208357, by="person_id", all.x = TRUE)
head(rs12208357)
dim(rs12208357)

In [None]:
table(rs12208357$GT)

In [None]:
table(is.na(rs12208357$GT))

In [None]:
# Build a (person_id, GT) table for rs12208357. Only the calls "0/0", "0/1"
# and "1/1" are kept; every other value, including NA introduced by the left
# join, is labelled "Missing".
rs12208357_factor <- data.frame(person_id = rs12208357[["person_id"]])
rs12208357_factor$GT <- ifelse(
  rs12208357$GT %in% c("0/0", "0/1", "1/1"),
  rs12208357$GT,
  "Missing"
)

head(rs12208357_factor)

In [None]:
table(rs12208357_factor$GT)

In [None]:
rs12208357_factor$f.rs12208357 <- factor(rs12208357_factor$GT,
                levels = c("0/0", "0/1", "1/1", "Missing"))

In [None]:
rs12208357_factor <- merge(regression_bm, rs12208357_factor, by="person_id", all.x = TRUE)
head(rs12208357_factor)
dim(rs12208357_factor)

In [None]:
# Exploratory fit (printed, not stored): conditional logistic regression of
# Treatment on demographics, metformin use, comorbidity indicators and the
# rs12208357 genotype, stratified by `stratum`.
# Cerebrovascular_Disease is listed once; R collapses duplicated formula
# terms, so the fitted model is the same as with the term repeated.
clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.rs12208357 +
    strata(stratum),
  data = rs12208357_factor
)

In [None]:
# Stored fit of the rs12208357 model: conditional logistic regression of
# Treatment on demographics, metformin use, comorbidity indicators and the
# rs12208357 genotype, stratified by `stratum`.
# Cerebrovascular_Disease is listed once; R collapses duplicated formula
# terms, so the fitted model is the same as with the term repeated.
rs12208357_clogit <- clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.rs12208357 +
    strata(stratum),
  data = rs12208357_factor
)

In [None]:
save(rs12208357_clogit, file = "rs12208357_clogit_12022023_AG.RData")

In [None]:
(summary(rs12208357_clogit))$coefficients

In [None]:
write.csv((summary(rs12208357_clogit))$coefficients)

## OCT1: rs622342    

In [None]:
rs622342 <- subset(genotype_og, (locus == 'chr6:160151834'))
head(rs622342)
dim(rs622342)

In [None]:
length(unique(rs622342$person_id))

In [None]:
rs622342 <- merge(cohort_pid, rs622342, by="person_id", all.x = TRUE)
head(rs622342)
dim(rs622342)

In [None]:
table(rs622342$GT)

In [None]:
table(is.na(rs622342$GT))

In [None]:
# Build a (person_id, GT) table for rs622342. Only the calls "0/0", "0/1"
# and "1/1" are kept; every other value, including NA introduced by the left
# join, is labelled "Missing".
rs622342_factor <- data.frame(person_id = rs622342[["person_id"]])
rs622342_factor$GT <- ifelse(
  rs622342$GT %in% c("0/0", "0/1", "1/1"),
  rs622342$GT,
  "Missing"
)

head(rs622342_factor)

In [None]:
table(rs622342_factor$GT)

In [None]:
rs622342_factor$f.rs622342 <- factor(rs622342_factor$GT,
                levels = c("0/0", "0/1", "1/1", "Missing"))

In [None]:
rs622342_factor <- merge(regression_bm, rs622342_factor, by="person_id", all.x = TRUE)
head(rs622342_factor)
dim(rs622342_factor)

In [None]:
# Exploratory fit (printed, not stored): conditional logistic regression of
# Treatment on demographics, metformin use, comorbidity indicators and the
# rs622342 genotype, stratified by `stratum`.
# Cerebrovascular_Disease is listed once; R collapses duplicated formula
# terms, so the fitted model is the same as with the term repeated.
clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction + Congestive_Heart_Failure +
    Peripheral_Vascular_Disease + Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications +
    Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.rs622342 +
    strata(stratum),
  data = rs622342_factor
)

In [None]:
# Fit and keep the conditional logistic regression for rs622342: Treatment on
# demographics, metformin use, the comorbidity indicators and the genotype
# factor, stratified by `stratum`.
# Cerebrovascular_Disease used to appear twice in the formula; R collapses
# duplicated terms, so listing it once leaves the fitted model unchanged.
rs622342_clogit <- clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction +
    Congestive_Heart_Failure + Peripheral_Vascular_Disease +
    Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications + Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.rs622342 +
    strata(stratum),
  data = rs622342_factor
)

In [None]:
# Persist the fitted rs622342_clogit object to an .RData file in the current
# working directory.
save(list = "rs622342_clogit", file = "rs622342_clogit_12022023_AG.RData")

In [None]:
# Coefficient matrix taken from the model summary.
summary(rs622342_clogit)$coefficients

In [None]:
# Same coefficient matrix rendered as CSV text. No `file` argument is given,
# so write.csv() writes to the console rather than to disk.
write.csv(summary(rs622342_clogit)$coefficients)

## OCT2: rs316019

In [None]:
# Keep only the genotype rows whose locus is chr6:160249250 (the position used
# here for rs316019), then preview the result and report its dimensions.
rs316019 <- subset(genotype_og, subset = locus == "chr6:160249250")
head(rs316019)
dim(rs316019)

In [None]:
# Number of distinct person_id values with a row at this locus.
length(unique(rs316019[["person_id"]]))

In [None]:
# Left-join onto cohort_pid: every cohort row is kept, and rows without a
# genotype record get NA in the genotype columns.
rs316019 <- merge(
  x = cohort_pid,
  y = rs316019,
  by = "person_id",
  all.x = TRUE
)
head(rs316019)
dim(rs316019)

In [None]:
# Genotype call counts (table() omits NA by default).
table(rs316019$GT)

In [None]:
# How many rows have a missing genotype call (TRUE) versus a recorded one (FALSE).
table(is.na(rs316019$GT))

In [None]:
# Build a per-row genotype label for rs316019. Every row starts as "Missing";
# the label is then overwritten wherever the merged genotype call equals one
# of the three recognised values. Any other call (or NA) stays "Missing".
rs316019_factor <- data.frame(person_id = rs316019[, "person_id"])
rs316019_factor$GT <- "Missing"
for (rs316019_call in c("0/0", "0/1", "1/1")) {
  rs316019_factor$GT[rs316019$GT %in% rs316019_call] <- rs316019_call
}

head(rs316019_factor)

In [None]:
# Counts per genotype label, including the "Missing" bucket.
table(rs316019_factor$GT)

In [None]:
# Encode the label as a factor with an explicit level order ("0/0" first).
rs316019_levels <- c("0/0", "0/1", "1/1", "Missing")
rs316019_factor$f.rs316019 <- factor(rs316019_factor$GT, levels = rs316019_levels)

In [None]:
# Attach the genotype columns to the regression data set; all regression_bm
# rows are kept (left join on person_id).
rs316019_factor <- merge(
  x = regression_bm,
  y = rs316019_factor,
  by = "person_id",
  all.x = TRUE
)
head(rs316019_factor)
dim(rs316019_factor)

In [None]:
# Conditional logistic regression of Treatment on demographics, metformin use,
# the comorbidity indicators and the rs316019 genotype factor, stratified by
# `stratum`. The result is not assigned, so this cell only displays the fit.
# Cerebrovascular_Disease used to appear twice in the formula; R collapses
# duplicated terms, so listing it once leaves the fitted model unchanged.
clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction +
    Congestive_Heart_Failure + Peripheral_Vascular_Disease +
    Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications + Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.rs316019 +
    strata(stratum),
  data = rs316019_factor
)

In [None]:
# Fit and keep the conditional logistic regression for rs316019: Treatment on
# demographics, metformin use, the comorbidity indicators and the genotype
# factor, stratified by `stratum`.
# Cerebrovascular_Disease used to appear twice in the formula; R collapses
# duplicated terms, so listing it once leaves the fitted model unchanged.
rs316019_clogit <- clogit(
  Treatment ~ f.sex + f.Race + f.Ethnicity + f.age + Metformin +
    Myocardial_Infarction +
    Congestive_Heart_Failure + Peripheral_Vascular_Disease +
    Cerebrovascular_Disease + Dementia +
    Chronic_Pulmonary_Disease + Rheumatic_Disease + Peptic_Ulcer_Disease +
    Liver_Disease_Mild + Liver_Disease_Moderate_Severe +
    Diabetes_without_Chronic_Complications + Diabetes_with_Chronic_Complications +
    Hemiplegia_Paraplegia + Renal_Disease_Mild_Moderate +
    Renal_Disease_Severe + HIV + Metastatic_Solid_Tumor + Malignancy + AIDS +
    f.rs316019 +
    strata(stratum),
  data = rs316019_factor
)

In [None]:
# Persist the fitted rs316019_clogit object to an .RData file in the current
# working directory.
save(list = "rs316019_clogit", file = "rs316019_clogit_12022023_AG.RData")

In [None]:
# Coefficient matrix taken from the model summary.
summary(rs316019_clogit)$coefficients

In [None]:
# Same coefficient matrix rendered as CSV text. No `file` argument is given,
# so write.csv() writes to the console rather than to disk.
write.csv(summary(rs316019_clogit)$coefficients)

# Table 1

In [None]:
# Preview the data set summarised in Table 1 and report its dimensions.
head(regression_bm)
dim(regression_bm)

In [None]:
# Sex_at_Birth, Treatment == 0: counts.
table(regression_bm$Sex_at_Birth[regression_bm$Treatment == 0])

In [None]:
# Sex_at_Birth, Treatment == 0: proportions.
sex_t0 <- table(regression_bm$Sex_at_Birth[regression_bm$Treatment == 0])
prop.table(sex_t0)

In [None]:
# Sex_at_Birth, Treatment == 1: counts, then proportions.
sex_t1 <- table(regression_bm$Sex_at_Birth[regression_bm$Treatment == 1])
sex_t1
prop.table(sex_t1)

In [None]:
# Race, Treatment == 0: counts, then proportions.
race_t0 <- table(regression_bm$Race[regression_bm$Treatment == 0])
race_t0
prop.table(race_t0)

In [None]:
# Race, Treatment == 1: counts, then proportions.
race_t1 <- table(regression_bm$Race[regression_bm$Treatment == 1])
race_t1
prop.table(race_t1)

In [None]:
# Hispanic, Treatment == 0: counts, then proportions.
hispanic_t0 <- table(regression_bm$Hispanic[regression_bm$Treatment == 0])
hispanic_t0
prop.table(hispanic_t0)

In [None]:
# Hispanic, Treatment == 1: counts, then proportions.
hispanic_t1 <- table(regression_bm$Hispanic[regression_bm$Treatment == 1])
hispanic_t1
prop.table(hispanic_t1)

In [None]:
# Index_Age_Group, Treatment == 0: counts, then proportions.
age_group_t0 <- table(regression_bm$Index_Age_Group[regression_bm$Treatment == 0])
age_group_t0
prop.table(age_group_t0)

In [None]:
# Index_Age, Treatment == 0: median and interquartile range.
index_age_t0 <- regression_bm$Index_Age[regression_bm$Treatment == 0]
median(index_age_t0)
IQR(index_age_t0)

In [None]:
# Index_Age, Treatment == 0: quantiles at the default probabilities.
quantile(regression_bm$Index_Age[regression_bm$Treatment == 0])

In [None]:
# Index_Age, Treatment == 1: median and interquartile range.
index_age_t1 <- regression_bm$Index_Age[regression_bm$Treatment == 1]
median(index_age_t1)
IQR(index_age_t1)

In [None]:
# Index_Age, Treatment == 1: quantiles at the default probabilities.
quantile(regression_bm$Index_Age[regression_bm$Treatment == 1])

In [None]:
# Index_Age_Group, Treatment == 1: counts, then proportions.
age_group_t1 <- table(regression_bm$Index_Age_Group[regression_bm$Treatment == 1])
age_group_t1
prop.table(age_group_t1)

In [None]:
# Liver_Disease_Mild, Treatment == 0: counts, then proportions.
liver_mild_t0 <- table(regression_bm$Liver_Disease_Mild[regression_bm$Treatment == 0])
liver_mild_t0
prop.table(liver_mild_t0)

In [None]:
# Liver_Disease_Mild, Treatment == 1: counts, then proportions.
liver_mild_t1 <- table(regression_bm$Liver_Disease_Mild[regression_bm$Treatment == 1])
liver_mild_t1
prop.table(liver_mild_t1)

In [None]:
# Liver_Disease_Moderate_Severe, Treatment == 0: counts, then proportions.
liver_modsev_t0 <- table(regression_bm$Liver_Disease_Moderate_Severe[regression_bm$Treatment == 0])
liver_modsev_t0
prop.table(liver_modsev_t0)

In [None]:
# Liver_Disease_Moderate_Severe, Treatment == 1: counts, then proportions.
liver_modsev_t1 <- table(regression_bm$Liver_Disease_Moderate_Severe[regression_bm$Treatment == 1])
liver_modsev_t1
prop.table(liver_modsev_t1)

In [None]:
# Renal_Disease_Mild_Moderate, Treatment == 0: counts, then proportions.
renal_mildmod_t0 <- table(regression_bm$Renal_Disease_Mild_Moderate[regression_bm$Treatment == 0])
renal_mildmod_t0
prop.table(renal_mildmod_t0)

In [None]:
# Renal_Disease_Mild_Moderate, Treatment == 1: counts, then proportions.
renal_mildmod_t1 <- table(regression_bm$Renal_Disease_Mild_Moderate[regression_bm$Treatment == 1])
renal_mildmod_t1
prop.table(renal_mildmod_t1)

In [None]:
# Renal_Disease_Severe, Treatment == 0: counts, then proportions.
renal_severe_t0 <- table(regression_bm$Renal_Disease_Severe[regression_bm$Treatment == 0])
renal_severe_t0
prop.table(renal_severe_t0)

In [None]:
# Renal_Disease_Severe, Treatment == 1: counts, then proportions.
renal_severe_t1 <- table(regression_bm$Renal_Disease_Severe[regression_bm$Treatment == 1])
renal_severe_t1
prop.table(renal_severe_t1)

In [None]:
# Diabetes_without_Chronic_Complications, Treatment == 0: counts, then proportions.
dm_uncomplicated_t0 <- table(regression_bm$Diabetes_without_Chronic_Complications[regression_bm$Treatment == 0])
dm_uncomplicated_t0
prop.table(dm_uncomplicated_t0)

In [None]:
# Diabetes_without_Chronic_Complications, Treatment == 1: counts, then proportions.
dm_uncomplicated_t1 <- table(regression_bm$Diabetes_without_Chronic_Complications[regression_bm$Treatment == 1])
dm_uncomplicated_t1
prop.table(dm_uncomplicated_t1)

In [None]:
# Diabetes_with_Chronic_Complications, Treatment == 0: counts, then proportions.
dm_complicated_t0 <- table(regression_bm$Diabetes_with_Chronic_Complications[regression_bm$Treatment == 0])
dm_complicated_t0
prop.table(dm_complicated_t0)

In [None]:
# Diabetes_with_Chronic_Complications, Treatment == 1: counts, then proportions.
dm_complicated_t1 <- table(regression_bm$Diabetes_with_Chronic_Complications[regression_bm$Treatment == 1])
dm_complicated_t1
prop.table(dm_complicated_t1)

In [None]:
# Metformin, Treatment == 0: counts, then proportions.
metformin_t0 <- table(regression_bm$Metformin[regression_bm$Treatment == 0])
metformin_t0
prop.table(metformin_t0)

In [None]:
# Metformin, Treatment == 1: counts, then proportions.
metformin_t1 <- table(regression_bm$Metformin[regression_bm$Treatment == 1])
metformin_t1
prop.table(metformin_t1)

In [None]:
# Myocardial_Infarction, Treatment == 0: counts, then proportions.
mi_t0 <- table(regression_bm$Myocardial_Infarction[regression_bm$Treatment == 0])
mi_t0
prop.table(mi_t0)

In [None]:
# Myocardial_Infarction, Treatment == 1: counts, then proportions.
mi_t1 <- table(regression_bm$Myocardial_Infarction[regression_bm$Treatment == 1])
mi_t1
prop.table(mi_t1)

In [None]:
# Congestive_Heart_Failure, Treatment == 0: counts, then proportions.
chf_t0 <- table(regression_bm$Congestive_Heart_Failure[regression_bm$Treatment == 0])
chf_t0
prop.table(chf_t0)

In [None]:
# Congestive_Heart_Failure, Treatment == 1: counts, then proportions.
chf_t1 <- table(regression_bm$Congestive_Heart_Failure[regression_bm$Treatment == 1])
chf_t1
prop.table(chf_t1)

In [None]:
# Peripheral_Vascular_Disease, Treatment == 0: counts, then proportions.
pvd_t0 <- table(regression_bm$Peripheral_Vascular_Disease[regression_bm$Treatment == 0])
pvd_t0
prop.table(pvd_t0)

In [None]:
# Peripheral_Vascular_Disease, Treatment == 1: counts, then proportions.
pvd_t1 <- table(regression_bm$Peripheral_Vascular_Disease[regression_bm$Treatment == 1])
pvd_t1
prop.table(pvd_t1)

In [None]:
# Cerebrovascular_Disease, Treatment == 0: counts, then proportions.
cvd_t0 <- table(regression_bm$Cerebrovascular_Disease[regression_bm$Treatment == 0])
cvd_t0
prop.table(cvd_t0)

In [None]:
# Cerebrovascular_Disease, Treatment == 1: counts, then proportions.
cvd_t1 <- table(regression_bm$Cerebrovascular_Disease[regression_bm$Treatment == 1])
cvd_t1
prop.table(cvd_t1)

In [None]:
# Dementia, Treatment == 0: counts, then proportions.
dementia_t0 <- table(regression_bm$Dementia[regression_bm$Treatment == 0])
dementia_t0
prop.table(dementia_t0)

In [None]:
# Dementia, Treatment == 1: counts, then proportions.
dementia_t1 <- table(regression_bm$Dementia[regression_bm$Treatment == 1])
dementia_t1
prop.table(dementia_t1)

In [None]:
# Chronic_Pulmonary_Disease, Treatment == 0: counts, then proportions.
cpd_t0 <- table(regression_bm$Chronic_Pulmonary_Disease[regression_bm$Treatment == 0])
cpd_t0
prop.table(cpd_t0)

In [None]:
# Chronic_Pulmonary_Disease, Treatment == 1: counts, then proportions.
cpd_t1 <- table(regression_bm$Chronic_Pulmonary_Disease[regression_bm$Treatment == 1])
cpd_t1
prop.table(cpd_t1)

In [None]:
# Rheumatic_Disease, Treatment == 0: counts, then proportions.
rheumatic_t0 <- table(regression_bm$Rheumatic_Disease[regression_bm$Treatment == 0])
rheumatic_t0
prop.table(rheumatic_t0)

In [None]:
# Rheumatic_Disease, Treatment == 1: counts, then proportions.
rheumatic_t1 <- table(regression_bm$Rheumatic_Disease[regression_bm$Treatment == 1])
rheumatic_t1
prop.table(rheumatic_t1)

In [None]:
# Peptic_Ulcer_Disease, Treatment == 0: counts, then proportions.
pud_t0 <- table(regression_bm$Peptic_Ulcer_Disease[regression_bm$Treatment == 0])
pud_t0
prop.table(pud_t0)

In [None]:
# Peptic_Ulcer_Disease, Treatment == 1: counts, then proportions.
pud_t1 <- table(regression_bm$Peptic_Ulcer_Disease[regression_bm$Treatment == 1])
pud_t1
prop.table(pud_t1)

In [None]:
# Hemiplegia_Paraplegia, Treatment == 0: counts, then proportions.
hemiplegia_t0 <- table(regression_bm$Hemiplegia_Paraplegia[regression_bm$Treatment == 0])
hemiplegia_t0
prop.table(hemiplegia_t0)

In [None]:
# Hemiplegia_Paraplegia, Treatment == 1: counts, then proportions.
hemiplegia_t1 <- table(regression_bm$Hemiplegia_Paraplegia[regression_bm$Treatment == 1])
hemiplegia_t1
prop.table(hemiplegia_t1)

In [None]:
# HIV, Treatment == 0: counts, then proportions.
hiv_t0 <- table(regression_bm$HIV[regression_bm$Treatment == 0])
hiv_t0
prop.table(hiv_t0)

In [None]:
# HIV, Treatment == 1: counts, then proportions.
hiv_t1 <- table(regression_bm$HIV[regression_bm$Treatment == 1])
hiv_t1
prop.table(hiv_t1)

In [None]:
# AIDS, Treatment == 0: counts, then proportions.
aids_t0 <- table(regression_bm$AIDS[regression_bm$Treatment == 0])
aids_t0
prop.table(aids_t0)

In [None]:
# AIDS, Treatment == 1: counts, then proportions.
aids_t1 <- table(regression_bm$AIDS[regression_bm$Treatment == 1])
aids_t1
prop.table(aids_t1)

In [None]:
# Metastatic_Solid_Tumor, Treatment == 0: counts, then proportions.
metastatic_t0 <- table(regression_bm$Metastatic_Solid_Tumor[regression_bm$Treatment == 0])
metastatic_t0
prop.table(metastatic_t0)

In [None]:
# Metastatic_Solid_Tumor, Treatment == 1: counts, then proportions.
metastatic_t1 <- table(regression_bm$Metastatic_Solid_Tumor[regression_bm$Treatment == 1])
metastatic_t1
prop.table(metastatic_t1)

In [None]:
# Malignancy, Treatment == 0: counts, then proportions.
malignancy_t0 <- table(regression_bm$Malignancy[regression_bm$Treatment == 0])
malignancy_t0
prop.table(malignancy_t0)

In [None]:
# Malignancy, Treatment == 1: counts, then proportions.
malignancy_t1 <- table(regression_bm$Malignancy[regression_bm$Treatment == 1])
malignancy_t1
prop.table(malignancy_t1)

In [None]:
# List the columns available in regression_bm.
colnames(regression_bm)

# Table 1a

In [None]:
# List the columns available in regression_sdoh.
colnames(regression_sdoh)

In [None]:
# Add the three *.merge columns from regression_insu to regression_sdoh.
# Left join on person_id: every regression_sdoh row is kept.
insu_merge_cols <- c("person_id", "medcare.merge", "medicaid.merge", "empl.merge")
regression_sdoh_insu <- merge(
  x = regression_sdoh,
  y = regression_insu[, insu_merge_cols],
  by = "person_id",
  all.x = TRUE
)
head(regression_sdoh_insu)
dim(regression_sdoh_insu)

In [None]:
# Save regression_sdoh_insu, round 1 --> NO MERGING OF FACTORS
# This snippet assumes that you run setup first

# This code saves your dataframe as an fst file in the "data" folder of the
# Google Bucket

# Replace regression_sdoh_insu with THE NAME OF YOUR DATAFRAME
my_dataframe <- regression_sdoh_insu

# Replace the value below with THE NAME of the fst file you're going to store
# in the bucket (don't delete the quotation marks)
destination_filename <- 'regression_sdoh_insu_11112023_AG.fst'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# store the dataframe in current workspace
write_fst(my_dataframe, destination_filename)

# Get the bucket name
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Sys.getenv() returns "" when the variable is unset; without this check the
# copy below would target a local "/data/" path instead of the bucket.
if (!nzchar(my_bucket)) {
  stop("WORKSPACE_BUCKET is not set; cannot copy the file to the bucket.", call. = FALSE)
}

# Copy the file from current workspace to the bucket
# (intern = TRUE captures the command output; TRUE is spelled out because T
# is an ordinary variable that can be reassigned)
system(paste0("gsutil cp ./", destination_filename, " ", my_bucket, "/data/"), intern = TRUE)

# Check if file is in the bucket
system(paste0("gsutil ls ", my_bucket, "/data/*.fst"), intern = TRUE)

In [None]:
# List the columns available in regression_sdoh_insu.
colnames(regression_sdoh_insu)

In [None]:
# List the columns available in regression_insu.
colnames(regression_insu)

In [None]:
# empl.merge, Treatment == 0: counts, then proportions.
empl_t0 <- table(regression_insu$empl.merge[regression_insu$Treatment == 0])
empl_t0
prop.table(empl_t0)

In [None]:
# empl.merge, Treatment == 1: counts, then proportions.
empl_t1 <- table(regression_insu$empl.merge[regression_insu$Treatment == 1])
empl_t1
prop.table(empl_t1)

In [None]:
# medcare.merge, Treatment == 0: counts, then proportions.
medcare_t0 <- table(regression_insu$medcare.merge[regression_insu$Treatment == 0])
medcare_t0
prop.table(medcare_t0)

In [None]:
# medcare.merge, Treatment == 1: counts, then proportions.
medcare_t1 <- table(regression_insu$medcare.merge[regression_insu$Treatment == 1])
medcare_t1
prop.table(medcare_t1)

In [None]:
# medicaid.merge, Treatment == 0: counts, then proportions.
medicaid_t0 <- table(regression_insu$medicaid.merge[regression_insu$Treatment == 0])
medicaid_t0
prop.table(medicaid_t0)

In [None]:
# medicaid.merge, Treatment == 1: counts, then proportions.
medicaid_t1 <- table(regression_insu$medicaid.merge[regression_insu$Treatment == 1])
medicaid_t1
prop.table(medicaid_t1)

In [None]:
# List the columns available in disa_regression.
colnames(disa_regression)

In [None]:
# f.disa, Treatment == 0: counts, then proportions.
disa_t0 <- table(disa_regression$f.disa[disa_regression$Treatment == 0])
disa_t0
prop.table(disa_t0)

In [None]:
# f.disa, Treatment == 1: counts, then proportions.
disa_t1 <- table(disa_regression$f.disa[disa_regression$Treatment == 1])
disa_t1
prop.table(disa_t1)

In [None]:
# f.disability_blind, Treatment == 0: counts, then proportions.
blind_t0 <- table(regression_sdoh$f.disability_blind[regression_sdoh$Treatment == 0])
blind_t0
prop.table(blind_t0)

In [None]:
# f.disability_blind, Treatment == 1: counts, then proportions.
blind_t1 <- table(regression_sdoh$f.disability_blind[regression_sdoh$Treatment == 1])
blind_t1
prop.table(blind_t1)

In [None]:
# f.disability_deaf, Treatment == 0: counts, then proportions.
deaf_t0 <- table(regression_sdoh$f.disability_deaf[regression_sdoh$Treatment == 0])
deaf_t0
prop.table(deaf_t0)

In [None]:
# f.disability_deaf, Treatment == 1: counts, then proportions.
deaf_t1 <- table(regression_sdoh$f.disability_deaf[regression_sdoh$Treatment == 1])
deaf_t1
prop.table(deaf_t1)

In [None]:
# f.disability_walking_climbing, Treatment == 0: counts, then proportions.
walking_t0 <- table(regression_sdoh$f.disability_walking_climbing[regression_sdoh$Treatment == 0])
walking_t0
prop.table(walking_t0)

In [None]:
# f.disability_walking_climbing, Treatment == 1: counts, then proportions.
walking_t1 <- table(regression_sdoh$f.disability_walking_climbing[regression_sdoh$Treatment == 1])
walking_t1
prop.table(walking_t1)

In [None]:
# f.disability_dressing_bathing, Treatment == 0: counts, then proportions.
dressing_t0 <- table(regression_sdoh$f.disability_dressing_bathing[regression_sdoh$Treatment == 0])
dressing_t0
prop.table(dressing_t0)

In [None]:
# f.disability_dressing_bathing, Treatment == 1: counts, then proportions.
dressing_t1 <- table(regression_sdoh$f.disability_dressing_bathing[regression_sdoh$Treatment == 1])
dressing_t1
prop.table(dressing_t1)

In [None]:
# f.disability_errands_alone, Treatment == 0: counts, then proportions.
errands_t0 <- table(regression_sdoh$f.disability_errands_alone[regression_sdoh$Treatment == 0])
errands_t0
prop.table(errands_t0)

In [None]:
# f.disability_errands_alone, Treatment == 1: counts, then proportions.
errands_t1 <- table(regression_sdoh$f.disability_errands_alone[regression_sdoh$Treatment == 1])
errands_t1
prop.table(errands_t1)

In [None]:
# f.disability_concentrating, Treatment == 0: counts, then proportions.
concentrating_t0 <- table(regression_sdoh$f.disability_concentrating[regression_sdoh$Treatment == 0])
concentrating_t0
prop.table(concentrating_t0)

In [None]:
# f.disability_concentrating, Treatment == 1: counts, then proportions.
concentrating_t1 <- table(regression_sdoh$f.disability_concentrating[regression_sdoh$Treatment == 1])
concentrating_t1
prop.table(concentrating_t1)

In [None]:
# f.employment, Treatment == 0: counts, then proportions.
employment_t0 <- table(regression_sdoh2$f.employment[regression_sdoh2$Treatment == 0])
employment_t0
prop.table(employment_t0)

In [None]:
# f.employment, Treatment == 1: counts, then proportions.
employment_t1 <- table(regression_sdoh2$f.employment[regression_sdoh2$Treatment == 1])
employment_t1
prop.table(employment_t1)

In [None]:
# f.income, Treatment == 0: counts, then proportions.
income_t0 <- table(regression_sdoh2$f.income[regression_sdoh2$Treatment == 0])
income_t0
prop.table(income_t0)

In [None]:
# f.income, Treatment == 1: counts, then proportions.
income_t1 <- table(regression_sdoh2$f.income[regression_sdoh2$Treatment == 1])
income_t1
prop.table(income_t1)

In [None]:
# List the columns available in regression_sdoh2.
colnames(regression_sdoh2)

In [None]:
# f.housing, Treatment == 0: counts, then proportions.
housing_t0 <- table(regression_sdoh2$f.housing[regression_sdoh2$Treatment == 0])
housing_t0
prop.table(housing_t0)

In [None]:
# f.housing, Treatment == 1: counts, then proportions.
housing_t1 <- table(regression_sdoh2$f.housing[regression_sdoh2$Treatment == 1])
housing_t1
prop.table(housing_t1)

In [None]:
# f.stable_house, Treatment == 0: counts, then proportions.
stable_house_t0 <- table(regression_sdoh2$f.stable_house[regression_sdoh2$Treatment == 0])
stable_house_t0
prop.table(stable_house_t0)

In [None]:
# f.stable_house, Treatment == 1: counts, then proportions.
stable_house_t1 <- table(regression_sdoh2$f.stable_house[regression_sdoh2$Treatment == 1])
stable_house_t1
prop.table(stable_house_t1)

In [None]:
# f.education, Treatment == 0: counts, then proportions.
education_t0 <- table(regression_sdoh2$f.education[regression_sdoh2$Treatment == 0])
education_t0
prop.table(education_t0)

In [None]:
# f.education, Treatment == 1: counts, then proportions.
education_t1 <- table(regression_sdoh2$f.education[regression_sdoh2$Treatment == 1])
education_t1
prop.table(education_t1)