# Set Up

In [None]:
library(tidyverse)
library(fst)
library(bigrquery)
library(stringr)
library(lubridate)

In [None]:
# Prerequisite: the setup cell (library loading) has already been run.
# Downloads a CSV from the "data" folder of the workspace bucket into the
# current working directory and reads it into a dataframe.

# Name of the file to fetch from the Google bucket (keep the quotation marks)
name_of_file_in_bucket <- 'Acidosis_Visit_Occurrence_AG_10162023.csv'

# ---- Template code below: normally left unchanged ----

# The bucket name is read from the WORKSPACE_BUCKET environment variable
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket into the current workspace directory
system(
  sprintf("gsutil cp %s/data/%s .", my_bucket, name_of_file_in_bucket),
  intern = TRUE
)

# Read the downloaded file into a dataframe and preview it
acidosis_df <- read_csv(name_of_file_in_bucket)
head(acidosis_df)

In [None]:
# Report the table's dimensions and the number of distinct PERSON_ID values
dim(acidosis_df)
length(unique(acidosis_df$PERSON_ID))

# Prepare Case Cohort

In [None]:
# Build a dataframe that contains only emergent visit types

# Visit types (STANDARD_CONCEPT_NAME values) that count as emergent
emergent.conditions <- c(
  "Emergency Room Visit",
  "Emergency Room and Inpatient Visit",
  "Inpatient Visit",
  "Inpatient Hospital",
  "Urgent Care Facility"
)
# Keep the rows whose visit type appears in that list
acidosis.emergent <- acidosis_df[
  acidosis_df[["STANDARD_CONCEPT_NAME"]] %in% emergent.conditions,
]
acidosis.emergent

In [None]:
# Count the distinct patients that have at least one emergent visit
# (this cell only counts; it does not modify the dataframe)

length(unique(acidosis.emergent$PERSON_ID))

In [None]:
# Keep only the first (earliest) emergent visit for each patient.
# Rows are sorted by VISIT_START_DATETIME and the first row of every
# PERSON_ID group is retained. ungroup() then removes the grouping so that
# later steps work on a plain table instead of silently operating per
# patient.
acidosis.emergent <- acidosis.emergent %>%
  group_by(PERSON_ID) %>%
  arrange(VISIT_START_DATETIME) %>%
  slice(1) %>%
  ungroup()

# Show the result and confirm there is now one row per patient
acidosis.emergent
dim(acidosis.emergent)

In [None]:
# Prerequisite: the setup cell has already been run.
# Writes a dataframe to a CSV file and uploads it to the "data" folder of
# the workspace's Google bucket.

# Dataframe to save
my_dataframe <- acidosis.emergent

# Name the file will have in the bucket (keep the quotation marks)
destination_filename <- 'acidosis_emergent_conditions_AG_10172023.csv'

# ---- Template code below: normally left unchanged ----

# Write the dataframe to a CSV file in the current workspace directory
write_excel_csv(my_dataframe, destination_filename)

# The bucket name is read from the WORKSPACE_BUCKET environment variable
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Upload the file from the current workspace to the bucket
system(
  sprintf("gsutil cp ./%s %s/data/", destination_filename, my_bucket),
  intern = TRUE
)

# List the CSV files in the bucket to confirm the upload
system(sprintf("gsutil ls %s/data/*.csv", my_bucket), intern = TRUE)

In [None]:
### TAKE OUT THE PATIENTS THAT DON'T FIT IN THE INCLUSION / EXCLUSION CRITERIA 

In [None]:
# Prerequisite: the setup cell has already been run.
# Downloads a CSV from the "data" folder of the workspace bucket into the
# current working directory and reads it into a dataframe.

# Name of the file to fetch from the Google bucket (keep the quotation marks)
name_of_file_in_bucket <- 'acidosis_exclusion_cohort_pid_AG_11022023.csv'

# ---- Template code below: normally left unchanged ----

# The bucket name is read from the WORKSPACE_BUCKET environment variable
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket into the current workspace directory
system(
  sprintf("gsutil cp %s/data/%s .", my_bucket, name_of_file_in_bucket),
  intern = TRUE
)

# Read the downloaded file into a dataframe and preview it
acidosis_exclusion_cohort <- read_csv(name_of_file_in_bucket)
head(acidosis_exclusion_cohort)

In [None]:
library(dplyr)

In [None]:
# Rename person_id -> PERSON_ID so the column name matches the emergent
# dataframe
acidosis_exclusion_cohort <- rename(
  acidosis_exclusion_cohort,
  PERSON_ID = person_id
)
head(acidosis_exclusion_cohort)

In [None]:
# Keep just the patient ID column of the emergent cohort
emergent_og_pid <- acidosis.emergent %>%
  select('PERSON_ID')
head(emergent_og_pid)

In [None]:
# Find the participant IDs shared by the acidosis exclusion-cohort file and
# the emergent acidosis cases (the count should be less than 2028)
# NOTE(review): a row-wise intersect needs both tables to have the same
# columns; presumably the exclusion-cohort file holds only the ID -- confirm.
intersect1 <- emergent_og_pid %>%
  dplyr::intersect(acidosis_exclusion_cohort)
dim(intersect1)

In [None]:
# Preview the overlapping participant IDs
head(intersect1)

In [None]:
# This means that the emergent visit acidosis cohort should have 2,676 participants

In [None]:
# Create a new dataframe of emergent visits restricted to the participants
# found in intersect1
# NOTE(review): the original note cited 1,731 participants while a nearby
# note cites 2,676 -- confirm against the dim() output below.
acidosis_emergent <- merge(
  x = intersect1,
  y = acidosis.emergent,
  by = "PERSON_ID",
  all.x = TRUE
)
head(acidosis_emergent)
dim(acidosis_emergent)

In [None]:
# Save file as csv with acidosis emergent data

In [None]:
# Prerequisite: the setup cell has already been run.
# Writes a dataframe to a CSV file and uploads it to the "data" folder of
# the workspace's Google bucket.

# Dataframe to save
my_dataframe <- acidosis_emergent

# Name the file will have in the bucket (keep the quotation marks)
destination_filename <- 'acidosis_emergent_final_cohort_AG_11022023.csv'

# ---- Template code below: normally left unchanged ----

# Write the dataframe to a CSV file in the current workspace directory
write_excel_csv(my_dataframe, destination_filename)

# The bucket name is read from the WORKSPACE_BUCKET environment variable
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Upload the file from the current workspace to the bucket
system(
  sprintf("gsutil cp ./%s %s/data/", destination_filename, my_bucket),
  intern = TRUE
)

# List the CSV files in the bucket to confirm the upload
system(sprintf("gsutil ls %s/data/*.csv", my_bucket), intern = TRUE)

# Demographics

In [None]:
# Demographics of Acidosis Emergent Conditions + Inclusion/Exclusion Criteria

In [None]:
# Prerequisite: the setup cell has already been run.
# Downloads the saved final emergent cohort from the workspace bucket and
# reads it into acidosis.emergent.

# Name of the file to fetch from the Google bucket (keep the quotation marks)
name_of_file_in_bucket <- 'acidosis_emergent_final_cohort_AG_11022023.csv'

# ---- Template code below: normally left unchanged ----

# The bucket name is read from the WORKSPACE_BUCKET environment variable
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket into the current workspace directory
system(
  sprintf("gsutil cp %s/data/%s .", my_bucket, name_of_file_in_bucket),
  intern = TRUE
)

# Read the downloaded file into a dataframe and preview it
acidosis.emergent <- read_csv(name_of_file_in_bucket)
head(acidosis.emergent)

## Age, Sex at Birth, Race

In [None]:
# Prerequisite: the setup cell has already been run.
# Downloads the demographics CSV from the workspace bucket and reads it
# into my_dataframe.

# Name of the file to fetch from the Google bucket (keep the quotation marks)
name_of_file_in_bucket <- 'demographic_all.csv'

# ---- Template code below: normally left unchanged ----

# The bucket name is read from the WORKSPACE_BUCKET environment variable
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket into the current workspace directory
system(
  sprintf("gsutil cp %s/data/%s .", my_bucket, name_of_file_in_bucket),
  intern = TRUE
)

# Read the downloaded file into a dataframe and preview it
my_dataframe <- read_csv(name_of_file_in_bucket)
head(my_dataframe)

In [None]:
# Rename person_id -> PERSON_ID so the demographics can be merged with the
# acidosis cohort on a common key
demo <- rename(my_dataframe, PERSON_ID = person_id)
head(demo)

In [None]:
# Left-merge the demographics onto the acidosis.emergent cohort: every
# cohort row is kept (all.x = TRUE), matched on PERSON_ID
merged_cohort <- merge(
  x = acidosis.emergent,
  y = demo,
  by = "PERSON_ID",
  all.x = TRUE
)
head(merged_cohort)
dim(merged_cohort)

In [None]:
# Clean demographic data

# Give the demographic columns analysis-friendly names. PERSON_ID becomes
# 'Count' because the later tally cells select it under that name.
# year_of_birth keeps its name; it is used below to derive Age.
clean_cohort <- plyr::rename(merged_cohort, c(PERSON_ID = 'Count',
                                       race_source_value = 'Race',
                                       sex_at_birth_source_value = 'Sex_at_Birth',
                                       ethnicity_source_value = 'Hispanic',
                                       gender_source_value = 'Gender'))

# Recode one column of survey answers:
#   * any value containing 'PMI: Skip' becomes "Skip"
#   * any value equal to one of the listed non-answers becomes 'Unspecified'
# A column with no matching value is returned untouched, so numeric and
# date-time columns keep their type (assigning a string into them, even at
# zero positions, would coerce the whole column to character).
recode_survey_answers <- function(values) {
  unspecified_answers <- c(
    "Not man only, not woman only, prefer not to answer, or skipped",
    "No matching concept",
    "None of these",
    "I prefer not to answer"
  )
  is_skip <- grepl('PMI: Skip', values)
  is_unspecified <- values %in% unspecified_answers
  if (any(is_skip)) {
    values[is_skip] <- "Skip"
  }
  if (any(is_unspecified)) {
    values[is_unspecified] <- 'Unspecified'
  }
  values
}

# Apply the recode column by column (vectorised) instead of visiting every
# cell in a nested row/column loop; `[] <-` keeps the data frame structure.
clean_cohort[] <- lapply(clean_cohort, recode_survey_answers)

# Age at the index visit: year of the visit minus year of birth
clean_cohort$Age <- year(clean_cohort$VISIT_START_DATETIME) -
  as.numeric(clean_cohort$year_of_birth)

# Age groups. With right = FALSE each bin is [lower, upper), so the breaks
# are placed at 26, 36, ... to make the bins agree with their labels:
# [18, 26) holds ages 18-25, [26, 36) holds 26-35, and so on. Ages below 18
# fall outside every bin and become NA.
breaks <- c(18, 26, 36, 46, 56, 66, 76, 86, Inf)
tags <- c('18-25','26-35','36-45','46-55','56-65','66-75','76-85','86+')
clean_cohort$Age_Group <- cut(clean_cohort$Age, breaks = breaks,
                              right = FALSE, labels = tags)

In [None]:
# Preview the cleaned cohort, including the derived Age and Age_Group columns
head(clean_cohort)

In [None]:
# List the column names available after cleaning
colnames(clean_cohort)

In [None]:
# Pie chart of sex assigned at birth

# Tally the rows of clean_cohort per Sex_at_Birth category
sex_counts <- clean_cohort %>%
  select(Count, Sex_at_Birth) %>%
  group_by(Sex_at_Birth) %>%
  count(Sex_at_Birth)
colnames(sex_counts) <- c('Sex_at_Birth','Count')

# Insert a line break into the longest category label so it fits the legend
sex_counts <- mutate(
  sex_counts,
  Sex_at_Birth = ifelse(
    Sex_at_Birth == 'Not male, not female, prefer not to answer, or skipped',
    'Not male, not female, prefer\nnot to answer, or skipped',
    Sex_at_Birth
  )
)

# Slice sizes, legend entries and percentage labels for the chart
slices <- sex_counts$Count
lgd <- sex_counts$Sex_at_Birth
pct <- format(100 * slices / sum(slices), digits = 2)
lbls <- paste0(pct, "%") # add % to labels

# One palette colour per category
cols <- RColorBrewer::brewer.pal(n = length(lgd), name = 'Set3')

sex_counts

par(mar = c(0, 0, 2, 2))
pie(slices, lbls, main = 'sex assigned at birth', font = 9, col = cols)
legend("topright", legend = lgd, fill = cols)

In [None]:
# Sanity check: the category counts should add up to the number of rows in
# the cohort (one row per patient is assumed -- confirm)
sum(sex_counts$Count)

In [None]:
# Tabulate and plot the cohort by race and ancestry

# Tally the rows of clean_cohort per Race category
race_counts <- clean_cohort %>%
  select(Count, Race) %>%
  group_by(Race) %>%
  count(Race)
colnames(race_counts) <- c('Race','Count')

# las = 1 keeps axis labels horizontal; the wide left margin leaves room
# for the long category names
par(las = 1, mar = c(3, 15, 3, 1))

race_counts
barplot(
  race_counts$Count,
  names.arg = race_counts$Race,
  horiz = TRUE,
  main = "Race and Ancestry",
  cex.names = 0.8
)

In [None]:
# Sanity check: the category counts should add up to the number of rows in
# the cohort (one row per patient is assumed -- confirm)
sum(race_counts$Count)

In [None]:
# Tabulate and plot the cohort by Hispanic, Latino, or Spanish ethnicity

# Tally the rows of clean_cohort per Hispanic category
hls_counts <- clean_cohort %>%
  select(Count, Hispanic) %>%
  group_by(Hispanic) %>%
  count(Hispanic)
colnames(hls_counts) <- c('Hispanic','Count')

# Copy of the tally with each category's share of the overall total
hls_graph <- hls_counts
hls_graph$Percentage <- format(
  100 * hls_graph$Count / sum(hls_graph$Count),
  digits = 2
)

# Slice sizes, legend entries and percentage labels for the chart
slices <- hls_graph$Count
lgd <- hls_graph$Hispanic
pct <- hls_graph$Percentage
lbls <- paste0(pct, "%") # add % to labels

# One palette colour per category
cols <- RColorBrewer::brewer.pal(n = length(lgd), name = 'Set3')

hls_counts

par(mar = c(0, 0, 2, 2))
pie(slices, lbls, main = 'Hispanic Latino or Spanish', font = 9, col = cols)
legend("topright", legend = lgd, fill = cols)

In [None]:
# Sanity check: the category counts should add up to the number of rows in
# the cohort (one row per patient is assumed -- confirm)
sum(hls_counts$Count)

In [None]:
# Age distribution at acidosis incidence

# Tally the rows of clean_cohort per age group
age_counts <- clean_cohort %>%
  select(Count, Age_Group) %>%
  group_by(Age_Group) %>%
  count(Age_Group)
colnames(age_counts) <- c('Age_Group','Count')
age_counts

# Ages to plot. Missing ages are removed first: hist() ignores them, but
# min(), max(), mean() and sd() would return NA and break the curve, and
# length(x) would over-scale it by counting rows that are not in the bars.
x <- clean_cohort$Age
x <- x[is.finite(x)]

h <- hist(x, breaks = 10, col = "grey", xlab = "Age",
          main = "Histogram with Normal Curve")

# Overlay a normal density with the sample mean and sd, rescaled from a
# density to counts (bin width * number of observations)
xfit <- seq(min(x), max(x), length.out = 40)
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
yfit <- yfit * diff(h$mids[1:2]) * length(x)
lines(xfit, yfit, col = "blue", lwd = 2)

In [None]:
# Sanity check: the age-group counts should add up to the number of rows in
# the cohort (one row per patient is assumed -- confirm)
sum(age_counts$Count)

## Comorbidities

In [None]:
# Load the comorbidities csv file

# Prerequisite: the setup cell has already been run.
# Downloads the file from the "data" folder of the workspace bucket and
# reads it into a dataframe.

# Name of the file to fetch from the Google bucket (keep the quotation marks)
name_of_file_in_bucket <- 'AUD_Summary_Comorbidity_v2.csv'

# ---- Template code below: normally left unchanged ----

# The bucket name is read from the WORKSPACE_BUCKET environment variable
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket into the current workspace directory
system(
  sprintf("gsutil cp %s/data/%s .", my_bucket, name_of_file_in_bucket),
  intern = TRUE
)

# Read the downloaded file into a dataframe and preview it
como <- read_csv(name_of_file_in_bucket)
head(como)

In [None]:
# Find the dimensions (rows, columns) of the comorbidity dataframe
dim(como)

In [None]:
# Rename person_id -> PERSON_ID so como can be merged with the
# acidosis.emergent dataframe on a common key
como <- rename(como, PERSON_ID = person_id)
head(como)

In [None]:
# Left-merge the comorbidity dataframe onto the acidosis.emergent cohort:
# every cohort row is kept (all.x = TRUE), matched on PERSON_ID
acidosis_como <- merge(
  x = acidosis.emergent,
  y = como,
  by = "PERSON_ID",
  all.x = TRUE
)
head(acidosis_como)
dim(acidosis_como)

In [None]:
# Raise the notebook's table display limits to 150 columns and 200 rows
options(repr.matrix.max.cols=150, repr.matrix.max.rows=200)

In [None]:
# Check that the number of distinct patients is unchanged by the merge,
# even though the dataframe has more rows than the expected 2676 patients
length(unique(acidosis_como$PERSON_ID))
head(acidosis_como)

In [None]:
# Install UpSetR only when it is not already available, so that re-running
# the notebook does not download and reinstall the package every time
if (!requireNamespace("UpSetR", quietly = TRUE)) {
  install.packages("UpSetR")
}

In [None]:
# Set up library for graph
library(UpSetR)
library(ggplot2)
library(grid)

In [None]:
# Keep the patient ID plus the comorbidity indicator columns that feed the
# UpSet plot
input <- acidosis_como %>%
  select(
    PERSON_ID,
    HIV,
    Cerebrovascular_Disease,
    Chronic_Pulmonary_Disease,
    Congestive_Heart_Failure,
    Dementia,
    Diabetes_with_Chronic_Complications,
    Diabetes_without_Chronic_Complications,
    AIDS,
    Hemiplegia_Paraplegia,
    Liver_Disease_Mild,
    Liver_Disease_Moderate_Severe,
    Malignancy,
    Metastatic_Solid_Tumor,
    Myocardial_Infarction,
    Peptic_Ulcer_Disease,
    Peripheral_Vascular_Disease,
    Renal_Disease_Severe,
    Renal_Disease_Mild_Moderate,
    Rheumatic_Disease
  )
head(input)
dim(input)

In [None]:
# When the graph was first made there were missing values, so count the
# missing cells in the plot input
sum(is.na(input))

In [None]:
# Exclude the rows with missing values into a new dataframe, 'rem', then
# confirm that no missing cells remain and check the new dimensions
rem <- na.exclude(input)
sum(is.na(rem))
dim(rem)
# Author's note: rem has four fewer rows than input, indicating that only
# four rows had missing values.
# NOTE(review): rows equal patients only if input has one row per patient --
# confirm.

In [None]:
# Create the UpSet plot of comorbidity combinations: up to 40 intersections
# across 19 sets, most frequent first
upset(
  rem,
  nsets = 19,
  nintersects = 40,
  order.by = "freq",
  decreasing = TRUE,
  mb.ratio = c(0.6, 0.4),
  number.angles = 0,
  text.scale = 1.1,
  point.size = 2.8,
  line.size = 0.75
)

## Social Determinants of Health

In [None]:
# Load the insurance csv file from Chenxi

# Prerequisite: the setup cell has already been run.
# Downloads the file from the "data" folder of the workspace bucket and
# reads it into a dataframe.

# Name of the file to fetch from the Google bucket (keep the quotation marks)
name_of_file_in_bucket <- 'Insurance_all.csv'

# ---- Template code below: normally left unchanged ----

# The bucket name is read from the WORKSPACE_BUCKET environment variable
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket into the current workspace directory
system(
  sprintf("gsutil cp %s/data/%s .", my_bucket, name_of_file_in_bucket),
  intern = TRUE
)

# Read the downloaded file into a dataframe and preview it
sdoh <- read_csv(name_of_file_in_bucket)
head(sdoh)

In [None]:
# Rename person_id -> PERSON_ID so sdoh can be merged with the acidosis
# cohort on a common key
sdoh <- rename(sdoh, PERSON_ID = person_id)
head(sdoh)

In [None]:
# Dimensions (rows, columns) of the insurance dataframe
dim(sdoh)

In [None]:
# Left-merge the sdoh dataframe onto the acidosis.emergent cohort: every
# cohort row is kept (all.x = TRUE), matched on PERSON_ID
acidosis_sdoh <- merge(
  x = acidosis.emergent,
  y = sdoh,
  by = "PERSON_ID",
  all.x = TRUE
)
head(acidosis_sdoh)
dim(acidosis_sdoh)

In [None]:
# Keep only the sdoh (insurance) columns that feed the UpSet plot.
# The result is stored as PID_como even though it holds sdoh columns.
PID_como <- acidosis_sdoh[c(
  "empl",
  "medcare",
  "medicaid",
  "yes",
  "others",
  "empl.merge",
  "medcare.merge",
  "medicaid.merge"
)]
head(PID_como)

In [None]:
# Create the UpSet plot of insurance combinations: up to 25 intersections
# across 8 sets, most frequent first
upset(
  PID_como,
  nsets = 8,
  nintersects = 25,
  order.by = "freq",
  decreasing = TRUE,
  mb.ratio = c(0.6, 0.4),
  number.angles = 0,
  text.scale = 1.1,
  point.size = 2.8,
  line.size = 1
)

# Acidosis + Metformin Histogram

In [None]:
# Load the acidosis emergent dataframe

# Prerequisite: the setup cell has already been run.
# Downloads the file from the "data" folder of the workspace bucket and
# reads it into a dataframe.

# Name of the file to fetch from the Google bucket (keep the quotation marks)
name_of_file_in_bucket <- 'acidosis_emergent_final_cohort_AG_11022023.csv'

# ---- Template code below: normally left unchanged ----

# The bucket name is read from the WORKSPACE_BUCKET environment variable
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket into the current workspace directory
system(
  sprintf("gsutil cp %s/data/%s .", my_bucket, name_of_file_in_bucket),
  intern = TRUE
)

# Read the downloaded file into a dataframe and preview it
acidosis.emergent <- read_csv(name_of_file_in_bucket)
head(acidosis.emergent)

In [None]:
# Dimensions (rows, columns) of the reloaded emergent cohort
dim(acidosis.emergent)

In [None]:
# Load the metformin dataframe (ALL USES)

# Prerequisite: the setup cell has already been run.
# Downloads the file from the "data" folder of the workspace bucket and
# reads it into a dataframe.

# Name of the file to fetch from the Google bucket (keep the quotation marks)
name_of_file_in_bucket <- 'Metformin_Medication_AG_10122023.csv'

# ---- Template code below: normally left unchanged ----

# The bucket name is read from the WORKSPACE_BUCKET environment variable
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Copy the file from the bucket into the current workspace directory
system(
  sprintf("gsutil cp %s/data/%s .", my_bucket, name_of_file_in_bucket),
  intern = TRUE
)

# Read the downloaded file into a dataframe and preview it
metformin <- read_csv(name_of_file_in_bucket)
head(metformin)

In [None]:
# Raise the notebook's table display limits to 150 columns and 200 rows
options(repr.matrix.max.cols=150, repr.matrix.max.rows=200)

In [None]:
# Rename the person_id column in 'metformin' to PERSON_ID so it matches the
# key used by the acidosis cohort
metformin <- rename(metformin, PERSON_ID = person_id)
head(metformin)

In [None]:
# Tabulate the different types of metformin prescriptions used.
# This cell builds the count table only; the par() settings below apply to
# base-graphics plots drawn afterwards.

# Number of metformin rows per prescription type (standard_concept_name)
medication_counts <- metformin %>%
  select(PERSON_ID, standard_concept_name) %>%
  group_by(standard_concept_name) %>%
  count(standard_concept_name)
# The count column is named 'PERSON_ID'; later cells sort on that name
colnames(medication_counts) <- c('standard_concept_name','PERSON_ID')

# las = 1 keeps axis labels horizontal; the wide left margin leaves room
# for long category names
par(las = 1, mar = c(3, 15, 3, 1))

head(medication_counts)
dim(medication_counts)

In [None]:
# Sort the prescription types in descending order of their count (stored in
# the 'PERSON_ID' column). Despite its name, med_asc is in descending order.
med_asc <- medication_counts[order(-medication_counts$PERSON_ID),]
head(med_asc, 20)

# The output above shows the 20 most common prescription types for metformin

In [None]:
# Left-merge the metformin records onto the emergent acidosis cohort.
# all.x = TRUE keeps every row of acidosis.emergent; metformin rows for
# patients outside the cohort are dropped.
merged_cohort <- merge(
  x = acidosis.emergent,
  y = metformin,
  by = "PERSON_ID",
  all.x = TRUE
)
head(merged_cohort)
dim(merged_cohort)

In [None]:
# Number of distinct patients in the merged acidosis + metformin table
length(unique(merged_cohort$PERSON_ID))

In [None]:
# Reduce the merged table to the patient ID, visit type, index date,
# medication type and metformin start date.
#   index date      : VISIT_START_DATETIME
#   metformin start : drug_exposure_start_datetime
met_acid_cohort <- merged_cohort %>%
  select(
    'PERSON_ID',
    'STANDARD_CONCEPT_NAME',
    'VISIT_START_DATETIME',
    'standard_concept_name',
    'drug_exposure_start_datetime',
    'drug_type_concept_name'
  )
dim(met_acid_cohort)
head(met_acid_cohort)

In [None]:
# Derive date-only columns from the visit start and drug exposure start
# datetimes
met_acid_cohort <- met_acid_cohort %>%
  mutate(
    index_date = as.Date(VISIT_START_DATETIME),
    drug_date = as.Date(drug_exposure_start_datetime)
  )
head(met_acid_cohort)

In [None]:
# Add the difference between the metformin start date and the acidosis index
# date (drug_date minus index_date, so negative means metformin came first)
met_acid_cohort <- met_acid_cohort %>%
  mutate(date_diff = drug_date - index_date)
head(met_acid_cohort)

In [None]:
# Store the date difference as a plain integer for plotting
met_acid_cohort <- met_acid_cohort %>%
  mutate(date_integer = as.integer(date_diff))
head(met_acid_cohort)

In [None]:
# Histogram of the day difference between metformin start and acidosis
# index date
hist(
  met_acid_cohort[["date_integer"]],
  xlab = "Time Before/After Acidosis Date (Days)",
  main = "Metformin Start Date - Acidosis Index Date",
  col = "light blue"
)