# Set Up

In [None]:
# Install the 'fst' package only when it is not already available, so that
# re-running this cell does not download and reinstall it every time.
if (!requireNamespace("fst", quietly = TRUE)) {
  install.packages("fst", repos = "https://cloud.r-project.org")
}

In [None]:
# Load libraries
library(tidyverse)
library(fst)
library(bigrquery)
library(stringr)

# dx_pid

In [None]:
# Load the diagnosis dataset

# This snippet assumes that you run setup first

# This code copies a file from the "data" folder of your Google Bucket into
# the current workspace and then reads it into a dataframe

# Name of the file in your Google Bucket (don't delete the quotation marks)
name_of_file_in_bucket <- "AASLD_dx_cohort.fst"

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# Get the bucket name; without it the gsutil path below would be meaningless
my_bucket <- Sys.getenv("WORKSPACE_BUCKET")
if (!nzchar(my_bucket)) {
  stop("Environment variable WORKSPACE_BUCKET is not set", call. = FALSE)
}

# Copy the file from the bucket to the current workspace.
# shQuote() keeps file names with spaces or shell metacharacters intact.
copy_output <- system(
  paste0(
    "gsutil cp ",
    shQuote(paste0(my_bucket, "/data/", name_of_file_in_bucket)),
    " ."
  ),
  intern = TRUE
)

# With intern = TRUE a non-zero exit code is only reported as a warning plus a
# "status" attribute, so check it explicitly; otherwise a stale local copy of
# the file could be read without anyone noticing that the download failed.
copy_status <- attr(copy_output, "status")
if (!is.null(copy_status) && copy_status != 0) {
  stop(
    "gsutil cp failed (exit status ", copy_status, ") for ",
    name_of_file_in_bucket,
    call. = FALSE
  )
}

# Load the file into a dataframe
dx <- read_fst(name_of_file_in_bucket)
head(dx)

In [None]:
# Show the dimensions (rows, columns) of the diagnosis dataframe
dim(dx)

In [None]:
# Run a BigQuery SQL query and return its result.
#
# query: SQL text to execute.
#
# The query is submitted under the project named by the GOOGLE_PROJECT
# environment variable, and the resulting table is downloaded and returned.
download_data <- function(query) {
  billing_project <- Sys.getenv("GOOGLE_PROJECT")
  result_table <- bq_project_query(billing_project, query)
  bq_table_download(result_table)
}

In [None]:
# Dataset identifier comes from the WORKSPACE_CDR environment variable
dataset <- Sys.getenv("WORKSPACE_CDR")

# Fetch every distinct person_id found in condition_occurrence, sorted by id
dx_pid <- str_glue("SELECT distinct person_id
                                FROM {dataset}.condition_occurrence ORDER BY person_id") %>%
  download_data()

In [None]:
# Show the dimensions (rows, columns) of the queried person_id table
dim(dx_pid)

# demo_pid

In [None]:
# This snippet assumes that you run setup first

# This code copies a file from the "data" folder of your Google Bucket into
# the current workspace and then reads it into a dataframe

# Name of the file in your Google Bucket (don't delete the quotation marks)
name_of_file_in_bucket <- "AUD_Survey_Basics_Lifestyle.fst"

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# Get the bucket name; without it the gsutil path below would be meaningless
my_bucket <- Sys.getenv("WORKSPACE_BUCKET")
if (!nzchar(my_bucket)) {
  stop("Environment variable WORKSPACE_BUCKET is not set", call. = FALSE)
}

# Copy the file from the bucket to the current workspace.
# shQuote() keeps file names with spaces or shell metacharacters intact.
copy_output <- system(
  paste0(
    "gsutil cp ",
    shQuote(paste0(my_bucket, "/data/", name_of_file_in_bucket)),
    " ."
  ),
  intern = TRUE
)

# With intern = TRUE a non-zero exit code is only reported as a warning plus a
# "status" attribute, so check it explicitly; otherwise a stale local copy of
# the file could be read without anyone noticing that the download failed.
copy_status <- attr(copy_output, "status")
if (!is.null(copy_status) && copy_status != 0) {
  stop(
    "gsutil cp failed (exit status ", copy_status, ") for ",
    name_of_file_in_bucket,
    call. = FALSE
  )
}

# Load the file into a dataframe
aud_basics <- read_fst(name_of_file_in_bucket)
head(aud_basics)

In [None]:
# List the distinct survey names, then tabulate how many rows each one has
unique(aud_basics[["survey"]])
table(aud_basics["survey"])

In [None]:
# Create a data frame with just the rows of the 'The Basics' survey.
# which() keeps only positions where the comparison is TRUE; indexing with the
# raw logical vector would turn every NA in `survey` into an all-NA row, which
# would then leak an NA person_id into the cohort built from this data frame.
the.basics <- aud_basics[which(aud_basics$survey == "The Basics"), ]
head(the.basics)

In [None]:
# Count the distinct person_id values present in the.basics
the.basics$person_id %>%
  unique() %>%
  length()

In [None]:
# Reduce the.basics to its person_id column only
basics_pid <- the.basics %>%
  select(person_id)
dim(basics_pid)
head(basics_pid)

In [None]:
# Count the distinct person_id values in basics_pid
basics_pid[["person_id"]] %>%
  unique() %>%
  length()

In [None]:
# Collapse basics_pid to a single row per person_id
basics_pid <- distinct(basics_pid, person_id, .keep_all = TRUE)
head(basics_pid)
dim(basics_pid)

# Intersection of 2 cohorts

In [None]:
library(dplyr)

In [None]:
# Rows common to dx_pid and basics_pid, stored as a new dataframe
intersect1 <- dx_pid %>%
  dplyr::intersect(basics_pid)
dim(intersect1)

# Acidosis Cohort: What Overlaps?

In [None]:
## See how many of the 3,169 participants do NOT meet the inclusion/exclusion criteria.

## Participants who do not meet the criteria need to be taken out.

In [None]:
# Load the acidosis dataframe (this dataframe includes ALL acidosis visits, not just emergent)

# This snippet assumes that you run setup first

# This code copies a file from the "data" folder of your Google Bucket into
# the current workspace and then reads it into a dataframe

# Name of the file in your Google Bucket (don't delete the quotation marks)
name_of_file_in_bucket <- "Acidosis_Visit_Occurrence_AG_10162023.csv"

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# Get the bucket name; without it the gsutil path below would be meaningless
my_bucket <- Sys.getenv("WORKSPACE_BUCKET")
if (!nzchar(my_bucket)) {
  stop("Environment variable WORKSPACE_BUCKET is not set", call. = FALSE)
}

# Copy the file from the bucket to the current workspace.
# shQuote() keeps file names with spaces or shell metacharacters intact.
copy_output <- system(
  paste0(
    "gsutil cp ",
    shQuote(paste0(my_bucket, "/data/", name_of_file_in_bucket)),
    " ."
  ),
  intern = TRUE
)

# With intern = TRUE a non-zero exit code is only reported as a warning plus a
# "status" attribute, so check it explicitly; otherwise a stale local copy of
# the file could be read without anyone noticing that the download failed.
copy_status <- attr(copy_output, "status")
if (!is.null(copy_status) && copy_status != 0) {
  stop(
    "gsutil cp failed (exit status ", copy_status, ") for ",
    name_of_file_in_bucket,
    call. = FALSE
  )
}

# Load the file into a dataframe
acidosis_df <- read_csv(name_of_file_in_bucket)
head(acidosis_df)

In [None]:
# Show the dimensions (rows, columns) of the acidosis dataframe
dim(acidosis_df)

In [None]:
# Collapse acidosis_df to a single row per PERSON_ID
acidosis_df <- distinct(acidosis_df, PERSON_ID, .keep_all = TRUE)
head(acidosis_df)
dim(acidosis_df)

In [None]:
# Rename PERSON_ID to person_id so the column name matches the intersect dataframe
acidosis_df <- rename(acidosis_df, person_id = PERSON_ID)
head(acidosis_df)

In [None]:
# Overwrite acidosis_df with a version that keeps only the person_id column
acidosis_df <- acidosis_df %>%
  select(person_id)
dim(acidosis_df)
head(acidosis_df)

In [None]:
# Find the acidosis patients that are also in the inclusion/exclusion cohort.
# Acidosis cases outside that cohort will need to be removed.
intersect.acidosis <- intersect1 %>%
  dplyr::intersect(acidosis_df)
head(intersect.acidosis)
dim(intersect.acidosis)

In [None]:
# This means that 3169/3169 acidosis patients (i.e., all of them) can be included in the analysis.

In [None]:
# Save this acidosis cohort as a csv file 
# [THIS IS THE DF THAT CONTAINS ALL ACIDOSIS PTS, REGARDLESS OF VISIT TYPE]

In [None]:
# This snippet assumes that you run setup first

# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Dataframe to save
my_dataframe <- intersect.acidosis

# Name of the file to store in the bucket (don't delete the quotation marks)
destination_filename <- "acidosis_exclusion_cohort_pid_AG_11022023.csv"

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# Store the dataframe in the current workspace
write_excel_csv(my_dataframe, destination_filename)

# Get the bucket name; without it the gsutil path below would be meaningless
my_bucket <- Sys.getenv("WORKSPACE_BUCKET")
if (!nzchar(my_bucket)) {
  stop("Environment variable WORKSPACE_BUCKET is not set", call. = FALSE)
}

# Copy the file from the current workspace to the bucket.
# shQuote() keeps file names with spaces or shell metacharacters intact.
copy_output <- system(
  paste0(
    "gsutil cp ",
    shQuote(paste0("./", destination_filename)),
    " ",
    shQuote(paste0(my_bucket, "/data/"))
  ),
  intern = TRUE
)

# With intern = TRUE a non-zero exit code is only reported as a warning plus a
# "status" attribute, so check it explicitly instead of assuming the upload
# succeeded.
copy_status <- attr(copy_output, "status")
if (!is.null(copy_status) && copy_status != 0) {
  stop(
    "gsutil cp failed (exit status ", copy_status, ") for ",
    destination_filename,
    call. = FALSE
  )
}

# Check if file is in the bucket
system(paste0("gsutil ls ", my_bucket, "/data/*.csv"), intern = TRUE)


# Setdiff(Intersection - All Acidosis Cases)

In [None]:
### I think I will need to change this once I have all of the acidosis cases (emergent and 
### non emergent) - and take out the people that don't meet the necessary criteria first

## once I take out those people in step 6, then I can have the intersection between everyone
## that meets the criteria - acidosis cases


In [None]:
# Subtract the acidosis patients from the intersect cohort to create the
# control cohort.
# dplyr::setdiff is called explicitly (matching the dplyr::intersect calls
# used elsewhere in this notebook) so the data-frame method is used no matter
# which attached package currently provides the bare `setdiff` name.
control_cohort <- dplyr::setdiff(intersect1, intersect.acidosis)
dim(control_cohort)
head(control_cohort)

In [None]:
# Save the control cohort as a csv file

# This snippet assumes that you run setup first

# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Dataframe to save
my_dataframe <- control_cohort

# Name of the file to store in the bucket (don't delete the quotation marks)
destination_filename <- "acidosis_control_cohort_pid_AG_11022023.csv"

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# Store the dataframe in the current workspace
write_excel_csv(my_dataframe, destination_filename)

# Get the bucket name; without it the gsutil path below would be meaningless
my_bucket <- Sys.getenv("WORKSPACE_BUCKET")
if (!nzchar(my_bucket)) {
  stop("Environment variable WORKSPACE_BUCKET is not set", call. = FALSE)
}

# Copy the file from the current workspace to the bucket.
# shQuote() keeps file names with spaces or shell metacharacters intact.
copy_output <- system(
  paste0(
    "gsutil cp ",
    shQuote(paste0("./", destination_filename)),
    " ",
    shQuote(paste0(my_bucket, "/data/"))
  ),
  intern = TRUE
)

# With intern = TRUE a non-zero exit code is only reported as a warning plus a
# "status" attribute, so check it explicitly instead of assuming the upload
# succeeded.
copy_status <- attr(copy_output, "status")
if (!is.null(copy_status) && copy_status != 0) {
  stop(
    "gsutil cp failed (exit status ", copy_status, ") for ",
    destination_filename,
    call. = FALSE
  )
}

# Check if file is in the bucket
system(paste0("gsutil ls ", my_bucket, "/data/*.csv"), intern = TRUE)