# All cancer data (cases and potential controls)

In [None]:
# Remove every object from the current workspace so the notebook starts clean
rm(list = ls())
# Attach the packages used by the cells below
library(tidyverse)
library(bigrquery)
library(data.table)

## Load cancer data set

In [None]:
name_of_file_in_bucket <- "cleaned_AoU_WGS_cancer_data.csv"

# Get the bucket name
my_bucket <- Sys.getenv("WORKSPACE_BUCKET")
if (!nzchar(my_bucket)) {
  stop("Environment variable WORKSPACE_BUCKET is not set", call. = FALSE)
}

# Copy csv file from the bucket to the current working space
copy_output <- system(
  paste(
    "gsutil cp",
    paste0(my_bucket, "/AoU_cancer/phenotypes/", name_of_file_in_bucket),
    "."
  ),
  intern = TRUE
)

# With intern = TRUE a non-zero exit code is only reported as a warning plus a
# "status" attribute on the result, so check it before claiming success.
copy_status <- attr(copy_output, "status")
if (!is.null(copy_status) && copy_status != 0) {
  stop("gsutil cp failed with exit status ", copy_status, call. = FALSE)
}
if (!file.exists(name_of_file_in_bucket)) {
  stop(
    "Expected file was not found after copy: ", name_of_file_in_bucket,
    call. = FALSE
  )
}

cat('[INFO] ', name_of_file_in_bucket, ' is successfully downloaded into your working space\n')

# Load dataframe from the downloaded CSV file
cancer_data <- fread(name_of_file_in_bucket)

## Filter to individuals with male or female genetic sex

**Note: This variable will be used to subset cases and controls for sex-specific cancer types**

In [None]:
# Cross-tabulate sex_at_birth against genetic_sex; NA counts are shown if any
table(
  cancer_data$sex_at_birth,
  cancer_data$genetic_sex,
  useNA = "ifany"
)

In [None]:
# Keep only rows whose genetic_sex is "Female" or "Male"
cancer_data <- filter(cancer_data, genetic_sex %in% c("Female", "Male"))

## Create a sex indicator variable for analyses

In [None]:
# Female is 1 when genetic_sex is "Female" and 0 for any other non-missing value
cancer_data <- cancer_data %>%
  mutate(Female = as.numeric(genetic_sex == "Female"))
# Sanity check: each genetic_sex level should map to a single Female value
table(cancer_data$genetic_sex, cancer_data$Female)

## Create lists of case and control IDs 

In [None]:
# Distinct participant IDs among rows flagged as cases (case == 1)
case_ids <- cancer_data %>%
  filter(case == 1) %>%
  distinct(person_id) %>%
  pull(person_id)
# Distinct participant IDs among rows flagged as controls (case == 0)
control_ids <- cancer_data %>%
  filter(case == 0) %>%
  distinct(person_id) %>%
  pull(person_id)

# Prepare phenotype and covariate data sets

## Load data on related samples

In [None]:
# Read the tab-delimited table of related sample pairs and preview its columns
relatedness <- read.delim(file = "./population_structure/relatedness.tsv")
glimpse(relatedness)

## Drop one sample from each pair with a suspected 1st degree or closer relationship

**Note: A kinship coefficient threshold of $\dfrac{1}{2^{5/2}}$ will be used (reference PMID: 20926424)**

**Note: Controls will be dropped preferentially**

In [None]:
# Kinship cut-off of 1 / 2^(5/2)
rel_thresh <- 1 / (2^(5 / 2))
# Count the pairs above / not above the cut-off
table(relatedness$kin > rel_thresh)

### Function to perform the relatedness pruning

In [None]:
# Prune related participants so that no listed pair with kinship above
# `rel_thresh` remains among the study participants, dropping controls in
# preference to cases.
#
# Arguments:
#   df         Table of pairs: column 1 = first ID, column 2 = second ID,
#              column 3 = kinship coefficient. Read by position, so a
#              data.frame, tibble or data.table all work.
#   cases      Vector of case participant IDs.
#   controls   Vector of control participant IDs.
#   rel_thresh Pairs with kinship strictly greater than this are pruned.
#
# Returns the IDs from c(cases, controls) that are kept. A before/after
# summary of case and control counts is printed with cat().
#
# Pairs are processed in row order. A pair is skipped when its kinship is NA,
# when either member is not in `cases`/`controls` (nothing to separate), or
# when one member was already removed by an earlier pair (already resolved).
# Otherwise: if exactly one member is a control it is removed; if both are
# controls or both are cases, the first ID is removed.
remove_related_individuals <- function(df, cases, controls, rel_thresh) {
  # Track the original number of cases and controls
  original_case_count <- length(cases)
  original_control_count <- length(controls)

  # Extract the columns once as plain vectors instead of indexing row by row
  first_id <- df[[1]]
  second_id <- df[[2]]
  kinship <- df[[3]]

  study_ids <- unique(c(cases, controls))
  is_control <- study_ids %in% controls

  # Only pairs above the threshold with both members in the study can matter
  relevant <- which(
    !is.na(kinship) & kinship > rel_thresh &
      first_id %in% study_ids & second_id %in% study_ids
  )
  first_pos <- match(first_id[relevant], study_ids)
  second_pos <- match(second_id[relevant], study_ids)

  # Preallocated removal flags, indexed by position in study_ids
  removed <- logical(length(study_ids))

  for (k in seq_along(relevant)) {
    a <- first_pos[k]
    b <- second_pos[k]

    # An earlier removal already separated this pair
    if (removed[a] || removed[b]) {
      next
    }

    if (!is_control[a] && is_control[b]) {
      removed[b] <- TRUE  # first is a case, second a control: drop the control
    } else {
      removed[a] <- TRUE  # first is the control, or same status: drop the first
    }
  }

  # Determine the final list of IDs to keep
  ids_to_remove <- study_ids[removed]
  ids_to_keep <- setdiff(c(cases, controls), ids_to_remove)

  # Report the number of cases and controls before and after exclusions
  final_cases <- intersect(ids_to_keep, cases)
  final_controls <- intersect(ids_to_keep, controls)

  cat("Original number of cases:", original_case_count, "\n")
  cat("Original number of controls:", original_control_count, "\n")
  cat("Number of cases removed:", original_case_count - length(final_cases), "\n")
  cat("Number of controls removed:", original_control_count - length(final_controls), "\n")
  cat("Number of cases after exclusions:", length(final_cases), "\n")
  cat("Number of controls after exclusions:", length(final_controls), "\n")

  # Return the vector of participant identifiers to keep
  ids_to_keep
}

### Conduct relatedness pruning

In [None]:
set.seed(134689)
# Prune related pairs above rel_thresh across all cases and controls
ids_to_keep <- remove_related_individuals(
  df = relatedness,
  cases = case_ids,
  controls = control_ids,
  rel_thresh = rel_thresh
)

In [None]:
# Restrict the data to participants retained by the relatedness pruning
cancer_data <- filter(cancer_data, person_id %in% ids_to_keep)

In [None]:
# Number of distinct participants remaining
length(unique(cancer_data$person_id))

## Add PCs

In [None]:
# Read the PCA eigenvector file
PCs <- fread("./ancestry/srWGS_PCA.eigenvec")

In [None]:
# Keep columns 2 through the last, i.e. discard the first column. PCs comes
# from fread(), and a bare `a:b` in j is how data.table recognises column
# positions here, so the form of this expression should not be changed.
PCs = PCs[,2:ncol(PCs)] ## Drop FID (all 0's)

In [None]:
# Join PCs onto the cancer data (person_id matched to IID); merge() defaults
# keep only IDs present in both tables
cancer_data <- merge(
  cancer_data,
  PCs,
  by.x = "person_id",
  by.y = "IID"
)

## Prepare phenotype data 

### Define cancer sets

In [None]:
# Cancer-site groupings; every value is matched against cancer_data$cancer_site
ectoderm <- c(
  "lip", "tongue", "gum", "mouth floor", "palate", "other mouth",
  "parotid gland", "salivary glands", "accessory sinuses", "thymus",
  "melanoma", "non-melanoma skin", "peripheral nervous system", "breast",
  "prostate", "eye", "meninges", "brain", "other central nervous"
)
mesoderm <- c(
  "heart and other", "bone and cartilage", "mesothelioma", "Kaposi's sarcoma",
  "retroperitoneum and peritoneum", "other soft tissue", "cervix",
  "other uterus", "endometrium", "ovary", "placenta", "testis", "kidney",
  "renal pelvis", "ureter", "bladder", "Hodgkin's lymphoma",
  "non-follicular lymphoma", "mature T and NK-cell lymphoma",
  "other non-Hodgkin's lymphoma", "other T and NK-cell lymphoma",
  "malignant immunoproliferative disease", "multiple myeloma",
  "lymphoid leukemia", "myeloid leukemia", "monocytic leukemia",
  "other leukemia"
)
endoderm <- c(
  "tonsil", "oropharynx", "nasopharynx", "pyriform sinus", "hypopharynx",
  "esophagus", "stomach", "small intestine", "appendix", "colon",
  "rectosigmoid junction", "rectum", "anus", "liver", "gallbladder",
  "pancreas", "larynx", "trachea", "lung and bronchus", "vagina",
  "thyroid gland"
)
hormone <- c(
  "breast", "endometrium", "ovary", "prostate", "testis", "thyroid gland"
)

# The smoking and infectious sets start with the same 14 head-and-neck sites
head_and_neck_sites <- c(
  "lip", "tongue", "gum", "mouth floor", "palate", "other mouth",
  "parotid gland", "salivary glands", "tonsil", "oropharynx", "nasopharynx",
  "pyriform sinus", "hypopharynx", "other head and neck"
)
smoking <- c(
  head_and_neck_sites,
  "esophagus", "stomach", "liver", "pancreas", "larynx", "trachea",
  "lung and bronchus", "cervix", "kidney", "renal pelvis", "ureter",
  "bladder", "colon", "rectosigmoid junction", "rectum", "anus"
)
infectious <- c(
  head_and_neck_sites,
  "stomach", "anus", "liver", "gallbladder", "larynx", "lung and bronchus",
  "Kaposi's sarcoma", "vulva", "cervix", "Hodgkin's lymphoma",
  "follicular lymphoma", "non-follicular lymphoma",
  "mature T and NK-cell lymphoma", "other non-Hodgkin's lymphoma",
  "other T and NK-cell lymphoma"
)

# person_id of every row whose cancer_site belongs to each grouped set
in_ectoderm <- select(filter(cancer_data, cancer_site %in% ectoderm), person_id)
in_mesoderm <- select(filter(cancer_data, cancer_site %in% mesoderm), person_id)
in_endoderm <- select(filter(cancer_data, cancer_site %in% endoderm), person_id)
in_hormone <- select(filter(cancer_data, cancer_site %in% hormone), person_id)
in_smoking <- select(filter(cancer_data, cancer_site %in% smoking), person_id)
in_infectious <- select(filter(cancer_data, cancer_site %in% infectious), person_id)

# Return a one-column table (person_id) with every row of `data` whose
# cancer_site equals `site`. When `female` is 0 or 1, rows are additionally
# restricted to that value of the Female indicator (sex-specific cancers).
site_person_ids <- function(data, site, female = NULL) {
  hits <- data %>%
    filter(cancer_site == !!site)
  if (!is.null(female)) {
    hits <- hits %>%
      filter(Female == !!female)
  }
  hits %>%
    select(person_id)
}

in_anus <- site_person_ids(cancer_data, "anus")
in_bladder <- site_person_ids(cancer_data, "bladder")
in_bone <- site_person_ids(cancer_data, "bone and cartilage")
in_brain <- site_person_ids(cancer_data, "brain")
# Restricted to Female == 1 so cases match the control coding, where
# Female == 0 controls are set to NA for breast
in_breast <- site_person_ids(cancer_data, "breast", female = 1)
in_cervix <- site_person_ids(cancer_data, "cervix", female = 1)
in_colon <- site_person_ids(cancer_data, "colon")
in_endometrium <- site_person_ids(cancer_data, "endometrium", female = 1)
in_esophagus <- site_person_ids(cancer_data, "esophagus")
in_eye <- site_person_ids(cancer_data, "eye")
in_follicular_lymphoma <- site_person_ids(cancer_data, "follicular lymphoma")
in_heart <- site_person_ids(cancer_data, "heart and other")
in_Hodgkins_lymphoma <- site_person_ids(cancer_data, "Hodgkin's lymphoma")
in_kidney <- site_person_ids(cancer_data, "kidney")
in_larynx <- site_person_ids(cancer_data, "larynx")
in_liver <- site_person_ids(cancer_data, "liver")
in_lung <- site_person_ids(cancer_data, "lung and bronchus")
in_lymphoid_leukemia <- site_person_ids(cancer_data, "lymphoid leukemia")
in_MID <- site_person_ids(cancer_data, "malignant immunoproliferative disease")
in_M_TNK_lymphoma <- site_person_ids(cancer_data, "mature T and NK-cell lymphoma")
in_melanoma <- site_person_ids(cancer_data, "melanoma")
in_myeloma <- site_person_ids(cancer_data, "multiple myeloma")
in_myeloid_leukemia <- site_person_ids(cancer_data, "myeloid leukemia")
in_NF_lymphoma <- site_person_ids(cancer_data, "non-follicular lymphoma")
in_non_melanoma <- site_person_ids(cancer_data, "non-melanoma skin")
in_oropharynx <- site_person_ids(cancer_data, "oropharynx")
# OCNS = "other central nervous" (previously matched "other soft tissue")
in_OCNS <- site_person_ids(cancer_data, "other central nervous")
in_other_uterus <- site_person_ids(cancer_data, "other uterus", female = 1)
in_ovary <- site_person_ids(cancer_data, "ovary", female = 1)
in_pancreas <- site_person_ids(cancer_data, "pancreas")
in_prostate <- site_person_ids(cancer_data, "prostate", female = 0)
in_rectosigmoid <- site_person_ids(cancer_data, "rectosigmoid junction")
in_rectum <- site_person_ids(cancer_data, "rectum")
in_renal_pelvis <- site_person_ids(cancer_data, "renal pelvis")
in_peritoneum <- site_person_ids(cancer_data, "retroperitoneum and peritoneum")
in_small_intestine <- site_person_ids(cancer_data, "small intestine")
in_stomach <- site_person_ids(cancer_data, "stomach")
in_testis <- site_person_ids(cancer_data, "testis", female = 0)
in_thyroid <- site_person_ids(cancer_data, "thyroid gland")
in_tongue <- site_person_ids(cancer_data, "tongue")
in_tonsil <- site_person_ids(cancer_data, "tonsil")


### Prepare phenotype data set

In [None]:
## Case-control sets ##
case_data <- cancer_data %>%
  filter(case == 1)
# One row per case: keep the first row seen for each person_id
unique_cases <- case_data[!duplicated(case_data$person_id), ]

# Identifier and covariate columns carried into the phenotype table
covariate_cols <- c("person_id", "age_at_sample", "Female", paste0("PC", 1:30))

case_pheno <- unique_cases %>%
  select(all_of(covariate_cols))

# Outcome column name -> table of person_ids counted as a case for it.
# The list order is the column order of the output file.
case_sets <- list(
  ectoderm = in_ectoderm,
  mesoderm = in_mesoderm,
  endoderm = in_endoderm,
  hormone = in_hormone,
  smoking = in_smoking,
  infectious = in_infectious,
  anus = in_anus,
  bladder = in_bladder,
  brain = in_brain,
  breast = in_breast,
  cervix = in_cervix,
  colon = in_colon,
  endometrium = in_endometrium,
  esophagus = in_esophagus,
  eye = in_eye,
  follicular_lymphoma = in_follicular_lymphoma,
  heart = in_heart,
  HL = in_Hodgkins_lymphoma,
  kidney = in_kidney,
  larynx = in_larynx,
  liver = in_liver,
  lung = in_lung,
  lymphoid_leukemia = in_lymphoid_leukemia,
  MID = in_MID,
  M_TNK_lymphoma = in_M_TNK_lymphoma,
  myeloid_leukemia = in_myeloid_leukemia,
  NF_lymphoma = in_NF_lymphoma,
  non_melanoma = in_non_melanoma,
  oropharynx = in_oropharynx,
  OCNS = in_OCNS,
  other_uterus = in_other_uterus,
  ovary = in_ovary,
  pancreas = in_pancreas,
  prostate = in_prostate,
  rectosigmoid = in_rectosigmoid,
  rectum = in_rectum,
  renal_pelvis = in_renal_pelvis,
  peritoneum = in_peritoneum,
  small_intestine = in_small_intestine,
  stomach = in_stomach,
  testis = in_testis,
  thyroid = in_thyroid,
  tongue = in_tongue,
  tonsil = in_tonsil
)

# Cases: 1 when the participant is in the set, NA otherwise
case_flags <- lapply(
  case_sets,
  function(members) ifelse(case_pheno$person_id %in% members$person_id, 1, NA)
)
case_pheno <- case_pheno %>%
  mutate(!!!case_flags)

control_data <- cancer_data %>%
  filter(case == 0)

control_pheno <- control_data %>%
  select(all_of(covariate_cols))

# Controls: 0 everywhere, except NA for the sex a sex-specific outcome excludes
female_only <- c("breast", "cervix", "endometrium", "other_uterus", "ovary")
male_only <- c("prostate", "testis")
control_is_male <- control_pheno$Female == 0
control_flags <- lapply(
  setNames(nm = names(case_sets)),
  function(outcome) {
    if (outcome %in% female_only) {
      ifelse(control_is_male, NA, 0)
    } else if (outcome %in% male_only) {
      ifelse(control_is_male, 0, NA)
    } else {
      rep(0, length(control_is_male))
    }
  }
)
control_pheno <- control_pheno %>%
  mutate(!!!control_flags)

## Combine into final phenotype dataset ##
pheno <- arrange(rbind(case_pheno, control_pheno), person_id)

## Write final phenotype dataset ##
write.table(
  pheno,
  file = "phenotypes.txt",
  quote = FALSE,
  sep = " ",
  row.names = FALSE,
  col.names = TRUE
)

### Prepare sample lists

In [None]:
# Write files with participant IDs for analyses #

## IDs (single column per row) ##
IDs <- unique(cancer_data$person_id)

## PLINK-format IDs (two columns per row) ##
PLINK_format_IDs <- data.frame(FID = 0, IID = IDs)


## Write and upload ID files ##
# Both files are written without header, row names or quoting
write.table(
  IDs,
  file = "samples.txt",
  quote = FALSE,
  sep = " ",
  row.names = FALSE,
  col.names = FALSE
)
write.table(
  PLINK_format_IDs,
  file = "PLINK_samples.txt",
  quote = FALSE,
  sep = " ",
  row.names = FALSE,
  col.names = FALSE
)

## Save phenotype data

In [None]:
destination_filename <- "phenotypes.txt"

# Copy file to the bucket
bucket_dir <- paste0(my_bucket, "/austin_working/dissertation_project/phenotypes/")
system(paste("gsutil cp", destination_filename, bucket_dir), intern = TRUE)

In [None]:
destination_filename <- "samples.txt"

# Copy file to the bucket
bucket_dir <- paste0(my_bucket, "/austin_working/dissertation_project/phenotypes/")
system(paste("gsutil cp", destination_filename, bucket_dir), intern = TRUE)

In [None]:
destination_filename <- "PLINK_samples.txt"

# Copy file to the bucket
bucket_dir <- paste0(my_bucket, "/austin_working/dissertation_project/phenotypes/")
system(paste("gsutil cp", destination_filename, bucket_dir), intern = TRUE)

In [None]:
# Counts of 0 and 1 for the ectoderm outcome (NA omitted by table())
table(pheno[["ectoderm"]])

In [None]:
# Counts of 0 and 1 for the mesoderm outcome (NA omitted by table())
table(pheno[["mesoderm"]])

In [None]:
# Counts of 0 and 1 for the endoderm outcome (NA omitted by table())
table(pheno[["endoderm"]])

In [None]:
# Counts of 0 and 1 for the hormone outcome (NA omitted by table())
table(pheno[["hormone"]])

In [None]:
# Counts of 0 and 1 for the smoking outcome (NA omitted by table())
table(pheno[["smoking"]])

In [None]:
# Counts of 0 and 1 for the infectious outcome (NA omitted by table())
table(pheno[["infectious"]])

# Prepare individual cancer phenotype files

## Define small cancer groups

In [None]:
# Small groups of related cancer sites, matched against cancer_site
colorectal <- c("colon", "rectum", "rectosigmoid junction")
NHL <- c(
  "non-follicular lymphoma", "mature T and NK-cell lymphoma",
  "other non-Hodgkin's lymphoma", "other T and NK-cell lymphoma"
)
leukemias <- c(
  "lymphoid leukemia", "myeloid leukemia", "monocytic leukemia",
  "other leukemia"
)
oral <- c(
  "lip", "tongue", "gum", "mouth floor", "palate", "other mouth",
  "parotid gland", "salivary glands", "accessory sinuses"
)
neck <- c(
  "oropharynx", "nasopharynx", "pyriform sinus", "hypopharynx", "larynx"
)

## Subset cancer cases by cancer type or small group

In [None]:
# person_id of rows belonging to each small group of sites
in_colorectal <- select(filter(cancer_data, cancer_site %in% colorectal), person_id)
in_NHL <- select(filter(cancer_data, cancer_site %in% NHL), person_id)
in_leukemias <- select(filter(cancer_data, cancer_site %in% leukemias), person_id)
in_oral <- select(filter(cancer_data, cancer_site %in% oral), person_id)
in_neck <- select(filter(cancer_data, cancer_site %in% neck), person_id)

# person_id of rows for each individual site; sex-specific sites also filter
# on the Female indicator
in_anus <- select(filter(cancer_data, cancer_site == "anus"), person_id)
in_bladder <- select(filter(cancer_data, cancer_site == "bladder"), person_id)
in_bone <- select(filter(cancer_data, cancer_site == "bone and cartilage"), person_id)
in_brain <- select(filter(cancer_data, cancer_site == "brain"), person_id)
in_breast <- select(filter(cancer_data, cancer_site == "breast", Female == 1), person_id)
in_cervix <- select(filter(cancer_data, cancer_site == "cervix", Female == 1), person_id)
in_endometrium <- select(filter(cancer_data, cancer_site == "endometrium", Female == 1), person_id)
in_esophagus <- select(filter(cancer_data, cancer_site == "esophagus"), person_id)
in_eye <- select(filter(cancer_data, cancer_site == "eye"), person_id)
in_Hodgkins_lymphoma <- select(filter(cancer_data, cancer_site == "Hodgkin's lymphoma"), person_id)
in_kidney <- select(filter(cancer_data, cancer_site == "kidney"), person_id)
in_liver <- select(filter(cancer_data, cancer_site == "liver"), person_id)
in_lung <- select(filter(cancer_data, cancer_site == "lung and bronchus"), person_id)
in_melanoma <- select(filter(cancer_data, cancer_site == "melanoma"), person_id)
in_myeloma <- select(filter(cancer_data, cancer_site == "multiple myeloma"), person_id)
in_non_melanoma <- select(filter(cancer_data, cancer_site == "non-melanoma skin"), person_id)
in_ovary <- select(filter(cancer_data, cancer_site == "ovary", Female == 1), person_id)
in_pancreas <- select(filter(cancer_data, cancer_site == "pancreas"), person_id)
in_prostate <- select(filter(cancer_data, cancer_site == "prostate", Female == 0), person_id)
in_stomach <- select(filter(cancer_data, cancer_site == "stomach"), person_id)
in_testis <- select(filter(cancer_data, cancer_site == "testis", Female == 0), person_id)
in_thyroid <- select(filter(cancer_data, cancer_site == "thyroid gland"), person_id)

## Prepare phenotype data set

In [None]:
## Case-control sets ##
case_data <- cancer_data %>%
  filter(case == 1)
# One row per case: keep the first row seen for each person_id
unique_cases <- case_data[!duplicated(case_data$person_id), ]

# Identifier and covariate columns carried into the phenotype table
covariate_cols <- c("person_id", "age_at_sample", "Female", paste0("PC", 1:30))

case_pheno <- unique_cases %>%
  select(all_of(covariate_cols))

# Outcome column name -> table of person_ids counted as a case for it.
# The list order is the column order of the output file.
case_sets <- list(
  colorectal = in_colorectal,
  NHL = in_NHL,
  leukemias = in_leukemias,
  oral = in_oral,
  neck = in_neck,
  anus = in_anus,
  bladder = in_bladder,
  brain = in_brain,
  breast = in_breast,
  bone = in_bone,
  prostate = in_prostate,
  cervix = in_cervix,
  endometrium = in_endometrium,
  esophagus = in_esophagus,
  eye = in_eye,
  HL = in_Hodgkins_lymphoma,
  kidney = in_kidney,
  liver = in_liver,
  lung = in_lung,
  non_melanoma = in_non_melanoma,
  melanoma = in_melanoma,
  myeloma = in_myeloma,
  ovary = in_ovary,
  pancreas = in_pancreas,
  stomach = in_stomach,
  testis = in_testis,
  thyroid = in_thyroid
)

# Cases: 1 when the participant is in the set, NA otherwise
case_flags <- lapply(
  case_sets,
  function(members) ifelse(case_pheno$person_id %in% members$person_id, 1, NA)
)
case_pheno <- case_pheno %>%
  mutate(!!!case_flags)

control_data <- cancer_data %>%
  filter(case == 0)

control_pheno <- control_data %>%
  select(all_of(covariate_cols))

# Controls: 0 everywhere, except NA for the sex a sex-specific outcome excludes
female_only <- c("breast", "cervix", "endometrium", "ovary")
male_only <- c("prostate", "testis")
control_is_male <- control_pheno$Female == 0
control_flags <- lapply(
  setNames(nm = names(case_sets)),
  function(outcome) {
    if (outcome %in% female_only) {
      ifelse(control_is_male, NA, 0)
    } else if (outcome %in% male_only) {
      ifelse(control_is_male, 0, NA)
    } else {
      rep(0, length(control_is_male))
    }
  }
)
control_pheno <- control_pheno %>%
  mutate(!!!control_flags)

## Combine into final phenotype dataset ##
pheno <- arrange(rbind(case_pheno, control_pheno), person_id)

## Write final phenotype dataset ##
write.table(
  pheno,
  file = "single_cancer_phenotypes.txt",
  quote = FALSE,
  sep = " ",
  row.names = FALSE,
  col.names = TRUE
)

In [None]:
destination_filename <- "single_cancer_phenotypes.txt"

# Copy file to the bucket
bucket_dir <- paste0(my_bucket, "/austin_working/dissertation_project/phenotypes/")
system(paste("gsutil cp", destination_filename, bucket_dir), intern = TRUE)

# Check and correct issues with eye and non-melanoma skin cancer sets

## Look at predicted ancestry 

In [None]:
# Read the predicted-ancestry table
ancestry <- fread("./ancestry/ancestry_preds.tsv")

In [None]:
# Rows with a non-missing eye outcome, plus covariates
eye <- pheno %>%
  filter(eye >= 0) %>%
  select(all_of(c(
    "person_id", "age_at_sample", "Female", paste0("PC", 1:30), "eye"
  )))

In [None]:
# Attach predicted ancestry (person_id matched to research_id)
eye <- merge(
  eye,
  ancestry,
  by.x = "person_id",
  by.y = "research_id"
)

In [None]:
# Eye outcome counts within each predicted ancestry group
table(
  eye$ancestry_pred,
  eye$eye
)

In [None]:
# Rows with a non-missing non_melanoma outcome, plus covariates
non_melanoma <- pheno %>%
  filter(non_melanoma >= 0) %>%
  select(all_of(c(
    "person_id", "age_at_sample", "Female", paste0("PC", 1:30), "non_melanoma"
  )))

In [None]:
# Attach predicted ancestry (person_id matched to research_id)
non_melanoma <- merge(
  non_melanoma,
  ancestry,
  by.x = "person_id",
  by.y = "research_id"
)

In [None]:
# Non-melanoma outcome counts within each predicted ancestry group
table(
  non_melanoma$ancestry_pred,
  non_melanoma$non_melanoma
)

## Refine sets with more stringent relatedness pruning (exclude one individual from each pair with a kinship coefficient > 0.1)

In [None]:
# Rows with a non-missing eye or non_melanoma outcome, plus covariates
eye_NMSC <- pheno %>%
  filter(eye >= 0 | non_melanoma >= 0) %>%
  select(all_of(c(
    "person_id", "age_at_sample", "Female", paste0("PC", 1:30),
    "eye", "non_melanoma"
  )))

In [None]:
# Stricter kinship cut-off for this subset
rel_thresh <- 0.1
# Count the pairs above / not above the cut-off
table(relatedness$kin > rel_thresh)

In [None]:
set.seed(134689)
# Cases: coded 1 for either outcome
case_ids <- eye_NMSC %>%
  filter(eye == 1 | non_melanoma == 1) %>%
  pull(person_id)
# Controls: coded 0 for either outcome
control_ids <- eye_NMSC %>%
  filter(eye == 0 | non_melanoma == 0) %>%
  pull(person_id)

In [None]:
# Inspect type and length of both ID vectors
str(object = case_ids)
str(object = control_ids)

In [None]:
# Re-run the pruning on the eye / non-melanoma subset with the stricter cut-off
ids_to_keep <- remove_related_individuals(
  df = relatedness,
  cases = case_ids,
  controls = control_ids,
  rel_thresh = rel_thresh
)

In [None]:
# Keep only participants retained by the stricter pruning
eye_NMSC <- filter(eye_NMSC, person_id %in% ids_to_keep)

## Randomly downsample controls 

In [None]:
set.seed(42857920)
# Row positions coded 0 for both outcomes
control_rows <- which(eye_NMSC$eye == 0 & eye_NMSC$non_melanoma == 0)
# Draw 75% of those positions (without replacement) to be dropped
n_control_rows <- length(control_rows)
remove_indices <- control_rows[
  sample.int(n_control_rows, n_control_rows * 0.75)
]

In [None]:
# Keep every row whose position is not listed in remove_indices. A logical
# mask is used rather than negative indexing because x[-integer(0), ] selects
# zero rows, which would empty the table if nothing was drawn for removal.
eye_NMSC <- eye_NMSC[!(seq_len(nrow(eye_NMSC)) %in% remove_indices), ]

In [None]:
# Counts of 0 and 1 for the eye outcome after downsampling
table(eye_NMSC[["eye"]])

In [None]:
# Counts of 0 and 1 for the non_melanoma outcome after downsampling
table(eye_NMSC[["non_melanoma"]])

In [None]:
## Write new phenotype dataset ##
destination_filename <- "eye_NMSC_cancer_phenotypes.txt"
write.table(
  eye_NMSC,
  file = "eye_NMSC_cancer_phenotypes.txt",
  quote = FALSE,
  sep = " ",
  row.names = FALSE,
  col.names = TRUE
)

# Copy file to the bucket
bucket_dir <- paste0(my_bucket, "/austin_working/dissertation_project/phenotypes/")
system(paste("gsutil cp", destination_filename, bucket_dir), intern = TRUE)

In [None]:
# Print the number of distinct participants recorded for each cancer site,
# in alphabetical order of site (sort() drops a missing site value)
for (site_name in sort(unique(cancer_data$cancer_site))) {
  site_rows <- filter(cancer_data, cancer_site == !!site_name)
  n_people <- nrow(distinct(site_rows, person_id, .keep_all = TRUE))

  print(paste0(site_name, " = ", n_people))
}