QC Array
========

***Analysis step***

**Author:** *Jay Kim*

## Test sample contamination (bafRegress)
G. Jun, M. Flickinger, K. N. Hetrick, J. M. Romm, K. F. Doheny, G. Abecasis, M. Boehnke, and H. M. Kang, _Detecting and Estimating Contamination of Human DNA Samples in Sequencing and Array-Based Genotype Data_, American Journal of Human Genetics, volume 91, issue 5, pp. 839–848, doi:10.1016/j.ajhg.2012.09.004

### Get all sample metadata and gencall data for project/batch:

In [8]:
library(purrr)
library(dplyr, warn.conflicts = FALSE)
library(tidyr)

# Get the bead pool manifest data: download the .bpm file from S3 to /tmp and
# parse it with the IlluminaBeadArrayFiles Python module (loaded via reticulate).
s3 <- paws::s3()
bucket <- 'pbc-iscan-qcarrays'
filename <- "InfiniumQCArray-24v1-0_A4.bpm"
bpm <- paste0("/tmp/", filename)
s3$download_file(bucket, filename, bpm)
bead.array.files <- reticulate::import('IlluminaBeadArrayFiles')
manifest <- bead.array.files$BeadPoolManifest(bpm)

# TODO: Fetch the sample metadata from the database
#json <- httr::content(httr::GET("http://{endpoint}/sampleinfo?batchid={JIRA}"), "text")
# NOTE(review): both placeholder records below point at the same
# Barcode/Position, so they resolve to the same .gtc file -- confirm this is
# intended before replacing the stub with real metadata.
json <-
'[
    {"Sample_ID" : "CPT0nnnnn_0001", "Barcode" : "203323200003", "Position" : "R11C02"},
    {"Sample_ID" : "CPT0nnnnn_0002", "Barcode" : "203323200003", "Position" : "R11C02"}
]'
# simplifyDataFrame=FALSE keeps one list element per sample record instead of
# collapsing the records into a data frame.
sample.info <- jsonlite::fromJSON(json, simplifyDataFrame=FALSE)
# Key the records by Sample_ID. map_chr() returns the character vector that
# names<- expects and stops with an error if a record has no Sample_ID, rather
# than silently naming that record "NULL".
names(sample.info) <- map_chr(sample.info, ~ as.character(.$Sample_ID))
# Download one sample's GenCall (.gtc) file from S3 and collect its per-SNP
# data.
#   sample: one record of sample.info; its Barcode and Position fields form
#           the S3 key "<Barcode>_<Position>.gtc".
# Returns a data.frame with columns snp (taken from manifest$names), baf and
# abgeno.
get.sample.data <- function(sample) {
    gtc.key <- glue::glue('{sample$Barcode}_{sample$Position}.gtc')
    local.path <- paste0("/tmp/", gtc.key)
    s3$download_file(bucket, gtc.key, local.path)
    calls <- bead.array.files$GenotypeCalls(local.path)
    baf <- calls$get_ballele_freqs()
    # Translate each genotype code into its label; the +1 presumably maps
    # 0-based codes onto R's 1-based indexing -- TODO confirm.
    genotype.labels <- map(calls$get_genotypes(),
                           function(code) bead.array.files$code2genotype[code + 1])
    abgeno <- unlist(genotype.labels)
    data.frame(snp=manifest$names, baf, abgeno)
}

# One data frame per sample, in the same order and with the same names as
# sample.info.
sample.data <- map(sample.info, get.sample.data)

### Aggregate project/batch MAFs across entire study/cohort:

In [21]:
# for each project/batch in the study/cohort

    # TODO: See if the MAFs for this project/batch are in the cache

    # else calculate them
    # Frequency of the B allele at a SNP, given the number of samples called
    # AA, AB and BB: each AB carries one B allele, each BB carries two, out of
    # two alleles per called sample.
    maf.calc <- function(AA, AB, BB) (AB+2*BB)/(2*(AA+AB+BB))
    # Count the genotype classes directly per SNP. Reshaping the counts into
    # AA/AB/BB columns fails with "object not found" whenever one class never
    # occurs in the batch, because no column is created for it. Any genotype
    # label other than AA/AB/BB is ignored, and a SNP with no AA/AB/BB calls
    # yields NaN (0/0).
    # The result has one row per snp (ordered by snp) and the columns snp, maf.
    maf <- bind_rows(sample.data, .id="Sample_ID") %>%
      group_by(snp) %>%
      summarise(maf=maf.calc(sum(abgeno == "AA", na.rm=TRUE),
                             sum(abgeno == "AB", na.rm=TRUE),
                             sum(abgeno == "BB", na.rm=TRUE)))

    # TODO: Cache the MAFs for this project/batch

# TODO: Calculate the combined MAFs across all projects/batches in the study/cohort

### Test each sample for contamination:

In [20]:
# Load the contamination test (testsamplecontamination is expected to be
# defined by bafRegress.R).
source("bafRegress.R")
# Test every sample against the batch allele frequencies.
# maf holds one row per snp ordered by snp name (it is a group_by/summarise
# result), whereas each sample data frame keeps the manifest order. Look the
# frequencies up by snp so that baf, abgeno and maf describe the same SNP at
# every position instead of relying on the two orders happening to agree.
baf_results <- map(sample.data, function(sample) {
    sample.maf <- maf$maf[match(sample$snp, maf$snp)]
    testsamplecontamination(sample$baf, sample$abgeno, sample.maf)
})

# Analysis

Set the comparison prefix to the name of the current working directory (the last path component of `getwd()`).

In [None]:
# Comparison prefix: the last path component of the current working directory.
comparison <- basename(getwd())

## Import gencalls

In [14]:
# Load the project helper functions. import.gencalls is not defined in this
# notebook and is presumably provided by analysis-utils.R -- verify.
source("analysis-utils.R")
# Import the gencall data for the samples listed in sample.info.
raw_data <- import.gencalls(sample.info)
# Display an overview of the imported object and its first entries in the
# cell output.
summary(raw_data)
head(raw_data)

--- raw_data ---
A genotypes object with 15949 sites x 1 samples
Allele encoding: native 
Intensity data: yes (raw) 
Sample metadata: yes ( 0 male / 1 female / 0 unknown )
Filters set: 0 sites / 0 samples 
Checksum: c730e3dd28525c0e2a47d0c908838ad3 
Genotypes matrix:
                CPT0n 
 2010-08-Y-1111     H 
 2010-08-Y-1221     H 
 2010-08-Y-1995     H 
 2010-08-Y-2045     H 
 2010-08-Y-3042     H 
 2010-08-Y-3189     H 
 2010-08-Y-3314     H 
 2010-08-Y-3348     H 
 2010-08-Y-3576     H 
  2010-08-Y-749     H 

Marker map:
 chr         marker cM pos A1 A2
   0 2010-08-Y-1111 NA   0  A  G
   0 2010-08-Y-1221 NA   0  A  G
   0 2010-08-Y-1995 NA   0  A  C
   0 2010-08-Y-2045 NA   0  A  G
   0 2010-08-Y-3042 NA   0  T  C
   0 2010-08-Y-3189 NA   0  A  C
   0 2010-08-Y-3314 NA   0  T  C
   0 2010-08-Y-3348 NA   0  T  C
   0 2010-08-Y-3576 NA   0  T  C
   0  2010-08-Y-749 NA   0  T  C

Sample info:
            fid            iid mom dad sex pheno
 CPT0nnnnn_0001 CPT0nnnnn_0001   0   0  

## QC .idat

In [None]:
# ...

## Import .gds file

In [None]:
# ...