QC Array
========

***Analysis step***

**Author:** *Jay Kim*

## Test sample contamination (bafRegress)
G. Jun, M. Flickinger, K. N. Hetrick, J. M. Romm, K. F. Doheny, G. Abecasis, M. Boehnke, and H. M. Kang, _Detecting and Estimating Contamination of Human DNA Samples in Sequencing and Array-Based Genotype Data_, American Journal of Human Genetics, volume 91, issue 5, pp. 839-848, doi:10.1016/j.ajhg.2012.09.004

In [None]:
### Read in bead pool manifest and fetch sample metadata from the database

In [100]:
library(plyr)
library(dplyr, warn.conflicts = FALSE)
library(tidyr)
library(purrr)

# Get the bead pool manifest data.
# The .bpm file is downloaded from S3 only when no copy exists under /tmp,
# so re-running this cell reuses the previously downloaded file.
s3 <- paws::s3()
bucket <- 'pbc-iscan-qcarrays'
key <- "InfiniumQCArray-24v1-0_A4.bpm"
filename <- paste0("/tmp/", key)
if (!file.exists(filename)) {
    s3$download_file(bucket, key, filename)
}
# IlluminaBeadArrayFiles is a Python package accessed through reticulate.
bead.array.files <- reticulate::import('IlluminaBeadArrayFiles')
manifest <- bead.array.files$BeadPoolManifest(filename)

# TODO: Fetch the sample metadata from the database
#json <- httr::content(httr::GET("http://{endpoint}/sampleinfo?batchid={JIRA}"))
# Until the endpoint exists, a hard-coded JSON document stands in for the
# response: one object per sample with Sample_ID, Barcode and Position.
json <-
'[
    {"Sample_ID" : "CPT0nnnnn_0001", "Barcode" : "203323200003", "Position" : "R11C02"},
    {"Sample_ID" : "CPT0nnnnn_0002", "Barcode" : "203323200003", "Position" : "R11C02"}
]'
# simplifyDataFrame=FALSE keeps one list element per sample, so the records
# can be iterated with map() later on.
sample.info <- jsonlite::fromJSON(json, simplifyDataFrame=FALSE)
# Name every record by its Sample_ID. map_chr() returns a character vector
# and raises an error when a record has no Sample_ID, instead of letting
# `names<-` silently coerce a list (turning a missing id into "NULL").
names(sample.info) <- map_chr(sample.info, "Sample_ID")

### Get baf and genotype data for all samples in the project/batch:

In [93]:
# Get genotype data for each sample in the sample list
#
# get.baf.data(sample)
#   Reads the genotype-call (.gtc) file of a single sample.
#
#   Args:
#     sample: a list with `Barcode` and `Position` elements; the .gtc object
#             key is built as "<Barcode>_<Position>.gtc".
#
#   Returns:
#     A list with two elements:
#       baf    - the value of gtc$get_ballele_freqs()
#       abgeno - the genotype codes from gtc$get_genotypes(), translated
#                through the package's `code2genotype` table
#
#   Relies on the globals `s3`, `bucket` and `bead.array.files`. The file is
#   downloaded from S3 only when it is not already present under /tmp.
get.baf.data <- function(sample) {
    key <- glue::glue('{sample$Barcode}_{sample$Position}.gtc')
    filename <- paste0("/tmp/",key)
    if (!file.exists(filename)) {
        s3$download_file(bucket, key, filename)
    }
    gtc <- bead.array.files$GenotypeCalls(filename)
    baf <- gtc$get_ballele_freqs()
    # Fetch the lookup table once: every `bead.array.files$...` access goes
    # through reticulate, so doing it per marker is needlessly slow.
    code2genotype <- bead.array.files$code2genotype
    genotype.codes <- unlist(gtc$get_genotypes())
    # The codes are used as 0-based positions in the table, hence the +1 for
    # R's 1-based indexing; a single vectorised lookup replaces the
    # per-element map().
    abgeno <- unlist(code2genotype[genotype.codes + 1])
    list(baf = baf, abgeno = abgeno)
}
# One result per sample; map() keeps the names of sample.info.
baf.data <- sample.info %>% map(get.baf.data)

### Aggregate project/batch MAFs across entire study/cohort:

In [90]:
# TODO: Fetch the batch metadata from the database
#json <- httr::content(httr::GET("http://{endpoint}/batch?to={YYYYMMDD}"))
# Until the endpoint exists, a hard-coded JSON document stands in for the
# response: one object per batch with its JIRA key and sample count.
json <-
'[
    {"JIRA" : "JEWSC_YYYMMDD_QCarray", "nsamples" : "20"}
]'
batches <- jsonlite::fromJSON(json, simplifyDataFrame=FALSE)
# Name every batch by its JIRA key. map_chr() returns a character vector and
# raises an error when a record has no JIRA field, instead of letting
# `names<-` silently coerce a list.
names(batches) <- map_chr(batches, "JIRA")
# NOTE(review): generate.popmaf() is not defined in any cell above this one;
# confirm that the file defining it is sourced before this cell runs.
popmaf <- generate.popmaf(batches)

### Test each sample for contamination:

In [96]:
# Load the bafRegress code, then run the contamination test once per sample,
# passing that sample's B-allele frequencies and genotypes together with the
# `maf` element of popmaf.
source("bafRegress.R")
baf.results <- map(
    baf.data,
    function(sample.data) {
        testsamplecontamination(sample.data$baf, sample.data$abgeno, popmaf$maf)
    }
)

# Analysis

Set the comparison prefix to the name of the current working directory (the last component of its path).

In [None]:
# Comparison prefix: the last path component of the current working directory.
comparison <- basename(getwd())

## Import gencalls

In [124]:
# Load the project helper script; import.gencalls() is presumably defined
# there (the file is not visible from this notebook -- confirm).
source("analysis-utils.R")
# Build the genotype data set from the sample records and the bead pool
# manifest created in the earlier cells.
raw_data <- import.gencalls(sample.info, manifest)
# Both calls are left as bare top-level expressions so that the notebook
# prints their results as the cell output.
summary(raw_data)
head(raw_data)

[1] 2
[1] 2
CPT0nnnnn_0001 CPT0nnnnn_0002 
             2              2 
--- raw_data ---
A genotypes object with 15949 sites x 2 samples
Allele encoding: native 
Intensity data: yes (raw) 
Sample metadata: yes ( 0 male / 2 female / 0 unknown )
Filters set: 0 sites / 0 samples 
Checksum: 2ebb6487683dbda81c6458d34d10d644 
Genotypes matrix:
                CPT0n CPT0n 
 2010-08-Y-1111     H     H 
 2010-08-Y-1221     H     H 
 2010-08-Y-1995     H     H 
 2010-08-Y-2045     H     H 
 2010-08-Y-3042     H     H 
 2010-08-Y-3189     H     H 
 2010-08-Y-3314     H     H 
 2010-08-Y-3348     H     H 
 2010-08-Y-3576     H     H 
  2010-08-Y-749     H     H 

Marker map:
 chr         marker cM pos A1 A2
   0 2010-08-Y-1111 NA   0  A  G
   0 2010-08-Y-1221 NA   0  A  G
   0 2010-08-Y-1995 NA   0  A  C
   0 2010-08-Y-2045 NA   0  A  G
   0 2010-08-Y-3042 NA   0  T  C
   0 2010-08-Y-3189 NA   0  A  C
   0 2010-08-Y-3314 NA   0  T  C
   0 2010-08-Y-3348 NA   0  T  C
   0 2010-08-Y-3576 NA   0  T

## QC .idat

In [None]:
# ...

## Import .gds file

In [None]:
# ...