QC Array
========

***Analysis step***

**Author:** *Jay Kim*

## Parameters

In [2]:
# Presumably the S3 bucket holding sample metadata; it is not referenced by
# any of the cells shown in this notebook section -- TODO confirm where it is used.
SAMPLE_INFO_BUCKET <- "pbc-qcarray-sample-info"
# S3 bucket read by the cells below: they open "maf/" and
# "parquet/{BATCH_NAME}/" under it, plus the QC array annotation text files.
GTC_BUCKET <- "pbc-iscan-qcarrays"
# Batch under analysis; used as the sub-directory name below "parquet/" and
# passed to calc_batch_maf().
BATCH_NAME <- "JEWSC_20211118_Qcarray"

## Setup

In [3]:
library(dplyr, warn.conflicts = FALSE)
library(arrow, warn.conflicts = FALSE)
library(glue)

## Test sample contamination (bafRegress)
G. Jun, M. Flickinger, K. N. Hetrick, J. M. Romm, K. F. Doheny, G. Abecasis, M. Boehnke, and H. M. Kang, _Detecting and Estimating Contamination of Human DNA Samples in Sequencing and Array-Based Genotype Data_, American Journal of Human Genetics, volume 91, issue 5, pp. 839–848, doi:10.1016/j.ajhg.2012.09.004

### Calculate the population MAF

In [4]:
# Load the MAF helpers (calc_batch_maf() is presumably defined in this file;
# it is not defined anywhere in the notebook itself).
source("popmaf.R")

# Refresh the cached MAFs for the current batch before aggregating.
calc_batch_maf(GTC_BUCKET, BATCH_NAME)

# Population MAF: for every marker, the plain average of the cached
# per-batch MAFs. The grouped rows are pulled into memory first, so the
# summary itself is evaluated by dplyr on a local tibble.
batch_mafs <- open_dataset(glue("s3://{GTC_BUCKET}/maf/"))
popmaf <- summarize(
    collect(group_by(batch_mafs, marker)),
    maf = sum(maf) / n()
)

FileSystemDataset with 1 Parquet file
marker: string
maf: double

### Test each sample for contamination

In [23]:
# Load the contamination test helper (testsamplecontamination() is presumably
# defined in this file; it is not defined anywhere in the notebook itself).
source("bafRegress.R")

# Open this batch's per-sample calls stored as Parquet on S3.
gtc_data <- open_dataset(glue("s3://{GTC_BUCKET}/parquet/{BATCH_NAME}/"))

# Run the contamination test once per sample, then reshape the long
# (names, fit) output into one row per sample.
# NOTE(review): `popmaf$maf` is passed positionally, so this assumes every
# sample's rows arrive in the same marker order as `popmaf` -- confirm, or
# join on the marker column before testing.
baf_results <- gtc_data %>%
    select(Sample_ID, baf, abgeno) %>%
    group_by(Sample_ID) %>%
    # group_modify() operates on an in-memory data frame, so materialise here.
    collect() %>%
    group_modify(function(sample_calls, sample_key) {
        testsamplecontamination(sample_calls$baf, sample_calls$abgeno, popmaf$maf)
    }) %>%
    # pivot_wider() replaces the superseded spread(); names_sort = TRUE keeps
    # spread()'s sorted column order. The result stays grouped by Sample_ID.
    tidyr::pivot_wider(names_from = names, values_from = fit, names_sort = TRUE)

# Analysis

## Import gencalls

In [25]:
# Load the analysis helpers (import.gencalls() is presumably defined in this
# file; it is not defined anywhere in the notebook itself).
source("analysis-utils.R")

# QC array annotation files stored in the same bucket as the genotype data.
strand_report_file_path <- glue(
    "s3://{GTC_BUCKET}/InfiniumQCArray-24v1-0_A3_StrandReport_FDT.txt"
)
coordinates_file_path <- glue(
    "s3://{GTC_BUCKET}/InfiniumQCArray-24v1-0_A3_Physical-and-Genetic-Coordinates.txt"
)

# Handle to this batch's Parquet calls.
gtc_data <- open_dataset(glue("s3://{GTC_BUCKET}/parquet/{BATCH_NAME}/"))

# Build the genotypes object, then display its summary and its first rows.
raw_data <- import.gencalls(gtc_data, coordinates_file_path, strand_report_file_path)
summary(raw_data)
head(raw_data)

“Setting row names on a tibble is deprecated.”
“Setting row names on a tibble is deprecated.”


--- raw_data ---
A genotypes object with 15949 sites x 72 samples
Allele encoding: native 
Intensity data: yes (raw) 
Sample metadata: yes
Filters set: 0 sites / 0 samples 
Checksum: f6f64711a43d81e42b6abc8a943fccd6 
Genotypes matrix:
                CMB00 CMB00 CMB00 CMB00 CMB00 CMB00 CMB00 CMB00 CMB00 CPT02 
 2010-08-Y-1111     A     A     A     A     A     A     A     A     A     A 
 2010-08-Y-1221     A     A     A     A     A     A     A     A     A     A 
 2010-08-Y-1995     A     A     A     A     A     A     A     A     A     A 
 2010-08-Y-2045     G     G     G     G     G     G     G     G     G     G 
 2010-08-Y-3042     T     T     T     T     T     T     T     T     T     T 
 2010-08-Y-3189     A     A     A     A     A     A     A     A     A     A 
 2010-08-Y-3314     T     T     T     T     T     T     T     T     T     T 
 2010-08-Y-3348     T     N     T     T     T     N     T     N     T     T 
 2010-08-Y-3576     T     T     T     T     T     T     T     T     T   

## QC .idat

In [None]:
# ...

## Import .gds file

In [None]:
# ...