# Generate QC Report 
Author: Liam T McKay (liammckay19)

This notebook compiles a json file with this information about a sample:
    - # of uniquely mapped reads (UM)
    - # of uniquely mapped non-duplicate reads (UMND)
    - # of uniquely mapped exonic non-duplicate reads (UMEND)
    - # total reads
    - % UMEND 
    - % Duplicates
    - RIN score
    - the 95th percentile of gene expression
    - # expressed genes
    - # pan-cancer upoutliers
### Input: 
Notebook 1.json

Notebook 4.0.json

Log.final.out, fastqc files 

### Output: (Saves to json file called 4.25.json)


`Date`

`Threshold Statement`

`Sample_id`

`Read count table`

>`sample_id `

>`Total_sequences`

>`UM`

>`UMND`

>`UMEND `

>`Multimapped_reads`
   
 `UMEND reads/Total reads`
>`Measure, Result, Reference_range, In_reference_range?, n_samples`

`Duplicate reads/Total reads`
>`Measure, Result, Reference_range, In_reference_range?, n_samples`

`RIN`
>`Measure, Result, Reference_range, In_reference_range?, n_samples`

`Expressed genes (*1000)`
>`Measure, Result, Reference_range, In_reference_range?, n_samples`

`Pan-cancer up outliers`
>`Measure, Result, Reference_range, In_reference_range?, n_samples`

`95th percentile of genes in sample (log2(TPM)+1)`
>`Measure, Result, Reference_range, In_reference_range?, n_samples`

In [None]:
# global parameters
# Minimum UMEND read count: the sample's UMEND count is compared against this
# value when the threshold statement is generated, and the value itself is
# written to the report under "UMEND_threshold".
umendThreshold = 1e7

In [None]:
# This cell loads the configuration and contains some helper functions.
# Please don't edit it.
import os
import glob
import json
import logging
import sys
import numpy as np
import datetime
from collections import OrderedDict

# Parse ./conf.json into `c`; the rest of the notebook reads its settings
# (sample id, reference file paths, output json path) from this dict.
with open("./conf.json","r") as conf:
    c=json.load(conf)
sample_id = c["sample_id"]    
print("Running on sample: {}".format(sample_id))
# The logging configuration is taken from conf.json and passed as keyword arguments.
logging.basicConfig(**c["info"]["logging_config"])
logging.info("4.25: Generate QC Report")

def and_log(s):
    """Record ``s`` in the log at INFO level and hand the same value back.

    Returning the argument lets a caller print and log a message in one expression.
    """
    logging.log(logging.INFO, s)
    return s

# NOTE(review): `j` is not referenced anywhere else in the visible notebook --
# presumably a leftover from the notebook template; confirm before removing.
j = {}

def load_notebook_output(notebook_num):
    """Load the JSON output written by an earlier notebook.

    :param notebook_num: identifier of the earlier notebook; the file read is
                         ``./<notebook_num>.json`` relative to the working directory
    :return: the parsed JSON content, or an empty dict when the file cannot be opened
    """
    outputfile = "./{}.json".format(notebook_num)
    try:
        with open(outputfile, "r") as f:
            return json.load(f)
    except IOError:
        message = "Error! Couldn't find output of previous notebook at {}".format(outputfile)
        # Record the miss in the log too, so it is not lost when stdout is discarded.
        logging.error(message)
        print(message)
        return {}
# Parenthesised print: valid on Python 3 and, for a single argument, prints the
# same text on Python 2 (the bare `print x` statement is a SyntaxError on Python 3).
print(and_log("QC Report Results\n-------------------------"))
# What is the name of your notebook? Fill this out. For example, for "Template.ipynb", write "Template".
notebook_name = "4.25_qc-reporting"
sample_name = c["sample_id"]

### Requires the 1.json and 4.0.json files written by the earlier notebooks

In [None]:
# Outputs of the earlier notebooks: ./1.json supplies the rsem results (used for the
# number of expressed genes) and ./4.0.json the outlier results (used for the
# pan-cancer up outliers).
nbOne_rsemResults, nbFour_outlierResults = (load_notebook_output(num) for num in (1, 4.0))

In [None]:
# get redCap file path and QC_reference ranges path
redCap_RIN_score_path_and_filename = c["ref_file"]["redcap_download_rinscore"]
reference_ranges_path = c["ref_file"]["reference_ranges"]
# Either path may be empty/None in conf.json; that is only reported here, the
# cells that read the files decide how to fall back.
if not redCap_RIN_score_path_and_filename:
    print(and_log("QC Report Notice: no redCap_RIN_score_path_and_filename in conf.json"))
if not reference_ranges_path:
    print(and_log("QC Report Notice: no reference_ranges_path in conf.json"))

In [None]:
def findData(filePath, beginningSearchString, endSearchString):
    """
    Return the text found between two marker strings in a file.

    The whole file is read as one string. The end marker is searched for only
    after the end of the start marker, so an earlier occurrence of the end
    marker cannot produce an empty or inverted slice.

    :param filePath: path of the text/html file to search
    :param beginningSearchString: marker that immediately precedes the wanted text
    :param endSearchString: marker that immediately follows the wanted text
    :return: the text between the markers, or "" when either marker is missing
    :raises IOError: when the file cannot be opened
    """
    with open(filePath, "r") as f:
        text = f.read()

    marker_at = text.find(beginningSearchString)
    if marker_at == -1:
        # str.find signals "not found" with -1; slicing from there would return garbage
        return ""
    start = marker_at + len(beginningSearchString)

    end = text.find(endSearchString, start)
    if end == -1:
        return ""
    return text[start:end]

In [None]:
def findFilePaths(newest=True):
    """
    Locate the FastQC report and the STAR Log.final.out for qc reporting.

    Both files are searched with a glob below c['dir']['secondary'], inside any
    directory matching "ucsc_cgl-rnaseq-cgl-pipeline-*".

    :param newest: when True and several directories match, return the path that sorts
                   highest; digit runs are compared numerically so that e.g. 3.10 ranks
                   above 3.9
    :return: tuple (fastqc html path, Log.final.out path); a path is "" when nothing matched
    """
    import re  # local import: only needed for the version-aware sort key

    def _version_key(path):
        # re.split with a capturing group alternates text / digit runs, so the odd
        # positions are always the numbers and can be compared as ints.
        parts = re.split(r"(\d+)", path)
        return [int(part) if index % 2 else part for index, part in enumerate(parts)]

    # secondary paths
    cgl_search = "/ucsc_cgl-rnaseq-cgl-pipeline-*/"

    fastqcPath = c['dir']['secondary'] + cgl_search + "QC/fastQC/R1_fastqc.html"
    logfinaloutPath = c['dir']['secondary'] + cgl_search + "QC/STAR/Log.final.out"

    fastqc = glob.glob(fastqcPath)
    logfinalout = glob.glob(logfinaloutPath)

    # sort globbed paths by highest version numbers first
    if newest:
        fastqc.sort(key=_version_key, reverse=True)
        logfinalout.sort(key=_version_key, reverse=True)

    if fastqc:
        fastqc_Found = fastqc[0]
    else:
        print(and_log("QC Report Notice: R1_fastqc.html file not found. Read from "+ fastqcPath))
        fastqc_Found = ""

    if logfinalout:
        logfinalout_Found = logfinalout[0]
    else:
        print(and_log("QC Report Notice: Log.final.out file not found. Read from "+logfinaloutPath))
        logfinalout_Found = ""

    return fastqc_Found, logfinalout_Found

In [None]:
# get read totals for: total sequences, uniquely mapped, multi-mapped reads
#
# Each value falls back to the string "NA" when it cannot be read. Two failures
# are expected: IOError when the file is missing/unreadable, and ValueError when
# the markers are not in the file, so the extracted text is not a number.

# find file paths for the fastqc html report and Log.final.out
fastqcHtmlPath, logFinalPath = findFilePaths(newest=True)

# total sequences
try:
    totalSequences = findData(fastqcHtmlPath, "Total Sequences</td><td>",
                              "</td></tr><tr><td>Sequences flagged as poor quality")
    totalSequences = float(totalSequences.rstrip())
except (IOError, ValueError):
    totalSequences = "NA"
    print(and_log("QC Report Notice: total sequences not found in "+fastqcHtmlPath+".\n total_sequences is NA"))

# uniquely mapped
try:
    uniquelyMapped = "".join(findData(logFinalPath, "Uniquely mapped reads number |",
                                  "Uniquely mapped reads %").split())  # split with no args to remove all whitespace
    uniquelyMapped = float(uniquelyMapped.rstrip())
except (IOError, ValueError):
    print(and_log("QC Report Notice: uniquely_mapped not found in "+logFinalPath+".\n uniquely_mapped is NA"))
    uniquelyMapped = "NA"

# multimapped
try:
    multiMapped = "".join(findData(logFinalPath,"Number of reads mapped to multiple loci |",
                                   "% of reads mapped to multiple loci").split())
    # split with no args to remove all whitespace
    multiMapped = float(multiMapped.rstrip())
except (IOError, ValueError):
    print(and_log("QC Report Notice: multiMapped not found in "+logFinalPath+".\n multiMapped is NA"))
    multiMapped = "NA"

In [None]:
# get UMND / UMEND read counts from the secondary QC results in conf.json and
# derive pct UMEND and pct Duplicate; anything unavailable becomes "NA"
umend_results = None
try:
    secondary_qc = c['info']['secondary_qc']
    # str(...).rstrip() removes trailing carriage returns/newlines before parsing
    uniquelyMappedNonDuplicate = float(str(secondary_qc["uniqMappedNonDupeReadCount"]).rstrip())
    umend = float(str(secondary_qc["estExonicUniqMappedNonDupeReadCount"]).rstrip())
except ValueError:
    # a count is present but not numeric (e.g. None becomes the string "None")
    print(and_log("QC Report Notice: umend_results are NA. \n\tUMND, UMEND, pct UMEND, pct Duplicate are NA"))
    uniquelyMappedNonDuplicate = 'NA'
    umend = 'NA'
except (KeyError, TypeError):
    # the secondary_qc section is missing a key, or is not a dict at all
    print(and_log("QC Report Notice: UMND, UMEND, pct UMEND, pct Duplicate are NA"))
    uniquelyMappedNonDuplicate = 'NA'
    umend = 'NA'

# The ratio is computed separately so that a missing total-sequence count only
# makes the percentage NA and does not discard valid UMND/UMEND counts.
try:
    pctUMEND = umend / totalSequences
except (TypeError, ZeroDivisionError):
    # TypeError: umend and/or totalSequences is the "NA" placeholder string
    print(and_log("QC Report Notice: UMEND and/or total sequences not available. \n\tpct UMEND is NA"))
    pctUMEND = "NA"

try:
    pctDuplicate = 1 - (float(uniquelyMappedNonDuplicate) / float(uniquelyMapped))
except (ValueError, ZeroDivisionError):
    print(and_log("4.25: Notice: uniquelyMappedNonDuplicate and/or uniquelyMapped not found. "+
                    "\n\tUMND, UMEND, pct UMEND, pct Duplicate are NA"))
    pctDuplicate = "NA"

In [None]:
# get pan cancer up outliers and 95th Percentile of tumor sample
try:
    # count the entries of [outlier_results][pc_outlier] whose value is 'pc_up'
    pcOutlierCalls = nbFour_outlierResults['outlier_results']['pc_outlier'].values()
    pcUpOutliers = sum(1 for call in pcOutlierCalls if call == u'pc_up')
except KeyError:
    print(and_log("QC Report Notice: 4.0.json doesn't have one of [outlier_results][pc_outlier] or neither keys."+
                  "\n\tPan-cancer Outliers = NA"))
    pcUpOutliers = "NA"

try:
    # list(...) is required: numpy cannot build a float array from a dict view directly
    sampleExpressionArray = np.array(list(nbFour_outlierResults['outlier_results']['sample'].values()), float)
    ninetyFifthPercentile = float(np.percentile(sampleExpressionArray,95))
except KeyError:
    print(and_log("QC Report Notice: 4.0.json doesn't have one of [outlier_results][sample] or neither keys."+
                  "\n\t95th Percentile = NA"))
    ninetyFifthPercentile = "NA"

In [None]:
# get RIN score for all samples "redcap_combined_sample_donor_data_downloaded_via_API_*.txt"
# Columns 0 and 9 of the tab-separated file are read; the row containing the
# sample name provides the score. The result is a float, or "NA" when the score
# is missing, marked n/a, or not numeric.
rinScorePath = c['ref_file']['redcap_download_rinscore']
if rinScorePath:
    try:
        # atleast_2d: a file with a single data row would otherwise load as a 1-D
        # array, and [row][1] would index a character instead of the score column
        rinData = np.atleast_2d(np.genfromtxt(rinScorePath,
                                              dtype=str, delimiter='\t', usecols=(0, 9), skip_header=1))
        sampleRow = np.where(rinData == sample_name)[0][0]
        rinScore = rinData[sampleRow][1]
        if "n/a" not in rinScore:
            rinScore = float(rinScore)
        else:
            rinScore = rinScore.upper().replace("/","")
    except (IOError, IndexError, ValueError):
        # IOError: file missing; IndexError: sample not listed; ValueError: score not numeric
        rinScore = "NA"
        print(and_log("QC Report Notice: rin score for " + str(sample_name) + " not found in " + str(rinScorePath)))
else:
    print(and_log("QC Report Notice: redCap_RIN_score_path_and_filename = None \nrin score not found"))
    rinScore = "NA"

In [None]:
# get expressed genes: number of non-zero values in tpm_hugo_norm_uniq for this
# sample, reported in thousands
try:
    geneExpressionValues = nbOne_rsemResults["tpm_hugo_norm_uniq"][sample_id].values()
    nExpressedGenes = np.count_nonzero([float(a) for a in geneExpressionValues]) / 1000.0
except KeyError:
    print(and_log("QC Report Notice: number of expressed genes not found.\n saving NA"))
    nExpressedGenes = "NA"

In [None]:
# get reference ranges
# =========== as of coding this the ordering of the measures should be ===========
# UMEND reads/Total reads
# Duplicate reads/Total reads
# RIN
# Expressed genes (*1000)
# Pan-cancer up outliers
# 95th percentile of genes in sample - (log2(tpm+1))
#
# The file is read with skip_header=0 and the report cell looks the measures up
# starting at row index 1, so row 0 is never used.
# NOTE(review): presumably row 0 is the tsv header line -- confirm against the file.

referenceRangesFile = c['ref_file']['reference_ranges']
referenceRanges = None
if referenceRangesFile:
    try:
        referenceRanges = np.genfromtxt(referenceRangesFile,
                                        dtype=[('Measure', str), ('ref_min', float), ('ref_max', float),
                                               ('n_samples', int)], delimiter='\t', skip_header=0,
                                        usecols=(0, 1, 2, 3))
    except IOError:
        referenceRanges = None

if referenceRanges is None:
    # placeholder table (7 rows x 5 columns of ones) so the report can still be built
    referenceRanges = np.array([[1]*5]*7)
    print(and_log("QC Report Notice: reference ranges not found"))

In [None]:
# generate threshold statement
umendThresInMil = str(int(umendThreshold/10**6))
try:
    umendExceedsThreshold = float(umend) > umendThreshold  # 10 million umend reads threshold
except ValueError:
    # umend is the "NA" placeholder: treat the sample as not meeting the threshold
    umendExceedsThreshold = False

if umendExceedsThreshold:
    thresholdStatement = "This sample exceeds the minimum required threshold of "+umendThresInMil+\
        " million reads."
else:
    thresholdStatement = "This sample does not meet the minimum required threshold of "+umendThresInMil+\
        " million reads."
print(and_log(thresholdStatement))

In [None]:
# Report date stamp ("Generated on <day> <month name>, <year>") - will be printed in Summary+slides
today = datetime.datetime.now()
generatedTime = "Generated on {} {}, {}".format(today.day, today.strftime("%B"), today.year)
logging.info(generatedTime)

In [None]:
def updateJsonInfo(measure = "NA", rounded_measurement="NA", measurement="NA", refRange=None,
                   refNum=0):
    """
    Organizes information about one measure of the qc_report into an ordered dictionary
    for writing to the final output json.

    :param measure: Name of the aspect of the sample that has been measured
    :param rounded_measurement: The measurement rounded for display, or "NA"
    :param measurement: The full-precision measurement, or "NA"
    :param refRange: Reference range rows, indexable as refRange[row][column] with the
                     lower bound in column 1, the upper bound in column 2 and the number
                     of samples in column 3; defaults to a 7 x 5 table of ones
    :param refNum: Row number associated with the measure in refRange
    :return: OrderedDict with the keys Measure, Result, Rounded_result, Precise_result,
             Reference_range, Rounded_reference_range, Precise_reference_lower,
             Precise_reference_upper, In_reference_range?, n_samples and asterisk
    """
    def _as_builtin(value):
        # numpy scalars expose .item(); converting them keeps the result JSON
        # serializable on Python 3, where e.g. numpy int64 is not an int subclass
        return value.item() if hasattr(value, "item") else value

    if refRange is None:
        refRange = [[1]*5]*7

    need_asterisk = False
    reference = [_as_builtin(refRange[refNum][1]), _as_builtin(refRange[refNum][2])]
    n_samples = _as_builtin(refRange[refNum][3])
    added_precision_measurement = rounded_measurement
    refYesOrNo = "No"

    # UMEND_reads/Total_reads is displayed with 3 decimals, every other measure with 2
    display_digits = 3 if measure == "UMEND_reads/Total_reads" else 2
    original_lower_rounded_bound = round(reference[0], display_digits)
    original_upper_rounded_bound = round(reference[1], display_digits)
    lower_extended_bound = original_lower_rounded_bound
    upper_extended_bound = original_upper_rounded_bound

    if "NA" in str(measurement):
        refYesOrNo = "NA"
    else:
        if reference[0] <= measurement <= reference[1]:
            refYesOrNo = "Yes"
        elif reference[0] <= measurement and measure == "RIN": # only check lower bound ref range for RIN
            refYesOrNo = "Yes"
        if rounded_measurement == original_lower_rounded_bound or rounded_measurement == original_upper_rounded_bound:
            # The rounded measurement is indistinguishable from a rounded bound:
            # flag it and add decimals until they differ (or the cap is reached).
            need_asterisk = True
            base_digits = len(str(rounded_measurement))
            i = 1
            while (
            float(added_precision_measurement) == float(lower_extended_bound) or
            float(added_precision_measurement) == float(upper_extended_bound)):
                added_precision_measurement = round(measurement, base_digits+i)
                lower_extended_bound = round(reference[0], base_digits+i)
                upper_extended_bound = round(reference[1], base_digits+i)
                i += 1
                if base_digits+i > 6:  # stop once the precision argument would exceed 6
                    break

    qcInfo = OrderedDict([
        ("Measure", measure),
        ("Result", added_precision_measurement),
        ("Rounded_result", rounded_measurement),
        ("Precise_result", measurement),
        ("Reference_range", str(lower_extended_bound) + " - " + str(upper_extended_bound)),
        ("Rounded_reference_range", str(original_lower_rounded_bound) + " - " + str(original_upper_rounded_bound)),
        ("Precise_reference_lower", reference[0]),
        ("Precise_reference_upper", reference[1]),
        ("In_reference_range?", refYesOrNo),
        ("n_samples", n_samples),
        ("asterisk", need_asterisk)]
    )

    return qcInfo

In [None]:
# create qc report
def _round_unless_na(value, ndigits):
    # "NA" placeholders are strings and are passed through untouched
    return round(value, ndigits) if "NA" not in str(value) else value


def _json_default(obj):
    # numpy scalars (e.g. an int64 n_samples) are not JSON serializable on Python 3;
    # .item() converts them to the matching builtin type
    if hasattr(obj, "item"):
        return obj.item()
    raise TypeError(repr(obj) + " is not JSON serializable")


qcReport = OrderedDict([
    ("Date",generatedTime),
    ("Threshold_Statement",thresholdStatement),
    ("UMEND_threshold",umendThreshold),
    ("Sample_id",sample_name),
    ('Read_count_table',''),
    ("UMEND_reads/Total_reads",''),
    ("Duplicate_reads/Total_reads",''),
    ("RIN",''),
    ("Expressed_genes_(*1000)",''),
    ("Pan-cancer_up_outliers",''),
    ("95th_percentile_of_genes_in_sample_(log2(TPM)+1)",'')
])

# Flag the read count table when UMEND and the threshold look identical once both
# are expressed in millions and rounded to 2 decimals (both sides must use the
# same unit, otherwise the comparison can never be true).
umendThresholdAsterisk = False
if "NA" not in str(umend):
    if round(umend/1e6,2) == round(umendThreshold/1e6,2):
        umendThresholdAsterisk = True

qcReport["Read_count_table"] = OrderedDict([
        ("sample_id", sample_name),
        ("Total_sequences", totalSequences),
        ("UM", uniquelyMapped),
        ("UMND", uniquelyMappedNonDuplicate),
        ("UMEND", umend),
        ("Multimapped_reads", multiMapped),
        ("asterisk", umendThresholdAsterisk)
    ])

# Measure name, rounded precision, full precision.
# -- "NA" values are passed through instead of being rounded
measuresForReference = [
    ("UMEND_reads/Total_reads", _round_unless_na(pctUMEND, 3), pctUMEND),
    ("Duplicate_reads/Total_reads", _round_unless_na(pctDuplicate, 2), pctDuplicate),
    ("RIN", _round_unless_na(rinScore, 2), rinScore),
    ("Expressed_genes_(*1000)", _round_unless_na(nExpressedGenes, 2), nExpressedGenes),
    ("Pan-cancer_up_outliers", pcUpOutliers, pcUpOutliers),
    ("95th_percentile_of_genes_in_sample_(log2(TPM)+1)",
     _round_unless_na(ninetyFifthPercentile, 2), ninetyFifthPercentile)
]

# Rows are numbered from 1: row 0 of the reference ranges table is not a measure.
for i, measure in enumerate(measuresForReference, 1):
    qcReport[measure[0]] = updateJsonInfo(measure=measure[0],
                                          rounded_measurement=measure[1],
                                          measurement=measure[2],
                                          refRange=referenceRanges,
                                          refNum=i)

with open(c["json"]["4.25"], 'w') as outFile:
    json.dump(qcReport, outFile, indent=2, default=_json_default)

print("Done. Output location: " + c["json"]["4.25"])
print(json.dumps(qcReport, indent=4, default=_json_default)) # prints json below