# Diagnostic Genome Analysis

## Importing data from pipeline output

In [2]:
import pandas
import numpy

annovar_output = snakemake.input[2]

columns_of_interest = ["Chr", "Start", "End", "Ref", "Alt", 
                       "Func.refGene", "Gene.refGene", "avsnp138", 
                       "SIFT_score", "Polyphen2_HDIV_score", "Polyphen2_HDIV_pred",
                       "CLNDN", "CLNDISDB", "CLNSIG", "CLNSIG"]

datatypes = {'SIFT_score': numpy.float64, 
             'Polyphen2_HDIV_score': numpy.float64}

annovar = pandas.read_csv(annovar_output, sep="\t", na_values = '.', 
                          usecols=columns_of_interest, dtype=datatypes) 

annovar.head()
print(snakemake.input)
print(snakemake)

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,avsnp138,SIFT_score,Polyphen2_HDIV_score,Polyphen2_HDIV_pred,CLNDN,CLNDISDB,CLNSIG
0,chr2,857708,857708,A,C,ncRNA_intronic,LINC01115,rs7587603,,,,,,
1,chr2,857713,857713,C,T,ncRNA_intronic,LINC01115,rs12714428,,,,,,
2,chr2,9831131,9831131,A,G,intergenic,YWHAQ;TAF1B,rs6432031,,,,,,
3,chr2,25655772,25655772,T,C,exonic,DTNB,rs7583475,,,,,,
4,chr2,29529562,29529562,C,G,intronic,ALK,rs7584501,,,,,,


## Data exploration

### Significant variants (based on SIFT and Polyphen2 HDIV scores)

In [3]:
annovar = annovar.sort_values(by=['SIFT_score', 'Polyphen2_HDIV_score'], ascending = False)
annovar.head(25)

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,avsnp138,SIFT_score,Polyphen2_HDIV_score,Polyphen2_HDIV_pred,CLNDN,CLNDISDB,CLNSIG
844,chr18,28650748,28650748,A,C,exonic,DSC2,rs151024019,1.0,0.004,B,Arrhythmogenic_right_ventricular_cardiomyopath...,"MedGen:C0349788,Orphanet:ORPHA247,SNOMED_CT:25...",Conflicting_interpretations_of_pathogenicity
280,chr6,112457390,112457390,C,T,exonic,LAMA4,rs2032567,1.0,0.001,B,Dilated_cardiomyopathy_1JJ|not_specified,"MedGen:C3808935,OMIM:615235|MedGen:CN169374",Benign
38,chr2,130832358,130832358,T,C,exonic,POTEF,rs201946437,1.0,0.0,B,,,
136,chr3,14175262,14175262,T,C,exonic,TMEM43,rs2340917,1.0,0.0,B,Arrhythmogenic_right_ventricular_cardiomyopath...,"MedGen:C0349788,Orphanet:ORPHA247,SNOMED_CT:25...",Benign/Likely_benign
166,chr3,38739574,38739574,T,C,exonic,SCN10A,rs6599241,1.0,0.0,B,not_specified,MedGen:CN169374,Benign
641,chr14,23861811,23861811,A,G,exonic,MYH6,rs365990,1.0,0.0,B,Atrial_septal_defect|Hypertrophic_cardiomyopat...,"Human_Phenotype_Ontology:HP:0001631,MedGen:C00...",Benign/Likely_benign
789,chr17,62020348,62020348,T,C,exonic,SCN4A,rs2058194,1.0,0.0,B,Paramyotonia_congenita_of_von_Eulenburg|Hypoka...,"MedGen:C0221055,OMIM:168300|MedGen:C0238358,Or...",Benign
879,chr20,33583331,33583331,A,G,exonic,MYH7B,rs2425015,1.0,0.0,B,,,
41,chr2,131221102,131221102,T,C,exonic,POTEI,rs139258542,1.0,,,,,
568,chr10,88466442,88466442,A,G,exonic,LDB3,rs138251566,0.95,0.0,B,Primary_dilated_cardiomyopathy|Myofibrillar_my...,"EFO:EFO_0000407,Human_Phenotype_Ontology:HP:00...",Conflicting_interpretations_of_pathogenicity
