In [68]:
# Import libraries and init Matplotlib for inline graphs
%matplotlib inline

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import myvariant as mv
import numpy as np
import pyupset as pup
from IPython.display import display

# Raise pandas' display limits so wide tables are shown in full:
# up to 500 columns, rendered on lines up to 1000 characters wide.
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# Degree-of-Support Filters

First, let's make some subsets to capture the different partitions, one for each level of support in a VEP.

Singleton - 1/3 prediction

Doubleton - 2/3 prediction

Consensus - 3/3 prediction


In [4]:
# Read the first three sheets of the supplementary workbook in a single pass.
# Passing a list to sheet_name returns a dict keyed by sheet position, so the
# file is opened and parsed once instead of three times. Cells containing "-"
# are treated as missing values.
supplementary_path = "../data/Graeme Ford - Supplementary Table.xlsx"
sheets = pd.read_excel(supplementary_path, sheet_name=[0, 1, 2], na_values="-")

data1 = sheets[0]
data2 = sheets[1]
data3 = sheets[2]

# Tag every row with the gene label assigned to its sheet so the rows remain
# distinguishable after the frames are stacked.
data1["Gene"] = "CYP2A6"
data2["Gene"] = "CYP2B6"
data3["Gene"] = "UGT2B7"

# Stack the three frames; the original per-sheet row labels are kept as-is.
data = pd.concat([data1, data2, data3])

In [5]:
# One boolean flag per predictor: True where the prediction text marks the
# variant as harmful. Missing predictions count as False (na=False).
sift = data["SIFT"].str.contains("deleterious", na=False)
polyphen = data["PolyPhen"].str.contains("damaging", na=False)
condel = data["Condel"].str.contains("deleterious", na=False)

# Complements of the flags above.
notSift = ~sift
notPolyphen = ~polyphen
notCondel = ~condel

# Number of predictors (0-3) that flag each variant.
supportCount = sift.astype(int) + polyphen.astype(int) + condel.astype(int)

# Consensus: all three predictors agree.
consensusFilter = data[supportCount == 3]

# Doubletons: exactly two of the three predictors flag the variant.
Doubletons = data[supportCount == 2]

# Singletons: exactly one predictor flags the variant.
Singletons = data[supportCount == 1]


# Allele Frequency Filters

Next, let's make some filters to identify variants with allelic and clinically significant status.

Allelic Cutoff - 1% frequency

Clinically Significant Cutoff - 4% frequency

In [46]:
def clin_filter(dataset, threshold=0.04):
    """Return the rows that reach the clinically significant frequency cutoff.

    A row is kept when its value in at least one of the "AFR", "AMR", "SAS",
    "EAS" or "EUR" columns is ``>= threshold`` AND its "Allele" value does
    not contain "-" AND its "A1" value does not contain "CN".

    The population test is evaluated as one group before the exclusions are
    applied. ``&`` binds tighter than ``|``, so an unparenthesised chain of
    ``|`` comparisons followed by ``& ~...`` would apply the exclusions to
    the last comparison only.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with the columns "AFR", "AMR", "SAS", "EAS", "EUR", "Allele"
        and "A1".
    threshold : float, default 0.04
        Minimum frequency a row must reach in at least one population column.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``dataset``, in their original order.
    """
    populations = ["AFR", "AMR", "SAS", "EAS", "EUR"]

    # True where any population column meets the cutoff (NaN compares False).
    reaches_cutoff = (dataset[populations] >= threshold).any(axis=1)

    # Rows to exclude regardless of which population met the cutoff.
    dash_allele = dataset["Allele"].str.contains("-", na=False)
    cn_a1 = dataset["A1"].str.contains("CN", na=False)

    return dataset[reaches_cutoff & ~dash_allele & ~cn_a1]
def alle_filter(dataset, threshold=0.01):
    """Return the rows that reach the allelic frequency cutoff.

    A row is kept when its value in at least one of the "AFR", "AMR", "SAS",
    "EAS" or "EUR" columns is ``>= threshold`` AND its "Allele" value does
    not contain "-" AND its "A1" value does not contain "CN".

    The population test is evaluated as one group before the exclusions are
    applied. ``&`` binds tighter than ``|``, so an unparenthesised chain of
    ``|`` comparisons followed by ``& ~...`` would apply the exclusions to
    the last comparison only.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with the columns "AFR", "AMR", "SAS", "EAS", "EUR", "Allele"
        and "A1".
    threshold : float, default 0.01
        Minimum frequency a row must reach in at least one population column.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``dataset``, in their original order.
    """
    populations = ["AFR", "AMR", "SAS", "EAS", "EUR"]

    # True where any population column meets the cutoff (NaN compares False).
    reaches_cutoff = (dataset[populations] >= threshold).any(axis=1)

    # Rows to exclude regardless of which population met the cutoff.
    dash_allele = dataset["Allele"].str.contains("-", na=False)
    cn_a1 = dataset["A1"].str.contains("CN", na=False)

    return dataset[reaches_cutoff & ~dash_allele & ~cn_a1]


# Un-Conservative Summary:

Let's make a summary using the un-conservative approach (i.e. including Consensus, Doubletons AND Singletons), filtered by frequency:

In [73]:
# Pool the Consensus, Doubleton and Singleton subsets once, then show the
# rows passing each frequency filter, ordered by gene (ascending) and
# variant (descending).
supported_variants = pd.concat([consensusFilter, Doubletons, Singletons])
sort_spec = {"by": ["Gene", "Variant"], "ascending": [True, False]}

for frequency_filter in (alle_filter, clin_filter):
    display(frequency_filter(supported_variants).sort_values(**sort_spec))

Unnamed: 0,Variant,Location,Allele,Existing_variation,Consequence,Feature,SOURCE,SIFT,PolyPhen,Condel,CADD_PHRED,PHENOTYPES,A1,A2,AFR,AMR,EAS,EUR,SAS,AFR_P_EUR,AFR_OR_EUR,AFR_P_EAS,AFR_OR_EAS,AFR_P_AMR,AFR_OR_AMR,AFR_P_SAS,AFR_OR_SAS,Gene
275,rs1801272,19:40848628-40848628,T,"rs1801272,CM980517",missense_variant,NM_000762,RefSeq,deleterious(0),probably_damaging(0.988),deleterious(0.882),23.2,Nicotine__poor_metabolism_of+ClinVar+rs1801272...,T,A,0.0,0.007205,0.0,0.0338,0.006135,4.236806e-11,0.0,1.0,0.0,0.01117589,0.0,0.01415036,0.0,CYP2A6
355,rs145308399,19:40849872-40849872,T,rs145308399,missense_variant,NM_000762,RefSeq,tolerated(0.05),probably_damaging(0.92),deleterious(0.716),23.4,,T,C,0.001984,0.001441,0.000992,0.000994,0.02147,1.0,1.99734,1.0,2.001314,1.0,1.377483,2.603957e-05,0.090688,CYP2A6
249,19_41354126,19:40848221-40848221,C,rs771986786,"missense_variant,splice_region_variant",NM_000762,RefSeq,deleterious(0.05),benign(0.26),neutral(0.372),22.8,,C,G,0.07143,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,CYP2A6
64,19_41350611,19:40844706-40844706,A,rs371553133,missense_variant,NM_000762,RefSeq,deleterious(0.02),benign(0.363),deleterious(0.498),14.01,,A,C,0.0625,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,CYP2A6
9,rs8192709,19:40991369-40991369,T,rs8192709,missense_variant,NM_000767,RefSeq,deleterious(0.03),benign(0.265),neutral(0.392),9.184,,T,C,0.04537,0.04323,0.04663,0.06262,0.04397,0.09543397,0.711455,0.9161876,0.971696,0.9055165,1.051813,0.9143431,1.033334,CYP2B6
14,rs34284776,19:40991391-40991391,C,rs34284776,missense_variant,NM_000767,RefSeq,deleterious(0),benign(0.174),neutral(0.461),5.068,,C,G,0.01389,0.002882,0.0,0.0,0.0,0.0001166525,,0.0001166472,,0.02110907,4.869827,0.0001190254,,CYP2B6
13,rs33926104,19:40991390-40991390,A,rs33926104,missense_variant,NM_000767,RefSeq,deleterious(0.02),benign(0.03),neutral(0.385),9.768,,A,C,0.01389,0.002882,0.0,0.0,0.0,0.0001166525,,0.0001166472,,0.02110907,4.869827,0.0001190254,,CYP2B6
960,rs28399499,19:41012316-41012316,C,"rs28399499,CM066043",missense_variant,NM_000767,RefSeq,deleterious(0),probably_damaging(0.995),deleterious(0.902),28.7,efavirenz_response_-_Metabolism/PK+ClinVar+rs2...,C,T,0.0865,0.01009,0.0,0.0,0.0,4.187095e-28,,3.8363780000000002e-28,,9.231537e-14,9.287742,1.631326e-27,,CYP2B6


Unnamed: 0,Variant,Location,Allele,Existing_variation,Consequence,Feature,SOURCE,SIFT,PolyPhen,Condel,CADD_PHRED,PHENOTYPES,A1,A2,AFR,AMR,EAS,EUR,SAS,AFR_P_EUR,AFR_OR_EUR,AFR_P_EAS,AFR_OR_EAS,AFR_P_AMR,AFR_OR_AMR,AFR_P_SAS,AFR_OR_SAS,Gene
249,19_41354126,19:40848221-40848221,C,rs771986786,"missense_variant,splice_region_variant",NM_000762,RefSeq,deleterious(0.05),benign(0.26),neutral(0.372),22.8,,C,G,0.07143,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,CYP2A6
64,19_41350611,19:40844706-40844706,A,rs371553133,missense_variant,NM_000762,RefSeq,deleterious(0.02),benign(0.363),deleterious(0.498),14.01,,A,C,0.0625,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,CYP2A6
9,rs8192709,19:40991369-40991369,T,rs8192709,missense_variant,NM_000767,RefSeq,deleterious(0.03),benign(0.265),neutral(0.392),9.184,,T,C,0.04537,0.04323,0.04663,0.06262,0.04397,0.09543397,0.711455,0.9161876,0.971696,0.9055165,1.051813,0.9143431,1.033334,CYP2B6
960,rs28399499,19:41012316-41012316,C,"rs28399499,CM066043",missense_variant,NM_000767,RefSeq,deleterious(0),probably_damaging(0.995),deleterious(0.902),28.7,efavirenz_response_-_Metabolism/PK+ClinVar+rs2...,C,T,0.0865,0.01009,0.0,0.0,0.0,4.187095e-28,,3.8363780000000002e-28,,9.231537e-14,9.287742,1.631326e-27,,CYP2B6


In [63]:

# Show every variant flagged by at least one predictor. The third frame is
# `Singletons`; the previous name `Sing` is not defined in this notebook and
# raises NameError on a fresh kernel.
pd.concat([consensusFilter, Doubletons, Singletons])

Unnamed: 0,Variant,Location,Allele,Existing_variation,Consequence,Feature,SOURCE,SIFT,PolyPhen,Condel,...,SAS,AFR_P_EUR,AFR_OR_EUR,AFR_P_EAS,AFR_OR_EAS,AFR_P_AMR,AFR_OR_AMR,AFR_P_SAS,AFR_OR_SAS,Gene
20,rs561053481,19:40843858-40843858,A,rs561053481,missense_variant,NM_000762,RefSeq,deleterious(0.01),probably_damaging(0.99),deleterious(0.847),...,0.002045,1.000000,0.0,1.000000,0.0,1.000000,0.0,0.242378,0.0,CYP2A6
151,rs58571639,19:40845998-40845998,A,rs58571639,missense_variant,NM_000762,RefSeq,deleterious(0),possibly_damaging(0.716),deleterious(0.704),...,0.000000,1.000000,,1.000000,1.0,1.000000,,1.000000,,CYP2A6
152,rs528089983,19:40846003-40846003,A,rs528089983,missense_variant,NM_000762,RefSeq,deleterious(0),probably_damaging(0.997),deleterious(0.911),...,0.000000,0.499752,,0.499752,,0.516734,,0.499862,,CYP2A6
158,rs201027514,19:40846078-40846078,A,"rs201027514,COSM712827",missense_variant,NM_000762,RefSeq,deleterious(0.01),possibly_damaging(0.539),deleterious(0.588),...,0.000000,0.499503,0.0,1.000000,0.0,1.000000,0.0,1.000000,0.0,CYP2A6
197,rs560382680,19:40846897-40846897,A,"rs560382680,COSM3797108",missense_variant,NM_000762,RefSeq,deleterious(0.01),possibly_damaging(0.907),deleterious(0.766),...,0.001022,1.000000,0.0,1.000000,0.0,1.000000,0.0,0.492447,0.0,CYP2A6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
967,rs553658831,19:41012355-41012355,G,"rs553658831,COSM4581070",missense_variant,NM_000767,RefSeq,deleterious(0.01),benign(0),neutral(0.406),...,0.000000,1.000000,0.0,1.000000,0.0,0.407756,0.0,1.000000,0.0,CYP2B6
984,rs530338921,19:41012689-41012689,A,rs530338921,missense_variant,NM_000767,RefSeq,deleterious(0.03),benign(0.014),neutral(0.365),...,0.001022,1.000000,0.0,1.000000,0.0,1.000000,0.0,0.492447,0.0,CYP2B6
1193,rs559298344,19:41016673-41016673,C,rs559298344,missense_variant,NM_000767,RefSeq,deleterious(0),benign(0.161),neutral(0.459),...,0.000000,1.000000,0.0,1.000000,0.0,0.407756,0.0,1.000000,0.0,CYP2B6
1197,rs368451099,19:41016682-41016682,T,"rs368451099,COSM4078403,COSM6084934",missense_variant,NM_000767,RefSeq,deleterious(0.01),benign(0.024),neutral(0.407),...,0.002045,1.000000,0.0,1.000000,0.0,1.000000,0.0,0.242378,0.0,CYP2B6


In [61]:
# Print a plain-text label, then show the Doubletons frame as the cell's
# result (the bare last expression is rendered as the cell output).
print("Doubletons")
Doubletons


Doubletons


Unnamed: 0,Variant,Location,Allele,Existing_variation,Consequence,Feature,SOURCE,SIFT,PolyPhen,Condel,...,SAS,AFR_P_EUR,AFR_OR_EUR,AFR_P_EAS,AFR_OR_EAS,AFR_P_AMR,AFR_OR_AMR,AFR_P_SAS,AFR_OR_SAS,Gene
64,19_41350611,19:40844706-40844706,A,rs371553133,missense_variant,NM_000762,RefSeq,deleterious(0.02),benign(0.363),deleterious(0.498),...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,CYP2A6
120,rs137904044,19:40845465-40845465,A,rs137904044,missense_variant,NM_000762,RefSeq,deleterious(0),benign(0.294),deleterious(0.535),...,0.0,1.0,,1.0,,1.0,,1.0,,CYP2A6
284,rs4986891,19:40848724-40848724,A,"rs4986891,CM015150",missense_variant,NM_000762,RefSeq,deleterious(0),benign(0.34),deleterious(0.552),...,0.0,0.06219,,0.06219,,0.083744,,0.062328,,CYP2A6
355,rs145308399,19:40849872-40849872,T,rs145308399,missense_variant,NM_000762,RefSeq,tolerated(0.05),probably_damaging(0.92),deleterious(0.716),...,0.02147,1.0,1.99734,1.0,2.001314,1.0,1.377483,2.6e-05,0.090688,CYP2A6
781,rs563079673,19:41009248-41009248,T,rs563079673,missense_variant,NM_000767,RefSeq,tolerated(0.11),probably_damaging(0.961),deleterious(0.675),...,0.0,1.0,0.0,0.499752,0.0,1.0,0.0,1.0,0.0,CYP2B6
962,rs567219326,19:41012340-41012340,C,"rs567219326,COSM1202848",missense_variant,NM_000767,RefSeq,deleterious(0),benign(0.24),deleterious(0.469),...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,CYP2B6


In [7]:
# Print a plain-text label, then show the Singletons frame as the cell's
# result (the bare last expression is rendered as the cell output).
print("Singletons")
Singletons


Singletons


Unnamed: 0,Variant,Location,Allele,Existing_variation,Consequence,Feature,SOURCE,SIFT,PolyPhen,Condel,...,SAS,AFR_P_EUR,AFR_OR_EUR,AFR_P_EAS,AFR_OR_EAS,AFR_P_AMR,AFR_OR_AMR,AFR_P_SAS,AFR_OR_SAS,Gene
112,rs114558780,19:40845322-40845322,A,rs114558780,missense_variant,NM_000762,RefSeq,tolerated(0.17),possibly_damaging(0.759),neutral(0.340),...,0.0,0.007704,,0.007704,,0.024468,,0.007752,,CYP2A6
200,rs58720852,19:40846933-40846933,T,rs58720852,missense_variant,NM_000762,RefSeq,deleterious(0),benign(0.178),neutral(0.461),...,0.0,1.0,,1.0,,1.0,,1.0,,CYP2A6
206,rs539734599,19:40846996-40846996,C,rs539734599,missense_variant,NM_000762,RefSeq,deleterious(0.03),benign(0.017),neutral(0.365),...,0.0,0.499752,,0.499752,,0.516734,,0.499862,,CYP2A6
249,19_41354126,19:40848221-40848221,C,rs771986786,"missense_variant,splice_region_variant",NM_000762,RefSeq,deleterious(0.05),benign(0.26),neutral(0.372),...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,CYP2A6
261,rs150091942,19:40848366-40848366,C,rs150091942,missense_variant,NM_000762,RefSeq,deleterious(0.05),benign(0.069),neutral(0.350),...,0.001022,0.499503,0.0,0.06219,0.0,1.0,0.0,0.492447,0.0,CYP2A6
279,rs200793736,19:40848671-40848671,T,rs200793736,missense_variant,NM_000762,RefSeq,deleterious(0.01),benign(0.146),neutral(0.418),...,0.0,0.499503,0.0,1.0,0.0,1.0,0.0,1.0,0.0,CYP2A6
9,rs8192709,19:40991369-40991369,T,rs8192709,missense_variant,NM_000767,RefSeq,deleterious(0.03),benign(0.265),neutral(0.392),...,0.04397,0.095434,0.711455,0.916188,0.971696,0.905517,1.051813,0.914343,1.033334,CYP2B6
13,rs33926104,19:40991390-40991390,A,rs33926104,missense_variant,NM_000767,RefSeq,deleterious(0.02),benign(0.03),neutral(0.385),...,0.0,0.000117,,0.000117,,0.021109,4.869827,0.000119,,CYP2B6
14,rs34284776,19:40991391-40991391,C,rs34284776,missense_variant,NM_000767,RefSeq,deleterious(0),benign(0.174),neutral(0.461),...,0.0,0.000117,,0.000117,,0.021109,4.869827,0.000119,,CYP2B6
520,rs572134005,19:41004083-41004083,A,"rs572134005,COSM566302",missense_variant,NM_000767,RefSeq,deleterious(0.01),benign(0.2),neutral(0.425),...,0.0,1.0,,1.0,,1.0,,1.0,,CYP2B6


In [76]:
# Keep only the rows whose PHENOTYPES cell is not missing, then show them.
phenoFilter = data.loc[pd.notna(data["PHENOTYPES"])]
phenoFilter


Unnamed: 0,Variant,Location,Allele,Existing_variation,Consequence,Feature,SOURCE,SIFT,PolyPhen,Condel,...,SAS,AFR_P_EUR,AFR_OR_EUR,AFR_P_EAS,AFR_OR_EAS,AFR_P_AMR,AFR_OR_AMR,AFR_P_SAS,AFR_OR_SAS,Gene
2,rs569152950,19:40843552-40843552,A,rs569152950,3_prime_UTR_variant,NM_000762,RefSeq,,,,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,CYP2A6
8,rs8192733,19:40843645-40843645,C,rs8192733,3_prime_UTR_variant,NM_000762,RefSeq,,,,...,0.4366,1.2630980000000001e-27,0.367042,3.499269e-39,0.301859,8.318859e-30,0.317022,1.413518e-13,0.500667,CYP2A6
62,rs2002977,19:40844689-40844689,A,rs2002977,synonymous_variant,NM_000762,RefSeq,,,,...,0.06033,0.01231963,1.464811,5.512373e-13,3.708542,1.299872e-12,4.738528,7.611925e-06,2.07839,CYP2A6
124,rs148057229,19:40845524-40845524,C,rs148057229,intron_variant,NM_000762,RefSeq,,,,...,0.0,1.0,0.0,0.4997519,0.0,1.0,0.0,1.0,0.0,CYP2A6
186,rs575865729,19:40846712-40846712,G,rs575865729,intron_variant,NM_000762,RefSeq,,,,...,0.0,1.0,0.0,1.0,0.0,0.4077556,0.0,1.0,0.0,CYP2A6
210,rs111033610,19:40847036-40847036,G,"rs28399447,CM024238",missense_variant,NM_000762,RefSeq,tolerated(1),benign(0.003),neutral(0.000),...,0.0,1.0,0.0,0.007704343,0.0,1.0,0.0,1.0,0.0,CYP2A6
216,rs56113850,19:40847202-40847202,C,rs56113850,intron_variant,NM_000762,RefSeq,,,,...,0.4243,2.7014050000000003e-29,0.364155,0.8164854,1.025743,9.729662e-37,0.279911,0.0003077353,0.717881,CYP2A6
228,rs56267346,19:40847433-40847433,G,rs56267346,intron_variant,NM_000762,RefSeq,,,,...,0.2209,1.860973e-35,3.468634,3.343963e-13,1.980822,5.8349020000000006e-33,3.904372,9.748669e-23,2.595114,CYP2A6
248,esv3644362;esv3644363,19:40848147-40848147,sequence_variant,,intron_variant,NM_000762,RefSeq,,,,...,0.04855,0.3056694,1.342912,1.750149e-11,0.291882,1.0,0.99472,0.2170283,0.739579,CYP2A6
272,rs28399442,19:40848553-40848553,A,rs28399442,intron_variant,NM_000762,RefSeq,,,,...,0.01022,4.420266e-06,0.045888,1.0,,6.278987e-05,0.051273,0.005252366,0.094709,CYP2A6


# Save results

We have done our filtering. Now we need to save the results for later reference.

In [67]:
# Write each result subset to its own sheet of a single workbook.
# The context manager saves and closes the file when the block exits, even if
# one of the writes fails. It replaces the explicit writer.save() call, which
# was removed in pandas 2.0.
# NOTE(review): this cell previously contained a bare `phenoConsensus`
# expression. That name is not defined in the visible notebook (NameError on a
# fresh kernel) and the expression wrote nothing, so it has been dropped. If a
# phenotype/consensus sheet is wanted, define the frame and add a to_excel
# call here.
with pd.ExcelWriter('../Results/VEP.xlsx', engine='xlsxwriter') as writer:
    consensusFilter.to_excel(writer, sheet_name="Consensus")
    Singletons.to_excel(writer, sheet_name="Singletons")
    Doubletons.to_excel(writer, sheet_name="Doubletons")
    phenoFilter.to_excel(writer, sheet_name="Phenotype Matches")