# Set up

In [1]:
# use with bioconda kernel

In [2]:
import pandas as pd
import urllib.request
import os

In [3]:
! cd /home/jovyan/data/
! pwd

/home/jovyan/data


# Import a list of all human TFs

In [4]:
tf_df=pd.read_csv('http://humantfs.ccbr.utoronto.ca/download/v_1.01/DatabaseExtract_v_1.01.csv',index_col='HGNC symbol') # import a df of all human TFs
tf_df=tf_df.drop(['Unnamed: 0'],axis=1) # tidy
tf_list=tf_df.index.tolist() # convert gene symbol to list

# Define a list of phenotypes and search terms for those

In [5]:
# Using the list of phenotypes (see notebook pt1), make a dictionary grouping them. Have remoced partial strings in case these are inappropraitely matched
phenotype_dict={
    'Heterotaxy':['Heterotaxy','Dextrocardia'],
    'ASD':['ASD','Atrial septal defect','Patent foramen ovale'],
    'VSD':['VSD','Ventricular septal defect'],
    'AVSD':['AVSD','Atrioventricular septal defect'],
    'Malformation of outflow tracts':['Truncus arteriosus','Malformation of outflow tracts','Fallot','fallot','Double outlet right ventricle','Pentalogy of fallot','Transposition of the great arteries','Transposition of the great vessels',],
    'Functional single ventricle':['Functional single ventricle','Double-inlet left ventricle','Hypoplastic left heart syndrome','Hypoplastic right ventricle','Hypoplastic rv'],
#    'Obstructive lesions':['Obstructive lesions','alv','stenosis'], # removed because I don't know how to define this term
    'Cardiomyopathy':['Cardiomyopathy'],
    'DCM':['DCM','Dilated cardiomyopathy'],
    'HCM':['HCM','Hypertrophic cardiomyopathy'],
    'LVNC':['LVNC','Left ventricular noncompaction'],
    'Venous anomaly':['Total anomalous pulmonary venous return','Pulmonary vein stenosis','Partial anomalous pulmonary venous return','Persistent left superior vena cava'],
    'Pulmonary artery':['Dilatation of the pulmonary trunk','Hypoplastic left pulmonary artery','Pulmonary artery dysplasia and hypoplasia','Pulmonary atresia','Pulmonary trunk and pulmonary artery absence'],
    'Aortic arch':['Right aortic arch','Patent ductus arteriosus','Ductus arteriosus','Coarctation of the aorta','Interrupted aortic arch','Lsvc and left pulmonary artery arising from the ductus arteriosus'],
    'Aorta':['Aberrant supraclavicular artery','Anomalous right subclavian artery','Aortic aneurysm','Aortic hypoplasia'],
    'Coronary':['Anomalous left coronary artery from the pulmonary artery','Coronary artery anomaly'],
    'VA valves':['Pulmonic stenosis','Absent pulmonary valve','Bicuspid aortic valve', 'Aortic stenosis','Dysplastic pulmonary valve','Pulmonary stenosis'],
    'AV valves':['Dysplastic mitral valve','Dysplastic tricuspid valve',"Ebstein's anomaly",'Mitral atresia','Mitral stenosis','Mitral valve dysplasia','Tricuspid atresia','Tricuspid stenosis','Tricuspid valve atresia','Tricuspid valve dysplasia']
}

# Download the (weekly updating) ClinVar database from the FTP server

# Read in the database

In [6]:
clinvar_summary_df=pd.read_csv('/home/jovyan/data/variant_summary.txt',sep='\t')
clinvar_summary_df.shape

  clinvar_summary_df=pd.read_csv('/home/jovyan/data/variant_summary.txt',sep='\t')


(3093519, 34)

In [7]:
clinvar_summary_df.columns

Index(['#AlleleID', 'Type', 'Name', 'GeneID', 'GeneSymbol', 'HGNC_ID',
       'ClinicalSignificance', 'ClinSigSimple', 'LastEvaluated', 'RS# (dbSNP)',
       'nsv/esv (dbVar)', 'RCVaccession', 'PhenotypeIDS', 'PhenotypeList',
       'Origin', 'OriginSimple', 'Assembly', 'ChromosomeAccession',
       'Chromosome', 'Start', 'Stop', 'ReferenceAllele', 'AlternateAllele',
       'Cytogenetic', 'ReviewStatus', 'NumberSubmitters', 'Guidelines',
       'TestedInGTR', 'OtherIDs', 'SubmitterCategories', 'VariationID',
       'PositionVCF', 'ReferenceAlleleVCF', 'AlternateAlleleVCF'],
      dtype='object')

In [8]:
clinvar_summary_df.head(3)

Unnamed: 0,#AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),...,ReviewStatus,NumberSubmitters,Guidelines,TestedInGTR,OtherIDs,SubmitterCategories,VariationID,PositionVCF,ReferenceAlleleVCF,AlternateAlleleVCF
0,15041,Indel,NM_014855.3(AP5Z1):c.80_83delinsTGCTGTAAACTGTA...,9907,AP5Z1,HGNC:22197,Pathogenic,1,-,397704705,...,"criteria provided, single submitter",2,-,N,"ClinGen:CA215070,OMIM:613653.0001",3,2,4820844,GGAT,TGCTGTAAACTGTAACTGTAAA
1,15041,Indel,NM_014855.3(AP5Z1):c.80_83delinsTGCTGTAAACTGTA...,9907,AP5Z1,HGNC:22197,Pathogenic,1,-,397704705,...,"criteria provided, single submitter",2,-,N,"ClinGen:CA215070,OMIM:613653.0001",3,2,4781213,GGAT,TGCTGTAAACTGTAACTGTAAA
2,15042,Deletion,NM_014855.3(AP5Z1):c.1413_1426del (p.Leu473fs),9907,AP5Z1,HGNC:22197,Pathogenic,1,"Jun 29, 2010",397704709,...,no assertion criteria provided,1,-,N,"ClinGen:CA215072,OMIM:613653.0002",1,3,4827360,GCTGCTGGACCTGCC,G


# Filter the database

- Pathogenic or Likely Pathogenic
- Review status >1 submitter or expert
- Last reviewed since 2016

In [9]:
clinvar_summary_df.columns

Index(['#AlleleID', 'Type', 'Name', 'GeneID', 'GeneSymbol', 'HGNC_ID',
       'ClinicalSignificance', 'ClinSigSimple', 'LastEvaluated', 'RS# (dbSNP)',
       'nsv/esv (dbVar)', 'RCVaccession', 'PhenotypeIDS', 'PhenotypeList',
       'Origin', 'OriginSimple', 'Assembly', 'ChromosomeAccession',
       'Chromosome', 'Start', 'Stop', 'ReferenceAllele', 'AlternateAllele',
       'Cytogenetic', 'ReviewStatus', 'NumberSubmitters', 'Guidelines',
       'TestedInGTR', 'OtherIDs', 'SubmitterCategories', 'VariationID',
       'PositionVCF', 'ReferenceAlleleVCF', 'AlternateAlleleVCF'],
      dtype='object')

In [10]:
clinvar_summary_df['ClinicalSignificance'].value_counts()

Uncertain significance                                                1226639
Likely benign                                                          828566
Benign                                                                 402434
Pathogenic                                                             256411
Conflicting interpretations of pathogenicity                           133465
                                                                       ...   
Conflicting interpretations of pathogenicity; drug response; other          2
Pathogenic; association; protective                                         2
Conflicting interpretations of pathogenicity; Affects                       2
Benign; association; confers sensitivity                                    2
Conflicting interpretations of pathogenicity; protective                    2
Name: ClinicalSignificance, Length: 80, dtype: int64

In [11]:
ClinicalSignificance=['Pathogenic',
                      'Likely pathogenic', 'Pathogenic/Likely pathogenic'
                     ]

In [12]:
clinvar_summary_df=clinvar_summary_df[clinvar_summary_df['ClinicalSignificance'].isin(ClinicalSignificance)]
clinvar_summary_df.shape

(406244, 34)

In [13]:
clinvar_summary_df['ReviewStatus'].value_counts()

criteria provided, single submitter                     243267
no assertion criteria provided                           82163
criteria provided, multiple submitters, no conflicts     63520
reviewed by expert panel                                 17244
practice guideline                                          50
Name: ReviewStatus, dtype: int64

In [14]:
ReviewStatus=['criteria provided, multiple submitters, no conflicts','reviewed by expert panel']

In [15]:
clinvar_summary_df=clinvar_summary_df[clinvar_summary_df['ReviewStatus'].isin(ReviewStatus)]
clinvar_summary_df.shape

(80764, 34)

In [16]:
clinvar_summary_df['LastEvaluated_year']=clinvar_summary_df['LastEvaluated'].str.split(', ',expand=True)[1]
clinvar_summary_df['LastEvaluated_year'].value_counts()

2021    35318
2022    18355
2016     8366
2020     5608
2019     4103
2017     4039
2018     2357
2013     2220
2015      237
2014       68
2011       63
2012        6
2009        2
Name: LastEvaluated_year, dtype: int64

In [17]:
clinvar_summary_df=clinvar_summary_df[clinvar_summary_df['LastEvaluated_year'].isin(['2022','2021','2020','2019','2018','2017','2016'])]
clinvar_summary_df.shape

(78146, 35)

In [18]:
clinvar_summary_df[['PhenotypeIDS', 'PhenotypeList']].head(3)

Unnamed: 0,PhenotypeIDS,PhenotypeList
6,"MONDO:MONDO:0032624,MedGen:C4748791,OMIM:61824...","Mitochondrial complex 1 deficiency, nuclear ty..."
7,"MONDO:MONDO:0032624,MedGen:C4748791,OMIM:61824...","Mitochondrial complex 1 deficiency, nuclear ty..."
48,"MONDO:MONDO:0012984,MedGen:C2675204,OMIM:61267...",PHARC syndrome|not provided


In [19]:
# Make new columns indicating whether a phenotype is present
phenotype_column='PhenotypeList'

for phenotype in range(len(phenotype_dict)):
    clinvar_summary_df['phenotype_'+str(list(phenotype_dict.keys())[phenotype])]=0
    clinvar_summary_df['phenotype_'+str(list(phenotype_dict.keys())[phenotype])][clinvar_summary_df[phenotype_column].str.contains(case=False,pat='|'.join(list(phenotype_dict.values())[phenotype]))]=1

clinvar_summary_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clinvar_summary_df['phenotype_'+str(list(phenotype_dict.keys())[phenotype])][clinvar_summary_df[phenotype_column].str.contains(case=False,pat='|'.join(list(phenotype_dict.values())[phenotype]))]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clinvar_summary_df['phenotype_'+str(list(phenotype_dict.keys())[phenotype])][clinvar_summary_df[phenotype_column].str.contains(case=False,pat='|'.join(list(phenotype_dict.values())[phenotype]))]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vie

Unnamed: 0,#AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),...,phenotype_DCM,phenotype_HCM,phenotype_LVNC,phenotype_Venous anomaly,phenotype_Pulmonary artery,phenotype_Aortic arch,phenotype_Aorta,phenotype_Coronary,phenotype_VA valves,phenotype_AV valves
6,15044,single nucleotide variant,NM_017547.4(FOXRED1):c.694C>T (p.Gln232Ter),55572,FOXRED1,HGNC:26927,Pathogenic,1,"Dec 30, 2019",267606829,...,0,0,0,0,0,0,0,0,0,0
7,15044,single nucleotide variant,NM_017547.4(FOXRED1):c.694C>T (p.Gln232Ter),55572,FOXRED1,HGNC:26927,Pathogenic,1,"Dec 30, 2019",267606829,...,0,0,0,0,0,0,0,0,0,0
48,15066,single nucleotide variant,NM_001042472.3(ABHD12):c.1054C>T (p.Arg352Ter),26090,ABHD12,HGNC:15868,Pathogenic,1,"Jan 05, 2022",267606624,...,0,0,0,0,0,0,0,0,0,0
49,15066,single nucleotide variant,NM_001042472.3(ABHD12):c.1054C>T (p.Arg352Ter),26090,ABHD12,HGNC:15868,Pathogenic,1,"Jan 05, 2022",267606624,...,0,0,0,0,0,0,0,0,0,0
52,15068,Microsatellite,NM_138413.4(HOGA1):c.938AGG[2] (p.Glu315del),112817,HOGA1,HGNC:25155,Pathogenic,1,"Dec 15, 2021",397509360,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3057146,1706616,Deletion,NM_001034853.2(RPGR):c.1225del (p.Arg409fs),6103,RPGR,HGNC:10295,Pathogenic/Likely pathogenic,1,"Apr 14, 2022",-1,...,0,0,0,0,0,0,0,0,0,0
3057261,1706676,Duplication,NM_003322.6(TULP1):c.162dup (p.Thr55fs),7287,TULP1,HGNC:12423,Pathogenic/Likely pathogenic,1,"Dec 30, 2021",-1,...,0,0,0,0,0,0,0,0,0,0
3057262,1706676,Duplication,NM_003322.6(TULP1):c.162dup (p.Thr55fs),7287,TULP1,HGNC:12423,Pathogenic/Likely pathogenic,1,"Dec 30, 2021",-1,...,0,0,0,0,0,0,0,0,0,0
3060703,1708587,Duplication,NM_000152.5(GAA):c.1848dup (p.Val617fs),2548,GAA,HGNC:4065,Pathogenic,1,"Sep 06, 2022",-1,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Add a TF column
clinvar_summary_df['TF']=0
clinvar_summary_df['TF'][clinvar_summary_df.index.isin(tf_list)]=1 # add a column indicating whether TF or not

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clinvar_summary_df['TF'][clinvar_summary_df.index.isin(tf_list)]=1 # add a column indicating whether TF or not


# Review

In [21]:
for phenotype in range(len(phenotype_dict)):
    print('\n')
    print(str(list(phenotype_dict.keys())[phenotype]))
    print(clinvar_summary_df['GeneSymbol'][clinvar_summary_df['phenotype_'+str(list(phenotype_dict.keys())[phenotype])]==1].value_counts())



Heterotaxy
ZIC3          4
MMP21         4
CERS1;GDF1    2
NODAL         2
Name: GeneSymbol, dtype: int64


ASD
NKX2-5    6
ACTC1     4
TAB2      2
Name: GeneSymbol, dtype: int64


VSD
Series([], Name: GeneSymbol, dtype: int64)


AVSD
Series([], Name: GeneSymbol, dtype: int64)


Malformation of outflow tracts
MED13L    6
JAG1      4
FOXC1     2
ROBO1     2
Name: GeneSymbol, dtype: int64


Functional single ventricle
Series([], Name: GeneSymbol, dtype: int64)


Cardiomyopathy
TTN       664
MYBPC3    526
MYH7      234
DSP       176
LMNA      118
         ... 
MT-TI       2
COQ9        2
MT-TL1      2
MT-TK       2
PRDM16      1
Name: GeneSymbol, Length: 61, dtype: int64


DCM
TTN                 656
LMNA                 96
FLNC                 52
MYH7                 46
TNNT2                42
BAG3                 36
DSP                  30
DMD                  24
SCN5A                14
RBM20                14
DES                  12
ABCC9                10
TPM1                 10
MYB

# Tidy for export

In [22]:
clinvar_summary_df.set_index('GeneSymbol',inplace=True)
clinvar_summary_df

Unnamed: 0_level_0,#AlleleID,Type,Name,GeneID,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),nsv/esv (dbVar),...,phenotype_HCM,phenotype_LVNC,phenotype_Venous anomaly,phenotype_Pulmonary artery,phenotype_Aortic arch,phenotype_Aorta,phenotype_Coronary,phenotype_VA valves,phenotype_AV valves,TF
GeneSymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FOXRED1,15044,single nucleotide variant,NM_017547.4(FOXRED1):c.694C>T (p.Gln232Ter),55572,HGNC:26927,Pathogenic,1,"Dec 30, 2019",267606829,-,...,0,0,0,0,0,0,0,0,0,0
FOXRED1,15044,single nucleotide variant,NM_017547.4(FOXRED1):c.694C>T (p.Gln232Ter),55572,HGNC:26927,Pathogenic,1,"Dec 30, 2019",267606829,-,...,0,0,0,0,0,0,0,0,0,0
ABHD12,15066,single nucleotide variant,NM_001042472.3(ABHD12):c.1054C>T (p.Arg352Ter),26090,HGNC:15868,Pathogenic,1,"Jan 05, 2022",267606624,-,...,0,0,0,0,0,0,0,0,0,0
ABHD12,15066,single nucleotide variant,NM_001042472.3(ABHD12):c.1054C>T (p.Arg352Ter),26090,HGNC:15868,Pathogenic,1,"Jan 05, 2022",267606624,-,...,0,0,0,0,0,0,0,0,0,0
HOGA1,15068,Microsatellite,NM_138413.4(HOGA1):c.938AGG[2] (p.Glu315del),112817,HGNC:25155,Pathogenic,1,"Dec 15, 2021",397509360,-,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RPGR,1706616,Deletion,NM_001034853.2(RPGR):c.1225del (p.Arg409fs),6103,HGNC:10295,Pathogenic/Likely pathogenic,1,"Apr 14, 2022",-1,-,...,0,0,0,0,0,0,0,0,0,0
TULP1,1706676,Duplication,NM_003322.6(TULP1):c.162dup (p.Thr55fs),7287,HGNC:12423,Pathogenic/Likely pathogenic,1,"Dec 30, 2021",-1,-,...,0,0,0,0,0,0,0,0,0,0
TULP1,1706676,Duplication,NM_003322.6(TULP1):c.162dup (p.Thr55fs),7287,HGNC:12423,Pathogenic/Likely pathogenic,1,"Dec 30, 2021",-1,-,...,0,0,0,0,0,0,0,0,0,0
GAA,1708587,Duplication,NM_000152.5(GAA):c.1848dup (p.Val617fs),2548,HGNC:4065,Pathogenic,1,"Sep 06, 2022",-1,-,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Remove rows where there is no association with a cardiac phenotype
cols=clinvar_summary_df.columns.tolist()
phen_cols=[x for x in cols if x.startswith('phenotype_')]
print(clinvar_summary_df.shape)
clinvar_summary_df=clinvar_summary_df.loc[~(clinvar_summary_df[phen_cols]==0).all(axis=1)]
print(clinvar_summary_df.shape)
clinvar_summary_df['source']='ClinVar'
clinvar_summary_df

(3349, 52)
(3349, 52)


Unnamed: 0_level_0,#AlleleID,Type,Name,GeneID,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),nsv/esv (dbVar),...,phenotype_LVNC,phenotype_Venous anomaly,phenotype_Pulmonary artery,phenotype_Aortic arch,phenotype_Aorta,phenotype_Coronary,phenotype_VA valves,phenotype_AV valves,TF,source
GeneSymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RBM20,15307,single nucleotide variant,NM_001134363.3(RBM20):c.1913C>T (p.Pro638Leu),282996,HGNC:27424,Pathogenic,1,"Jul 26, 2022",267607003,-,...,0,0,0,0,0,0,0,0,0,ClinVar
RBM20,15307,single nucleotide variant,NM_001134363.3(RBM20):c.1913C>T (p.Pro638Leu),282996,HGNC:27424,Pathogenic,1,"Jul 26, 2022",267607003,-,...,0,0,0,0,0,0,0,0,0,ClinVar
RBM20,15308,single nucleotide variant,NM_001134363.3(RBM20):c.1901G>A (p.Arg634Gln),282996,HGNC:27424,Pathogenic,1,"Mar 18, 2022",267607001,-,...,0,0,0,0,0,0,0,0,0,ClinVar
RBM20,15308,single nucleotide variant,NM_001134363.3(RBM20):c.1901G>A (p.Arg634Gln),282996,HGNC:27424,Pathogenic,1,"Mar 18, 2022",267607001,-,...,0,0,0,0,0,0,0,0,0,ClinVar
RBM20,15309,single nucleotide variant,NM_001134363.3(RBM20):c.1906C>A (p.Arg636Ser),282996,HGNC:27424,Pathogenic/Likely pathogenic,1,"Jan 06, 2022",267607002,-,...,0,0,0,0,0,0,0,0,0,ClinVar
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTN,1464444,Duplication,NM_001267550.2(TTN):c.68286_68289dup (p.Trp227...,7273,HGNC:12403,Pathogenic,1,"Apr 14, 2022",-1,-,...,0,0,0,0,0,0,0,0,0,ClinVar
TTN,1464767,single nucleotide variant,NM_001267550.2(TTN):c.66628C>T (p.Gln22210Ter),7273,HGNC:12403,Pathogenic/Likely pathogenic,1,"Mar 07, 2022",-1,-,...,0,0,0,0,0,0,0,0,0,ClinVar
TTN,1464767,single nucleotide variant,NM_001267550.2(TTN):c.66628C>T (p.Gln22210Ter),7273,HGNC:12403,Pathogenic/Likely pathogenic,1,"Mar 07, 2022",-1,-,...,0,0,0,0,0,0,0,0,0,ClinVar
PKP2,1487925,Deletion,NM_001005242.3(PKP2):c.1264_1265del (p.Leu422fs),5318,HGNC:9024,Pathogenic,1,"Jun 01, 2022",-1,-,...,0,0,0,0,0,0,0,0,0,ClinVar


In [26]:
clinvar_summary_df.to_csv('/home/jovyan/data/ClinVar_with_phenotype_annotation.csv')