# Setup

In [11]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import rc_context
import bbknn
import re
import httplib2 as http
import json
import os
import glob

In [12]:
sc.settings.verbosity = 0            # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white', color_map = 'Reds', figsize = (5, 5),dpi_save=300)

scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.22.4 scipy==1.8.1 pandas==1.4.3 scikit-learn==1.1.1 statsmodels==0.13.2 python-igraph==0.9.11 pynndescent==0.5.7


# Import a list of all human TFs

In [13]:
tf_df=pd.read_csv('http://humantfs.ccbr.utoronto.ca/download/v_1.01/DatabaseExtract_v_1.01.csv',index_col='HGNC symbol') # import a df of all human TFs
tf_df=tf_df.drop(['Unnamed: 0'],axis=1) # tidy
tf_list=tf_df.index.tolist() # convert gene symbol to list

# Define a list of phenotypes and search terms for those

In [14]:
# Using the list of phenotypes (see notebook pt1), make a dictionary grouping them
phenotype_dict={
    'Heterotaxy':['eterotaxy','Dextrocardia'],
    'ASD':['ASD','trial septal','Atrial septal defect','Patent foramen ovale'],
    'VSD':['VSD','entricular septal','Ventricular septal defect'],
    'AVSD':['AVSD','trioventricular septal','Atrioventricular septal defect'],
    'Malformation of outflow tracts':['Truncus arteriosus','Malformation of outflow tracts','Fallot','fallot','Double outlet right ventricle','Pentalogy of fallot','Transposition of the great arteries','Transposition of the great vessels',],
    'Functional single ventricle':['Functional single ventricle','Double-inlet left ventricle','Hypoplastic left heart syndrome','Hypoplastic right ventricle','Hypoplastic rv'],
#    'Obstructive lesions':['Obstructive lesions','alv','stenosis'], # removed because I don't know how to define this term
    'Cardiomyopathy':['ardiomyopathy'],
    'DCM':['DCM','ilated'],
    'HCM':['HCM','ypertrophic'],
    'LVNC':['ompaction','LVNC','Left ventricular noncompaction'],
    'Venous anomaly':['Total anomalous pulmonary venous return','Pulmonary vein stenosis','Partial anomalous pulmonary venous return','Persistent left superior vena cava'],
    'Pulmonary artery':['Dilatation of the pulmonary trunk','Hypoplastic left pulmonary artery','Pulmonary artery dysplasia and hypoplasia','Pulmonary atresia','Pulmonary trunk and pulmonary artery absence'],
    'Aortic arch':['Right aortic arch','Patent ductus arteriosus','Ductus arteriosus','Coarctation of the aorta','Interrupted aortic arch','Lsvc and left pulmonary artery arising from the ductus arteriosus'],
    'Aorta':['Aberrant supraclavicular artery','Anomalous right subclavian artery','Aortic aneurysm','Aortic hypoplasia'],
    'Coronary anomaly':['Anomalous left coronary artery from the pulmonary artery','Coronary artery anomaly'],
    'VA valves':['Pulmonic stenosis','Absent pulmonary valve','Bicuspid aortic valve', 'Aortic stenosis','Dysplastic pulmonary valve','Pulmonary stenosis'],
    'AV valves':['Dysplastic mitral valve','Dysplastic tricuspid valve',"Ebstein's anomaly",'Mitral atresia','Mitral stenosis','Mitral valve dysplasia','Tricuspid atresia','Tricuspid stenosis','Tricuspid valve atresia','Tricuspid valve dysplasia']
}

# Import PanelApp panels

In [15]:
# read in .tsvs for each trait, calling it by the trait name

all_files = []
for i in range(len(os.listdir('/home/jovyan/notebooks/suspension/scanpy_clustering/marker_lists/PanelApp/'))):
    all_files.append(os.listdir('/home/jovyan/notebooks/suspension/scanpy_clustering/marker_lists/PanelApp/')[i])
    
# remove '.ipynb...'

bad_traits = [item for item in all_files if item.startswith('.')]
all_files=[item for item in all_files if item not in bad_traits]

all_files

['Brugada syndrome.tsv',
 'Catecholaminergic polymorphic VT.tsv',
 'Pulmonary arterial hypertension.tsv',
 'Dilated cardiomyopathy - adult and teen.tsv',
 'Left Ventricular Noncompaction Cardiomyopathy.tsv',
 'Hypertrophic cardiomyopathy - teen and adult.tsv',
 'Familial non syndromic congenital heart disease.tsv',
 'Long QT syndrome.tsv',
 'Dilated Cardiomyopathy and conduction defects.tsv',
 'Cardiomyopathies - including childhood onset.tsv',
 'Arrhythmogenic cardiomyopathy.tsv',
 'Thoracic aortic aneurysm or dissection.tsv',
 'Cardiac arrhythmias.tsv',
 'Progressive cardiac conduction disease.tsv']

In [16]:
# Some of the files aren't about developmental disorders

files_for_analysis=['Left Ventricular Noncompaction Cardiomyopathy.tsv',
 'Familial non syndromic congenital heart disease.tsv',
'Cardiomyopathies - including childhood onset.tsv']

In [17]:
path = '/home/jovyan/notebooks/suspension/scanpy_clustering/marker_lists/PanelApp/'

li = []

for filename in files_for_analysis:
    df = pd.read_table(os.path.join(path,filename))
    li.append(df)

df = pd.concat(li, axis=0, ignore_index=True)
df=df.set_index('Entity Name')
df=df.rename_axis('gene')
df

Unnamed: 0_level_0,Entity type,Gene Symbol,Sources(; separated),Level4,Level3,Level2,Model_Of_Inheritance,Phenotypes,Omim,Orphanet,...,Position GRCh38 Start,Position GRCh38 End,STR Repeated Sequence,STR Normal Repeats,STR Pathogenic Repeats,Region Haploinsufficiency Score,Region Triplosensitivity Score,Region Required Overlap Percentage,Region Variant Type,Region Verbose Name
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACTC1,gene,ACTC1,Expert list;Expert Review Green;Illumina TruGe...,Left Ventricular Noncompaction Cardiomyopathy,Cardiomyopathy,Cardiovascular disorders,"MONOALLELIC, autosomal or pseudoautosomal, imp...",Left ventricular noncompaction 4 ;Left Ventric...,,,...,,,,,,,,,,
MYBPC3,gene,MYBPC3,Eligibility statement prior genetic testing;Ex...,Left Ventricular Noncompaction Cardiomyopathy,Cardiomyopathy,Cardiovascular disorders,"MONOALLELIC, autosomal or pseudoautosomal, imp...","Left ventricular noncompaction 10, ;Hypertroph...",,,...,,,,,,,,,,
MYH7,gene,MYH7,Eligibility statement prior genetic testing;Ex...,Left Ventricular Noncompaction Cardiomyopathy,Cardiomyopathy,Cardiovascular disorders,"MONOALLELIC, autosomal or pseudoautosomal, imp...","Left ventricular noncompaction 5, OMIM:613426;...",,,...,,,,,,,,,,
TAZ,gene,TAZ,Expert list;Expert Review Green;Illumina TruGe...,Left Ventricular Noncompaction Cardiomyopathy,Cardiomyopathy,Cardiovascular disorders,"X-LINKED: hemizygous mutation in males, monoal...",Left Ventricular Noncompaction Cardiomyopathy,,,...,,,,,,,,,,
TNNT2,gene,TNNT2,Eligibility statement prior genetic testing;Ex...,Left Ventricular Noncompaction Cardiomyopathy,Cardiomyopathy,Cardiovascular disorders,"MONOALLELIC, autosomal or pseudoautosomal, imp...","Left ventricular noncompaction 6, ;Hypertrophi...",,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TPM1,gene,TPM1,Expert Review Green;London South GLH;NHS GMS;S...,Cardiomyopathies - including childhood onset,,,"MONOALLELIC, autosomal or pseudoautosomal, NOT...","Cardiomyopathy, familial hypertrophic, 3;Cardi...",,,...,,,,,,,,,,
TSFM,gene,TSFM,Expert Review Green;NHS GMS;South West GLH,Cardiomyopathies - including childhood onset,,,"BIALLELIC, autosomal or pseudoautosomal",Combined oxidative phosphorylation deficiency ...,,,...,,,,,,,,,,
TTN,gene,TTN,Expert Review Green;NHS GMS;South West GLH,Cardiomyopathies - including childhood onset,,,"MONOALLELIC, autosomal or pseudoautosomal, NOT...","Cardiomyopathy, familial hypertrophic, 9,;Card...",,,...,,,,,,,,,,
TTR,gene,TTR,Expert Review Green;NHS GMS;South West GLH,Cardiomyopathies - including childhood onset,,,"MONOALLELIC, autosomal or pseudoautosomal, NOT...",syndromic HCM,,,...,,,,,,,,,,


In [18]:
phenotype_column='Phenotypes'

df['Phenotypes'][df['Phenotypes'].isna()]=df['Level4'] # replaces NaNs in Phenotypes column, allowing it to be used to filter for conditions

for phenotype in range(len(phenotype_dict)):
    df['phenotype_'+str(list(phenotype_dict.keys())[phenotype])]=0
    df['phenotype_'+str(list(phenotype_dict.keys())[phenotype])][df[phenotype_column].str.contains('|'.join(list(phenotype_dict.values())[phenotype]))]=1

df['TF']=0
df['TF'][df.index.isin(tf_list)]=1 # add a column indicating whether TF or not
df=df[df['Entity type']!='region'] # Remove any non-gene entities
df['source']='PanelApp'
df['granular_phenotype']=df[phenotype_column]
PanelApp=df
PanelApp

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Phenotypes'][df['Phenotypes'].isna()]=df['Level4'] # replaces NaNs in Phenotypes column, allowing it to be used to filter for conditions
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['phenotype_'+str(list(phenotype_dict.keys())[phenotype])][df[phenotype_column].str.contains('|'.join(list(phenotype_dict.values())[phenotype]))]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['phenotype_'+str(list(phenotype_dict.keys())[phenotype])][df[phenotype_column].str

Unnamed: 0_level_0,Entity type,Gene Symbol,Sources(; separated),Level4,Level3,Level2,Model_Of_Inheritance,Phenotypes,Omim,Orphanet,...,phenotype_Venous anomaly,phenotype_Pulmonary artery,phenotype_Aortic arch,phenotype_Aorta,phenotype_Coronary anomaly,phenotype_VA valves,phenotype_AV valves,TF,source,granular_phenotype
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACTC1,gene,ACTC1,Expert list;Expert Review Green;Illumina TruGe...,Left Ventricular Noncompaction Cardiomyopathy,Cardiomyopathy,Cardiovascular disorders,"MONOALLELIC, autosomal or pseudoautosomal, imp...",Left ventricular noncompaction 4 ;Left Ventric...,,,...,0,0,0,0,0,0,0,0,PanelApp,Left ventricular noncompaction 4 ;Left Ventric...
MYBPC3,gene,MYBPC3,Eligibility statement prior genetic testing;Ex...,Left Ventricular Noncompaction Cardiomyopathy,Cardiomyopathy,Cardiovascular disorders,"MONOALLELIC, autosomal or pseudoautosomal, imp...","Left ventricular noncompaction 10, ;Hypertroph...",,,...,0,0,0,0,0,0,0,0,PanelApp,"Left ventricular noncompaction 10, ;Hypertroph..."
MYH7,gene,MYH7,Eligibility statement prior genetic testing;Ex...,Left Ventricular Noncompaction Cardiomyopathy,Cardiomyopathy,Cardiovascular disorders,"MONOALLELIC, autosomal or pseudoautosomal, imp...","Left ventricular noncompaction 5, OMIM:613426;...",,,...,0,0,0,0,0,0,0,0,PanelApp,"Left ventricular noncompaction 5, OMIM:613426;..."
TAZ,gene,TAZ,Expert list;Expert Review Green;Illumina TruGe...,Left Ventricular Noncompaction Cardiomyopathy,Cardiomyopathy,Cardiovascular disorders,"X-LINKED: hemizygous mutation in males, monoal...",Left Ventricular Noncompaction Cardiomyopathy,,,...,0,0,0,0,0,0,0,0,PanelApp,Left Ventricular Noncompaction Cardiomyopathy
TNNT2,gene,TNNT2,Eligibility statement prior genetic testing;Ex...,Left Ventricular Noncompaction Cardiomyopathy,Cardiomyopathy,Cardiovascular disorders,"MONOALLELIC, autosomal or pseudoautosomal, imp...","Left ventricular noncompaction 6, ;Hypertrophi...",,,...,0,0,0,0,0,0,0,0,PanelApp,"Left ventricular noncompaction 6, ;Hypertrophi..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TPM1,gene,TPM1,Expert Review Green;London South GLH;NHS GMS;S...,Cardiomyopathies - including childhood onset,,,"MONOALLELIC, autosomal or pseudoautosomal, NOT...","Cardiomyopathy, familial hypertrophic, 3;Cardi...",,,...,0,0,0,0,0,0,0,0,PanelApp,"Cardiomyopathy, familial hypertrophic, 3;Cardi..."
TSFM,gene,TSFM,Expert Review Green;NHS GMS;South West GLH,Cardiomyopathies - including childhood onset,,,"BIALLELIC, autosomal or pseudoautosomal",Combined oxidative phosphorylation deficiency ...,,,...,0,0,0,0,0,0,0,0,PanelApp,Combined oxidative phosphorylation deficiency ...
TTN,gene,TTN,Expert Review Green;NHS GMS;South West GLH,Cardiomyopathies - including childhood onset,,,"MONOALLELIC, autosomal or pseudoautosomal, NOT...","Cardiomyopathy, familial hypertrophic, 9,;Card...",,,...,0,0,0,0,0,0,0,0,PanelApp,"Cardiomyopathy, familial hypertrophic, 9,;Card..."
TTR,gene,TTR,Expert Review Green;NHS GMS;South West GLH,Cardiomyopathies - including childhood onset,,,"MONOALLELIC, autosomal or pseudoautosomal, NOT...",syndromic HCM,,,...,0,0,0,0,0,0,0,0,PanelApp,syndromic HCM


In [19]:
PanelApp.to_csv('/home/jovyan/data/PanelApp_with_phenotype_annotation.csv')