In [1]:
import pandas as pd
import requests
import json

In [2]:
hpo_df = pd.read_csv('genes_to_phenotype_202203.txt', sep='\t', comment='#', header=None)
hpo_df.columns = ['entrez-gene-id',
                'entrez-gene-symbol', 
                'HPO-Term-ID', 
                'HPO-Term-Name',
                'Frequency-Raw', 
                'Frequency-HPO',
                'Additional Info from G-D source',
                'G-D source', 
                'disease-ID for link']
#print(hpo_df.head())

In [3]:
def parse_hpo_id_for_gene_symbol(gene_symbol):
    return hpo_df[hpo_df['entrez-gene-symbol'] == gene_symbol]['HPO-Term-ID'].to_list()

def parse_hpo_term_for_gene_symbol(gene_symbol):
    return hpo_df[hpo_df['entrez-gene-symbol'] == gene_symbol]['HPO-Term-Name'].to_list()


In [4]:
# getting hpo terms from varvis

def retrieve_hpo_terms_from_varvis(sample_id):
    hp_list=[]
    username='anthony.mackitz'
    passwd='User002#'

    r = requests.head("https://arcensus-diagnostics.varvis.com/authenticate")
    token = r.headers['X-CSRF-TOKEN']
    sessionId = r.cookies['session']

    r = requests.post("https://arcensus-diagnostics.varvis.com/login", data = {'_csrf': token, 'username': username, 'password': passwd}, cookies = dict(session=sessionId))
    sessionId = r.cookies['session']
    r = requests.head("https://arcensus-diagnostics.varvis.com/authenticate", cookies = dict(session=sessionId))


    hpor=requests.get("https://arcensus-diagnostics.varvis.com/api/person/"+sample_id, cookies = dict(session=sessionId))

    hpoj=json.loads(hpor.content)

    res=hpoj['response']['clinicalInformation']['hpoTerms']
    for i in res:
        #print(i)
        #hp_list.append((i['accession'], i['name'] ))
        hp_list.append( i['name'] )
    return hp_list 


In [5]:
retrieve_hpo_terms_from_varvis('603703W1a')

['Delayed speech and language development',
 'Global developmental delay',
 'Decreased body weight',
 'Severe muscular hypotonia',
 'Feeding difficulties']

In [6]:
 

master_STR_df = pd.read_excel('master_str_max_highlighted.xlsx')
required_columns = ['LOCUS'] + ['Pathogenic'] + [col for col in master_STR_df.columns if col.endswith('a')]

modified_master_STR_df = master_STR_df[required_columns]

In [7]:
modified_master_STR_df.head()

Unnamed: 0,LOCUS,Pathogenic,603703W1a,603730W1a,603733W1a,603736W1a,603742W1a,603744W1a,603749W1a,603753W1a,...,704867W1a,704870W1a,704874W1a,704878W1a,704881B1a,704886B1a,704887B1a,704888B1a,704889B1a,704890B1a
0,DAB1,31,13.0,13.0,13.0,17.0,12.0,13.0,12.0,14.0,...,12.0,21.0,10.0,17.0,16.0,12.0,23.0,17.0,28.0,14.0
1,NOTCH2NLC,90,11.0,16.0,17.0,19.0,19.0,20.0,15.0,19.0,...,12.0,25.0,11.0,15.0,18.0,16.0,17.0,18.0,20.0,15.0
2,NOTCH2NLC,60,11.0,16.0,17.0,19.0,19.0,20.0,15.0,19.0,...,12.0,25.0,11.0,15.0,18.0,16.0,17.0,18.0,20.0,15.0
3,STARD7,274,13.0,24.0,12.0,23.0,22.0,13.0,22.0,16.0,...,14.0,13.0,,12.0,13.0,,10.0,10.0,,
4,HOXD13,22,,,,,,,,,...,,,,,,,,,,


In [8]:
def parsing_master_eh_file(sample_id):
    locus_list = set()
    for index,val in enumerate(modified_master_STR_df[sample_id]):
        if val >= modified_master_STR_df['Pathogenic'][index]:
            locus_list.add(modified_master_STR_df['LOCUS'][index])
    return locus_list



 

In [9]:
print(len( parsing_master_eh_file('704465W1a')))

1


In [10]:
fout = open('hpo_varvis_match.csv', 'w')

In [11]:
for id in modified_master_STR_df.columns[2:]:
       print(id)
       locus_list = parsing_master_eh_file(id)

       if locus_list:
              varvis_hpo = retrieve_hpo_terms_from_varvis(id)
              for locus in locus_list:
                     #print(parse_hpo_term_for_gene_symbol(locus))
                     #print(locus, parse_hpo_term_for_gene_symbol(locus))
                     match = list(set(varvis_hpo).intersection(parse_hpo_term_for_gene_symbol(locus)))
                     fout.write('{};{};{};{}\n'.format(id,locus,match,len(match) ))
                     print(('{};{};{};{}\n'.format(id,locus,match,len(match) )))
fout.close()
        
                     

#https://stackoverflow.com/questions/2864842/common-elements-comparison-between-2-lists    

603703W1a
603703W1a;ARX_2;[];0

603730W1a
603730W1a;ARX_2;[];0

603730W1a;HOXA13_1;[];0

603733W1a
603733W1a;ARX_2;[];0

603736W1a
603736W1a;ARX_2;[];0

603736W1a;RUNX2;['Frontal bossing'];1

603742W1a
603742W1a;ARX_2;[];0

603742W1a;HOXA13_3;[];0

603744W1a
603744W1a;ARX_2;[];0

603749W1a
603749W1a;ARX_2;[];0

603753W1a
603760W1a
603760W1a;ARX_2;[];0

603763W1a
603763W1a;ARX_2;[];0

603763W1a;TCF4;['Seizure', 'Aggressive behavior'];2

603763W1a;RUNX2;[];0

603766W1a
603766W1a;RUNX2;[];0

603773W1a
603776W1a
603776W1a;ARX_2;[];0

603776W1a;HOXA13_1;[];0

603784W2a
603784W2a;ARX_2;[];0

603784W2a;HOXA13_3;[];0

603792W1a
603792W1a;ARX_2;[];0

603818W1a
603822W1a
603822W1a;ARX_2;[];0

603828W1a
603828W1a;ARX_2;[];0

603831W1a
603836W1a
603847W2a
603847W2a;ARX_1;[];0

603847W2a;HOXA13_3;[];0

603859W1a
603862W1a
603862W1a;ARX_2;[];0

603865W1a
603865W1a;ARX_2;[];0

603868W1a
603868W1a;ARX_2;[];0

603871W1a
603874W1a
603874W1a;ARX_2;[];0

603874W1a;HOXA13_3;[];0

603877W1a
603877W1a;HOXA13

In [12]:
hpo_varvis_df = pd.read_csv('hpo_varvis_match.csv', sep=';', header=None)
hpo_varvis_df.columns = ['sampleId','locus', 'matching_HPO_terms', 'count_of_matches']
print(hpo_varvis_df.head())

    sampleId     locus matching_HPO_terms  count_of_matches
0  603703W1a     ARX_2                 []                 0
1  603730W1a     ARX_2                 []                 0
2  603730W1a  HOXA13_1                 []                 0
3  603733W1a     ARX_2                 []                 0
4  603736W1a     ARX_2                 []                 0


In [13]:
hpo_varvis_df.shape

(327, 4)

In [14]:
observed_list = []
pathogenic_list = []
for index, row in hpo_varvis_df.iterrows():
    print(row['sampleId'], row['locus'])
    observed = modified_master_STR_df [modified_master_STR_df['LOCUS']==row['locus']][row['sampleId']].tolist()
    pathogenic = modified_master_STR_df [modified_master_STR_df['LOCUS']==row['locus']]['Pathogenic'].tolist()
    observed_list.append(set(observed))
    pathogenic_list.append(set(pathogenic))
    #print(observed)
    #print(pathogenic)
     



603703W1a ARX_2
603730W1a ARX_2
603730W1a HOXA13_1
603733W1a ARX_2
603736W1a ARX_2
603736W1a RUNX2
603742W1a ARX_2
603742W1a HOXA13_3
603744W1a ARX_2
603749W1a ARX_2
603760W1a ARX_2
603763W1a ARX_2
603763W1a TCF4
603763W1a RUNX2
603766W1a RUNX2
603776W1a ARX_2
603776W1a HOXA13_1
603784W2a ARX_2
603784W2a HOXA13_3
603792W1a ARX_2
603822W1a ARX_2
603828W1a ARX_2
603847W2a ARX_1
603847W2a HOXA13_3
603862W1a ARX_2
603865W1a ARX_2
603868W1a ARX_2
603874W1a ARX_2
603874W1a HOXA13_3
603877W1a HOXA13_3
603877W1a RUNX2
603880W1a ARX_2
603881W1a ARX_2
603881W1a HOXA13_1
603881W1a HOXA13_3
603891W1a ARX_2
603894W1a HOXA13_1
603894W1a HOXA13_3
603903W1a HOXA13_3
603903W1a RUNX2
603907W1a ARX_2
603907W1a RUNX2
603910W1a DMD
603910W1a HOXA13_3
603913W1a ARX_2
603913W1a RUNX2
603917W1a ARX_2
603917W1a ZIC3
603920W1a HOXA13_3
603929W1a ARX_2
603934W1a ARX_2
603936W1a ARX_2
603936W1a HOXA13_3
603940W1a ARX_2
603940W1a HTT
604018W1a ARX_2
604029W1a ARX_2
604033W1a ARX_2
604033W1a HOXA13_1
604033W1a HOXA

In [15]:
hpo_varvis_df['observed_str_count'] = observed_list
hpo_varvis_df['pathogenic_cutoff'] = pathogenic_list

In [16]:
hpo_varvis_df 

Unnamed: 0,sampleId,locus,matching_HPO_terms,count_of_matches,observed_str_count,pathogenic_cutoff
0,603703W1a,ARX_2,[],0,{30.0},"{20, 23}"
1,603730W1a,ARX_2,[],0,{57.0},"{20, 23}"
2,603730W1a,HOXA13_1,[],0,{52.0},{22}
3,603733W1a,ARX_2,[],0,{37.0},"{20, 23}"
4,603736W1a,ARX_2,[],0,{64.0},"{20, 23}"
...,...,...,...,...,...,...
322,704867W1a,HOXA13_3,[],0,{50.0},{24}
323,704887B1a,ARX_2,[],0,{40.0},"{20, 23}"
324,704888B1a,ARX_2,[],0,{62.0},"{20, 23}"
325,704888B1a,RUNX2,[],0,{56.0},{27}


In [17]:
hpo_varvis_df.to_excel('Final_Analyis_Expansion_Hunter.xlsx', index=False)