In [1]:
import pandas as pd
import polars as pl
import numpy as np
import re

# Proces the neuroblastoma somatic variants. 
They were pulled from VWB on June 24th 2025. These are just the VEP=HIGH impact variants

In [2]:

def fill_missing_cols(df):
    if 'node_id' not in df.columns:
        raise ValueError('Must have at least a "node_id" column.')
    
    all_cols = set([ 'node_label', 'node_synonyms', 'node_dbxrefs',
            'node_definition','node_namespace','value','lowerbound','upperbound','unit'])
    missing_cols = list(all_cols - set(df.columns))
    nan_cols_df = pd.DataFrame(np.full([len(df), len(missing_cols)], np.nan),columns=missing_cols)

    if isinstance(df, pd.DataFrame):
        nan_cols_df.index = df.index
        return pd.concat([df,nan_cols_df],axis=1)
    elif isinstance(df, pl.DataFrame):
        return pl.concat([df,pl.from_pandas(nan_cols_df)],how='horizontal') # no index for polars
    else:
        raise ValueError(f'Must Pass either a pandas DataFrame or a polars DataFrame but recieved "{type(df)}".')
    

def bin_column(df, col2bin=''):
    '''works with polars df'''
    
    '''tpm_bins = list([0.0000000,7e-4,8e-4,9e-4]) + list(np.linspace(1e-3,9e-3,9)) + \
               list(np.round(np.linspace(1e-2,9e-2,9),2)) + list(np.round(np.linspace(.1,1,10),2)) + \
               list(np.linspace(2,100,99)) + list(np.arange(100,1100,100)[1:]) +  \
                list(np.arange(2000,11000,1000)) + list(np.arange(20_000,110_000,10_000)) + [300_000] '''
    
    bins = np.unique(hscloChrom['lowerbound'].values) #hscloChrom['lowerbound'].values
    # check that none of the disease expression values are out of range, ie larger than the max bin value
    # use for pandas
    #if np.sum(df['Disease_MeanTpm'] > np.max(tpm_bins)) == 0:
        
    # use for polars
    if len(df.filter(df[col2bin] > np.max(bins))) == 0:
        df_binned = df.with_columns(  df[col2bin].cut(breaks=bins).alias('bins')  )
        # check that order has been maintained #assert pl.all(df[col2bin] == df_binned[col2bin])
    else:
        print('OUT OF RANGE ERROR!')
        assert False 
    return df_binned

In [3]:
df=pd.read_csv('/home/stearb/U24/data/somatic_variants/NBL_somatic_variants_VEP_HIGH_June24.csv')
df.head(5)

Unnamed: 0,sample_id,participant_id,variant_class,zygosity,study_code,ensembl_transcript_id,ensembl_gene_id,symbol,feature_type,hgvsp,aa_change,consequence,symbol.1,vep_impact,rsnumber,hgvsg
0,BS_SY6CCX4G,PT_3YW2V4JK,deletion,HET,KF-NBL,ENST00000428771,ENSG00000188290,HES4,Transcript,ENSP00000393198.2:p.Arg50GlyfsTer60,p.Gly49/247Xaa,['frameshift'],HES4,HIGH,,chr1:g.999832del
1,BS_SY6CCX4G,PT_3YW2V4JK,deletion,HET,KF-NBL,NM_001142467.2,57801,HES4,Transcript,NP_001135939.1:p.Arg50GlyfsTer60,p.Gly49/247Xaa,['frameshift'],HES4,HIGH,,chr1:g.999832del
2,BS_8313ZZ6E,PT_3YW2V4JK,deletion,WT,KF-NBL,ENST00000428771,ENSG00000188290,HES4,Transcript,ENSP00000393198.2:p.Arg50GlyfsTer60,p.Gly49/247Xaa,['frameshift'],HES4,HIGH,,chr1:g.999832del
3,BS_8313ZZ6E,PT_3YW2V4JK,deletion,WT,KF-NBL,NM_001142467.2,57801,HES4,Transcript,NP_001135939.1:p.Arg50GlyfsTer60,p.Gly49/247Xaa,['frameshift'],HES4,HIGH,,chr1:g.999832del
4,BS_Z40644EA,PT_W50HSNE2,SNV,WT,KF-NBL,XM_017001945.1,5792,PTPRF,Transcript,XP_016857434.1:p.Ser531Ter,p.Ser531/1807Ter,['stop gained'],PTPRF,HIGH,,chr1:g.43591872C>A


In [4]:
df[['hgvsg']].nunique()

hgvsg    1609
dtype: int64

In [4]:
# merge in ensembl gene ids for the rows where the ensembl_gene_id col actually has an entrez id
df_ensembl = df[df['ensembl_gene_id'].str.startswith('ENSG')]
df_entrez  = df[~df['ensembl_gene_id'].str.startswith('ENSG')]

hgnc_master = pd.read_csv('/home/stearb/U24/data/helper_files/hgnc_master.txt',
                          sep='\t')
hgnc_master.head()
ensembl_entrez_map = hgnc_master[['ensembl_gene_id','entrez_id']].dropna()
ensembl_entrez_map['entrez_id'] = ensembl_entrez_map['entrez_id'].astype(int).astype(str)

df_entrez_mapped = pd.merge(df_entrez.rename({'ensembl_gene_id':'entrez_id'},axis=1)
                           ,ensembl_entrez_map,on='entrez_id')\
                            .drop('entrez_id',axis=1)

# add them back together, sort columns first though they got out of order during merge
df = pd.concat([df_ensembl[df_ensembl.columns.sort_values()],
                df_entrez_mapped[df_entrez_mapped.columns.sort_values()]])

df

  hgnc_master = pd.read_csv('/home/stearb/U24/data/helper_files/hgnc_master.txt',


Unnamed: 0,aa_change,consequence,ensembl_gene_id,ensembl_transcript_id,feature_type,hgvsg,hgvsp,participant_id,rsnumber,sample_id,study_code,symbol,symbol.1,variant_class,vep_impact,zygosity
0,p.Gly49/247Xaa,['frameshift'],ENSG00000188290,ENST00000428771,Transcript,chr1:g.999832del,ENSP00000393198.2:p.Arg50GlyfsTer60,PT_3YW2V4JK,,BS_SY6CCX4G,KF-NBL,HES4,HES4,deletion,HIGH,HET
2,p.Gly49/247Xaa,['frameshift'],ENSG00000188290,ENST00000428771,Transcript,chr1:g.999832del,ENSP00000393198.2:p.Arg50GlyfsTer60,PT_3YW2V4JK,,BS_8313ZZ6E,KF-NBL,HES4,HES4,deletion,HIGH,WT
9,p.Ser199/1291Ter,['stop gained'],ENSG00000142949,ENST00000412568,Transcript,chr1:g.43591872C>A,ENSP00000391764.2:p.Ser199Ter,PT_W50HSNE2,,BS_Z40644EA,KF-NBL,PTPRF,PTPRF,SNV,HIGH,WT
15,p.Ser188/1553Ter,['stop gained'],ENSG00000142949,ENST00000429895,Transcript,chr1:g.43591872C>A,ENSP00000408952.1:p.Ser188Ter,PT_W50HSNE2,,BS_Z40644EA,KF-NBL,PTPRF,PTPRF,SNV,HIGH,WT
25,p.Ser531/1907Ter,['stop gained'],ENSG00000142949,ENST00000359947,Transcript,chr1:g.43591872C>A,ENSP00000353030.4:p.Ser531Ter,PT_W50HSNE2,,BS_Z40644EA,KF-NBL,PTPRF,PTPRF,SNV,HIGH,WT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18223,p.Ser200/682Ter,['stop gained'],ENSG00000173681,XM_011545477.2,Transcript,chrX:g.19966092G>T,XP_011543779.1:p.Ser200Ter,PT_NBFP0BB7,rs1230094791,BS_54N90X5G,KF-NBL,BCLAF3,BCLAF3,SNV,HIGH,WT
18224,p.Ser200/711Ter,['stop gained'],ENSG00000173681,NM_001367774.2,Transcript,chrX:g.19966092G>T,NP_001354703.1:p.Ser200Ter,PT_NBFP0BB7,rs1230094791,BS_54N90X5G,KF-NBL,BCLAF3,BCLAF3,SNV,HIGH,WT
18225,p.Ser200/652Ter,['stop gained'],ENSG00000173681,XM_011545479.2,Transcript,chrX:g.19966092G>T,XP_011543781.1:p.Ser200Ter,PT_NBFP0BB7,rs1230094791,BS_54N90X5G,KF-NBL,BCLAF3,BCLAF3,SNV,HIGH,WT
18226,p.Ser200/681Ter,['stop gained'],ENSG00000173681,XM_005274475.3,Transcript,chrX:g.19966092G>T,XP_005274532.1:p.Ser200Ter,PT_NBFP0BB7,rs1230094791,BS_54N90X5G,KF-NBL,BCLAF3,BCLAF3,SNV,HIGH,WT


# Define codes

In [8]:
# variant
df['hgvsg'] = df['hgvsg'].replace({':': '.'},regex=True) 
df['hgvsg_variant_code'] = 'HGVSG:' + df['hgvsg'] 

# ensembl gene
df['ens_gene_code'] = 'ENSEMBL:' + df['ensembl_gene_id']

# ensembl transcript
df['ens_transcript_code'] = 'ENSEMBL:' + df['ensembl_transcript_id']

# disease/MONDO for Neuroblastoma
df['disease_code'] = 'MONDO:0005072'

# cohort
df['cohort_code'] = 'KFCOHORT:SD-DYPMEHHF'

# study
df['study_code'] = 'KFSTUDY:KF-NBL-SOMATIC'

df

Unnamed: 0,aa_change,consequence,ensembl_gene_id,ensembl_transcript_id,feature_type,hgvsg,hgvsp,participant_id,rsnumber,sample_id,...,symbol.1,variant_class,vep_impact,zygosity,hgvsg_variant_code,ens_gene_code,ens_transcript_code,ens_protein_code,disease_code,cohort_code
0,p.Gly49/247Xaa,['frameshift'],ENSG00000188290,ENST00000428771,Transcript,chr1.g.999832del,ENSP00000393198.2:p.Arg50GlyfsTer60,PT_3YW2V4JK,,BS_SY6CCX4G,...,HES4,deletion,HIGH,HET,HGVSG:chr1.g.999832del,ENSEMBL:ENSG00000188290,ENSEMBL:ENST00000428771,ENSEMBL:ENST00000428771,MONDO:0005072,KFCOHORT:SD-DYPMEHHF
2,p.Gly49/247Xaa,['frameshift'],ENSG00000188290,ENST00000428771,Transcript,chr1.g.999832del,ENSP00000393198.2:p.Arg50GlyfsTer60,PT_3YW2V4JK,,BS_8313ZZ6E,...,HES4,deletion,HIGH,WT,HGVSG:chr1.g.999832del,ENSEMBL:ENSG00000188290,ENSEMBL:ENST00000428771,ENSEMBL:ENST00000428771,MONDO:0005072,KFCOHORT:SD-DYPMEHHF
9,p.Ser199/1291Ter,['stop gained'],ENSG00000142949,ENST00000412568,Transcript,chr1.g.43591872C>A,ENSP00000391764.2:p.Ser199Ter,PT_W50HSNE2,,BS_Z40644EA,...,PTPRF,SNV,HIGH,WT,HGVSG:chr1.g.43591872C>A,ENSEMBL:ENSG00000142949,ENSEMBL:ENST00000412568,ENSEMBL:ENST00000412568,MONDO:0005072,KFCOHORT:SD-DYPMEHHF
15,p.Ser188/1553Ter,['stop gained'],ENSG00000142949,ENST00000429895,Transcript,chr1.g.43591872C>A,ENSP00000408952.1:p.Ser188Ter,PT_W50HSNE2,,BS_Z40644EA,...,PTPRF,SNV,HIGH,WT,HGVSG:chr1.g.43591872C>A,ENSEMBL:ENSG00000142949,ENSEMBL:ENST00000429895,ENSEMBL:ENST00000429895,MONDO:0005072,KFCOHORT:SD-DYPMEHHF
25,p.Ser531/1907Ter,['stop gained'],ENSG00000142949,ENST00000359947,Transcript,chr1.g.43591872C>A,ENSP00000353030.4:p.Ser531Ter,PT_W50HSNE2,,BS_Z40644EA,...,PTPRF,SNV,HIGH,WT,HGVSG:chr1.g.43591872C>A,ENSEMBL:ENSG00000142949,ENSEMBL:ENST00000359947,ENSEMBL:ENST00000359947,MONDO:0005072,KFCOHORT:SD-DYPMEHHF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18223,p.Ser200/682Ter,['stop gained'],ENSG00000173681,XM_011545477.2,Transcript,chrX.g.19966092G>T,XP_011543779.1:p.Ser200Ter,PT_NBFP0BB7,rs1230094791,BS_54N90X5G,...,BCLAF3,SNV,HIGH,WT,HGVSG:chrX.g.19966092G>T,ENSEMBL:ENSG00000173681,ENSEMBL:XM_011545477.2,ENSEMBL:XM_011545477.2,MONDO:0005072,KFCOHORT:SD-DYPMEHHF
18224,p.Ser200/711Ter,['stop gained'],ENSG00000173681,NM_001367774.2,Transcript,chrX.g.19966092G>T,NP_001354703.1:p.Ser200Ter,PT_NBFP0BB7,rs1230094791,BS_54N90X5G,...,BCLAF3,SNV,HIGH,WT,HGVSG:chrX.g.19966092G>T,ENSEMBL:ENSG00000173681,ENSEMBL:NM_001367774.2,ENSEMBL:NM_001367774.2,MONDO:0005072,KFCOHORT:SD-DYPMEHHF
18225,p.Ser200/652Ter,['stop gained'],ENSG00000173681,XM_011545479.2,Transcript,chrX.g.19966092G>T,XP_011543781.1:p.Ser200Ter,PT_NBFP0BB7,rs1230094791,BS_54N90X5G,...,BCLAF3,SNV,HIGH,WT,HGVSG:chrX.g.19966092G>T,ENSEMBL:ENSG00000173681,ENSEMBL:XM_011545479.2,ENSEMBL:XM_011545479.2,MONDO:0005072,KFCOHORT:SD-DYPMEHHF
18226,p.Ser200/681Ter,['stop gained'],ENSG00000173681,XM_005274475.3,Transcript,chrX.g.19966092G>T,XP_005274532.1:p.Ser200Ter,PT_NBFP0BB7,rs1230094791,BS_54N90X5G,...,BCLAF3,SNV,HIGH,WT,HGVSG:chrX.g.19966092G>T,ENSEMBL:ENSG00000173681,ENSEMBL:XM_005274475.3,ENSEMBL:XM_005274475.3,MONDO:0005072,KFCOHORT:SD-DYPMEHHF


# Define edges

###  Variant - Gene

In [7]:
e_var_gene = df[['hgvsg_variant_code','ens_gene_code']].dropna().drop_duplicates().reset_index(drop=True)
e_var_gene.columns = ['subject','object']

e_var_gene['predicate'] = 'related_to_gene'
e_var_gene = e_var_gene[['subject','predicate','object']]

e_var_gene.sample(5)

Unnamed: 0,subject,predicate,object
1090,HGVSG:chr7.g.15686399C>A,related_to_gene,ENSEMBL:ENSG00000106511
1183,HGVSG:chr10.g.103999950T>C,related_to_gene,ENSEMBL:ENSG00000065613
1315,HGVSG:chr1.g.233258451C>A,related_to_gene,ENSEMBL:ENSG00000135749
173,HGVSG:chrX.g.139804599_139804600delinsAA,related_to_gene,ENSEMBL:ENSG00000101974
1293,HGVSG:chr14.g.99398980del,related_to_gene,ENSEMBL:ENSG00000183576


### Variant - Transcript

In [8]:
'''
e_var_trans = df[['hgvsg_variant_code','ens_transcript_code']].dropna().drop_duplicates().reset_index(drop=True)
e_var_trans.columns = ['subject','object']
e_var_trans['predicate'] = 'has_transcript'
e_var_trans = e_var_trans[['subject','predicate','object']]
e_var_trans.sample(5)
'''

"\ne_var_trans = df[['hgvsg_variant_code','ens_transcript_code']].dropna().drop_duplicates().reset_index(drop=True)\ne_var_trans.columns = ['subject','object']\ne_var_trans['predicate'] = 'has_transcript'\ne_var_trans = e_var_trans[['subject','predicate','object']]\ne_var_trans.sample(5)\n"

### Variant - Protein

In [9]:
# Some extra formatting needs to be done for the protein code...

# Drop rows where its NaN
df_prot = df.dropna(subset=['hgvsp'])

# DROP rows where the ENSP is not defined.
df_prot = df_prot[df_prot['hgvsp'].str.startswith('ENSP')]

df_prot['ens_protein_code'] = 'ENSEMBL:' + pd.Series([i.split('.')[0] for i in df_prot['hgvsp']])

e_var_prot = df_prot[['hgvsg_variant_code','ens_protein_code']]
e_var_prot.columns = ['subject','object']
e_var_prot['predicate'] = 'has_protein'
e_var_prot = e_var_prot[['subject','predicate','object']].dropna().drop_duplicates().reset_index(drop=True)
e_var_prot.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  e_var_prot['predicate'] = 'has_protein'


Unnamed: 0,subject,predicate,object
1260,HGVSG:chr17.g.7940210_7940211del,has_protein,ENSEMBL:ENSP00000484398
1863,HGVSG:chr5.g.126488825C>A,has_protein,ENSEMBL:ENSP00000426845
97,HGVSG:chr1.g.160823530dup,has_protein,ENSEMBL:ENSP00000289409
1276,HGVSG:chr5.g.37179363C>A,has_protein,ENSEMBL:ENSP00000474140
888,HGVSG:chr1.g.181710978_181710979del,has_protein,ENSEMBL:ENSP00000441158


### Variant - Cohort

In [9]:
e_var_cohort = df[['hgvsg_variant_code','cohort_code']].dropna().drop_duplicates().reset_index(drop=True)
e_var_cohort.columns = ['subject','object']
e_var_cohort['predicate'] = 'belongs_to_cohort'
e_var_cohort = e_var_cohort[['subject','predicate','object']]
e_var_cohort.sample(5)

Unnamed: 0,subject,predicate,object
42,HGVSG:chr12.g.27961975del,belongs_to_cohort,KFCOHORT:SD-DYPMEHHF
1585,HGVSG:chr10.g.80079028G>T,belongs_to_cohort,KFCOHORT:SD-DYPMEHHF
1391,HGVSG:chr10.g.122085864G>T,belongs_to_cohort,KFCOHORT:SD-DYPMEHHF
543,HGVSG:chr7.g.4861432G>A,belongs_to_cohort,KFCOHORT:SD-DYPMEHHF
1378,HGVSG:chr1.g.40238794C>A,belongs_to_cohort,KFCOHORT:SD-DYPMEHHF


### Transcript - Protein

In [11]:
'''
e_trans_prot = df_prot[['ens_transcript_code','ens_protein_code']].dropna().drop_duplicates().reset_index(drop=True)
e_trans_prot.columns = ['subject','object']
e_trans_prot['predicate'] = 'has_protein'
e_trans_prot = e_trans_prot[['subject','predicate','object']]
e_trans_prot.sample(5)
'''

"\ne_trans_prot = df_prot[['ens_transcript_code','ens_protein_code']].dropna().drop_duplicates().reset_index(drop=True)\ne_trans_prot.columns = ['subject','object']\ne_trans_prot['predicate'] = 'has_protein'\ne_trans_prot = e_trans_prot[['subject','predicate','object']]\ne_trans_prot.sample(5)\n"

### Study - Cohort

In [10]:
e_study_cohort = df[['study_code','cohort_code']].dropna().drop_duplicates().reset_index(drop=True)
e_study_cohort.columns = ['subject','object']

e_study_cohort['predicate'] = 'study_has_cohort'
e_study_cohort = e_study_cohort[['subject','predicate','object']]
e_study_cohort

Unnamed: 0,subject,predicate,object
0,KFSTUDY:KF-NBL-SOMATIC,study_has_cohort,KFCOHORT:SD-DYPMEHHF


### Variant - HSCLO

In [11]:
q = pl.scan_csv('/home/stearb/U24/data/HSCLO/OWLNETS_edgelist_HSCLO.tsv',separator='\t')
hsclo = q.select(['node_id','lowerbound','upperbound']).collect()
hsclo = hsclo.to_pandas()

chroms = np.unique([i.split(' ')[-1].split('.')[0] for i in hsclo.node_id.astype(str)])
chroms = [i.tolist() for i in chroms]  
chroms.remove('Human_Genome_hg38')
chroms.remove('MtDNA')
chroms

['chr1',
 'chr10',
 'chr11',
 'chr12',
 'chr13',
 'chr14',
 'chr15',
 'chr16',
 'chr17',
 'chr18',
 'chr19',
 'chr2',
 'chr20',
 'chr21',
 'chr22',
 'chr3',
 'chr4',
 'chr5',
 'chr6',
 'chr7',
 'chr8',
 'chr9',
 'chrX',
 'chrY']

In [12]:
e_var_hsclo_MASTER = list()

# Go by chromosome and merge each variant to the correct HSCLO term/code
# most of this is extracting the variant position from the hgvsg_variant_code col
# and saving it to the variant_position col. Then bin on this col.

for CHROM in chroms:
    
    print(CHROM)
    
    # search for 'chr1.' or 'chr2.' if CHROM is either 1 or 2. This prevents matching on 11,12,13, when matching on 'chr1'
    if CHROM == 'chr1': CHROM = 'chr1\.'
    elif CHROM == 'chr2': CHROM == 'chr2\.'
    
    # Select variant col which has the variant chromosomal position for just current CHROM
    dfChrom = df[df['hgvsg_variant_code'].str.contains(CHROM)][['hgvsg_variant_code']].dropna()
    
    # format to extract just numerical position 
    dfChrom['variant_position'] = \
          [i.split('.')[-1].replace('del','').replace('ins','')\
            .replace('A','').replace('C','').replace('T','').replace('G','')\
            .replace('>','') if 'inv' not in i or 'dup' not in i else i for i\
            in dfChrom['hgvsg_variant_code']]

    dfChrom['variant_position'] = dfChrom['variant_position'].replace('dup','',regex=True).replace('inv','',regex=True)
    dfChrom['variant_position'] = [i.split('_')[0] for i in dfChrom['variant_position']]
    dfChrom['variant_position'] = dfChrom['variant_position'].astype(int)
    dfChrom = dfChrom.drop_duplicates().reset_index(drop=True)
    
    # get hsclo codes for current CHROM
    hscloChrom = hsclo[hsclo['node_id'].str.contains(CHROM)].dropna()

    ###### do the binning
    dfChromBins = bin_column(pl.DataFrame(dfChrom),col2bin='variant_position').to_pandas()

    # make lowerbound col to merge in hsclo codes
    dfChromBins['lowerbound'] = [float(i.split(',')[0][1:]) for i in dfChromBins['bins']]

    # merge in hsclo codes 
    dfMerged = pd.merge(dfChromBins,hscloChrom,on='lowerbound',how='left')
    
    # this may introduce an error, if im matching on lowerbound only, as there are multiple resolution
    # levels, so upperbound must also be specified!
    # ...check that the variant_position is b/t lower and upperbound
    print(len(dfMerged) ,len(dfMerged[(dfMerged['variant_position'] > dfMerged['lowerbound']) &\
        (dfMerged['variant_position'] < dfMerged['upperbound'])]))
    
    #assert dfMerged.shape == dfMerged[(dfMerged['variant_position'] > dfMerged['lowerbound']) &\
    #    (dfMerged['variant_position'] < dfMerged['upperbound'])].shape
    
    e_var_hsclo = dfMerged[['hgvsg_variant_code','node_id']].drop_duplicates().dropna().reset_index(drop=True)
    e_var_hsclo.columns = ['subject','object']
    e_var_hsclo['predicate'] = 'has_location'
    e_var_hsclo = e_var_hsclo[['subject','predicate','object']]
    
    # e_var_hsclo is longer than dfChrom bc each variant is mapped to multiple HSCLO resolution levels
    #print(len(e_var_hsclo),len(dfChrom)) 
    
    if len(e_var_hsclo_MASTER) == 0:
        e_var_hsclo_MASTER = e_var_hsclo
    else:
        e_var_hsclo_MASTER = pd.concat([e_var_hsclo_MASTER,e_var_hsclo])
       
e_var_hsclo_MASTER = e_var_hsclo_MASTER.drop_duplicates().dropna().reset_index(drop=True)


chr1
194 194
chr10
72 72
chr11
105 104
chr12
83 83
chr13
37 37
chr14
54 54
chr15
64 64
chr16
76 76
chr17
94 94
chr18
32 32
chr19
98 98
chr2
591 589
chr20
38 37
chr21
20 20
chr22
39 39
chr3
109 108
chr4
83 82
chr5
86 86
chr6
83 83
chr7
72 72
chr8
50 50
chr9
73 73
chrX
75 75
chrY
3 3


# Save edges

In [13]:
edges_all = pd.concat([e_var_gene,e_var_prot,#e_var_trans,e_trans_prot,
                       e_var_cohort,e_study_cohort,
                       e_var_hsclo_MASTER])

edges_all = edges_all.drop_duplicates().dropna().reset_index(drop=True)

edges_all.to_csv('/mnt/isilon/opentargets/U24KG/data/owlnets_nodes_and_edges/NBL_somatic/OWLNETS_edgelist.txt',
             sep= "\t",index=False)


In [17]:
from collections import Counter

# check that there is only 1 colon, or 0 colons (for HSCLO)
print(Counter([len(i.split(':')) for i in edges_all['object']]))
Counter([len(i.split(':')) for i in edges_all['subject']])

Counter({2: 5101, 1: 2134})


Counter({2: 7235})

In [15]:
# check edges look good
e = pd.read_csv('/mnt/isilon/opentargets/U24KG/data/owlnets_nodes_and_edges/NBL_somatic/OWLNETS_edgelist.txt',
           sep='\t')
e

Unnamed: 0,subject,predicate,object
0,HGVSG:chr1.g.999832del,related_to_gene,ENSEMBL:ENSG00000188290
1,HGVSG:chr1.g.43591872C>A,related_to_gene,ENSEMBL:ENSG00000142949
2,HGVSG:chr11.g.63952438C>T,related_to_gene,ENSEMBL:ENSG00000110583
3,HGVSG:chr11.g.125102239C>A,related_to_gene,ENSEMBL:ENSG00000150433
4,HGVSG:chr13.g.51467537G>T,related_to_gene,ENSEMBL:ENSG00000236778
...,...,...,...
22901,HGVSG:chrX.g.53084722del,has_location,HSCLO chrX.53084001-53085000
22902,HGVSG:chrX.g.20151338C>A,has_location,HSCLO chrX.20151001-20152000
22903,HGVSG:chrY.g.13160138_13160139del,has_location,HSCLO chrY.13160001-13170000
22904,HGVSG:chrY.g.13160138_13160139del,has_location,HSCLO chrY.13160001-13161000


# Save nodes

In [14]:
nodes_all = pd.concat([edges_all['subject'],edges_all['object']])\
                          .drop_duplicates().dropna().reset_index(drop=True)

nodes_all = pd.DataFrame(nodes_all,columns=['node_id'])
nodes_all = fill_missing_cols(nodes_all)

# NO Need to save HSCLO node ids
nodes_all = nodes_all[~nodes_all['node_id'].str.startswith('HSCLO')]\
    .drop_duplicates().dropna(subset=['node_id']).reset_index(drop=True)

nodes_all.to_csv(
    '/mnt/isilon/opentargets/U24KG/data/owlnets_nodes_and_edges/NBL_somatic/OWLNETS_node_metadata.txt',
             sep= "\t",index=False)

In [28]:
# check nodes look good
n = pd.read_csv('/mnt/isilon/opentargets/U24KG/data/owlnets_nodes_and_edges/NBL_somatic/OWLNETS_node_metadata.txt',
           sep='\t')
n

Unnamed: 0,node_id,node_dbxrefs,node_synonyms,value,node_namespace,unit,node_label,node_definition,lowerbound,upperbound
0,HGVSG:chr1.g.999832del,,,,,,,,,
1,HGVSG:chr1.g.43591872C>A,,,,,,,,,
2,HGVSG:chr11.g.63952438C>T,,,,,,,,,
3,HGVSG:chr11.g.125102239C>A,,,,,,,,,
4,HGVSG:chr13.g.51467537G>T,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
17413,ENSEMBL:NM_001367774.2,,,,,,,,,
17414,ENSEMBL:XM_011545479.2,,,,,,,,,
17415,ENSEMBL:XM_005274475.3,,,,,,,,,
17416,ENSEMBL:XM_005274474.3,,,,,,,,,


In [23]:
from collections import Counter

In [25]:
Counter([i.split(':')[0] for i in n['node_id']])

Counter({'HSCLO chr1.999001-1000000': 1,
         'HSCLO chr1.43591001-43592000': 1,
         'HSCLO chr1.268001-269000': 1,
         'HSCLO chr1.39385001-39386000': 1,
         'HSCLO chr1.154251001-154252000': 1,
         'HSCLO chr1.160823001-160824000': 1,
         'HSCLO chr1.171270001-171280000': 1,
         'HSCLO chr1.171270001-171271000': 1,
         'HSCLO chr1.27349001-27350000': 1,
         'HSCLO chr1.91501001-91502000': 1,
         'HSCLO chr1.39357001-39358000': 1,
         'HSCLO chr1.63322001-63323000': 1,
         'HSCLO chr1.151070001-151080000': 1,
         'HSCLO chr1.151070001-151071000': 1,
         'HSCLO chr1.208045001-208046000': 1,
         'HSCLO chr1.248453001-248454000': 1,
         'HSCLO chr1.243165001-243166000': 1,
         'HSCLO chr1.197199001-197200000': 1,
         'HSCLO chr1.1321001-1322000': 1,
         'HSCLO chr1.157097001-157098000': 1,
         'HSCLO chr1.181034001-181035000': 1,
         'HSCLO chr1.78013001-78014000': 1,
         'HSCLO c