# Aggregated .maf files from CBTN

`scp /Users/stearb/Downloads/CBTN_somatic_variants_VEP_HIGH_July29.csv stearb-hpc9.research.chop.edu:/home/stearb/U24/data/somatic_variants/`

In [1]:
import pandas as pd
import polars as pl
import numpy as np

def fill_missing_cols(df):
    if 'node_id' not in df.columns:
        raise ValueError('Must have at least a "node_id" column.')
    
    all_cols = set([ 'node_label', 'node_synonyms', 'node_dbxrefs',
            'node_definition','node_namespace','value','lowerbound','upperbound','unit'])
    missing_cols = list(all_cols - set(df.columns))
    nan_cols_df = pd.DataFrame(np.full([len(df), len(missing_cols)], np.nan),columns=missing_cols)

    if isinstance(df, pd.DataFrame):
        nan_cols_df.index = df.index
        return pd.concat([df,nan_cols_df],axis=1)
    elif isinstance(df, pl.DataFrame):
        return pl.concat([df,pl.from_pandas(nan_cols_df)],how='horizontal') # no index for polars
    else:
        raise ValueError(f'Must Pass either a pandas DataFrame or a polars DataFrame but recieved "{type(df)}".')
    

def bin_column(df, col2bin=''):
    '''works with polars df'''
    
    '''tpm_bins = list([0.0000000,7e-4,8e-4,9e-4]) + list(np.linspace(1e-3,9e-3,9)) + \
               list(np.round(np.linspace(1e-2,9e-2,9),2)) + list(np.round(np.linspace(.1,1,10),2)) + \
               list(np.linspace(2,100,99)) + list(np.arange(100,1100,100)[1:]) +  \
                list(np.arange(2000,11000,1000)) + list(np.arange(20_000,110_000,10_000)) + [300_000] '''
    
    bins = np.unique(hscloChrom['lowerbound'].values) #hscloChrom['lowerbound'].values
    # check that none of the disease expression values are out of range, ie larger than the max bin value
    # use for pandas
    #if np.sum(df['Disease_MeanTpm'] > np.max(tpm_bins)) == 0:
        
    # use for polars
    if len(df.filter(df[col2bin] > np.max(bins))) == 0:
        df_binned = df.with_columns(  df[col2bin].cut(breaks=bins).alias('bins')  )
        # check that order has been maintained #assert pl.all(df[col2bin] == df_binned[col2bin])
    else:
        print('OUT OF RANGE ERROR!')
        assert False 
    return df_binned

In [2]:
df = pd.read_csv('/home/stearb/U24/data/somatic_variants/CBTN_somatic_variants_VEP_HIGH_Aug2.csv')
df.head()

Unnamed: 0,HGVSg,Hugo_Symbol,HGVSp,dbSNP_RS,Consequence,SYMBOL_SOURCE,HGNC_ID,BIOTYPE,IMPACT,gnomad_3_1_1_AF,gnomad_3_1_1_AF_popmax,gnomad_3_1_1_AF_non_cancer_popmax,gnomad_3_1_1_AF_non_cancer_all_popmax,Matched_Norm_Sample_Barcode,Tumor_Sample_Barcode,VARIANT_CLASS,path
0,chr7:g.138981269T>G,KIAA1549,p.Met1?,novel,start_lost,HGNC,HGNC:22219,protein_coding,HIGH,2.75e-05,3.05e-05,2.45e-05,0.000123,BS_65FNRFMM,BS_QWWYFRGZ,SNV,/sbgenomics/project-files/CBTN_somatic/002daa4...
1,chrX:g.45069977_45069978del,KDM6A,p.Ser775ArgfsTer15,novel,frameshift_variant,HGNC,HGNC:12637,protein_coding,HIGH,.,.,.,.,BS_65FNRFMM,BS_QWWYFRGZ,deletion,/sbgenomics/project-files/CBTN_somatic/002daa4...
2,chr1:g.153767777_153767786dup,INTS3,p.Val735GlyfsTer21,novel,frameshift_variant,HGNC,HGNC:26153,protein_coding,HIGH,.,.,.,.,BS_KX3258Y6,BS_MVQYCQMP,insertion,/sbgenomics/project-files/CBTN_somatic/0046a77...
3,chr2:g.11216160_11216161delinsAA,ROCK2,p.Glu486_Glu487delinsAspTer,novel,"stop_gained,splice_region_variant",HGNC,HGNC:10252,protein_coding,HIGH,.,.,.,.,BS_KX3258Y6,BS_MVQYCQMP,substitution,/sbgenomics/project-files/CBTN_somatic/0046a77...
4,chr2:g.95927379A>G,ANKRD36C,,rs1230259790,splice_donor_variant,HGNC,HGNC:32946,protein_coding,HIGH,.,.,.,.,BS_KX3258Y6,BS_MVQYCQMP,SNV,/sbgenomics/project-files/CBTN_somatic/0046a77...


In [3]:
hgnc_master = pd.read_csv('/home/stearb/U24/data/helper_files/hgnc_master.txt',sep='\t')

# merge in ensembl ids
df = pd.merge(df,hgnc_master[['hgnc_id','ensembl_gene_id']].rename({'hgnc_id':'HGNC_ID'},axis=1))

  hgnc_master = pd.read_csv('/home/stearb/U24/data/helper_files/hgnc_master.txt',sep='\t')


In [4]:
# need to get patiant specific mondo codes and also biospecimen to patient id mappings
meta = pd.read_csv('/home/stearb/U24/data/somatic_variants/kidsfirst-biospecimen-table-2025-07-29.tsv',sep='\t')

dfmeta = meta[['Participant ID','Sample ID','Histological Diagnosis (MONDO)']]


dfmeta['disease_code'] = [i.split(' ')[-1].replace('(','').replace(')','') for 
                                           i in dfmeta['Histological Diagnosis (MONDO)']]

dfmeta.drop('Histological Diagnosis (MONDO)',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfmeta['disease_code'] = [i.split(' ')[-1].replace('(','').replace(')','') for
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfmeta.drop('Histological Diagnosis (MONDO)',axis=1,inplace=True)


In [5]:
# merge in MONDO code
df = pd.merge(df.rename({'Tumor_Sample_Barcode':'Sample ID'},axis=1)
        , dfmeta, how='left', on= 'Sample ID') 

# need variant type

# ****No ENSEMBL protein IDs for the CBTN dataset****

In [6]:
#df = pd.merge(df,hgnc_master[['hgnc_id','ensembl_gene_id']].rename({'hgnc_id':'HGNC_ID'},axis=1))

In [7]:
# variant
df['HGVSg'] = df['HGVSg'].replace({':': '.'},regex=True) 
df['hgvsg_variant_code'] = 'HGVSG:' + df['HGVSg'] 

# chr1:g.26697414G>A
#df['HGVSg'] = 'HGVSG:' df['Chromosome'] + ':g.' + df['Start_Position'] + df['Reference_Allele'] + '>' \
#                                    df['Tumor_Seq_Allele2']


# ensembl gene
df['ens_gene_code'] = 'ENSEMBL:' + df['ensembl_gene_id']

# disease/MONDO code column taken care of when we merged them in above


# cohort
df['cohort_code'] = 'KFCOHORT:SD-BHJXBDQK'

# study
df['study_code'] = 'KFSTUDY:KF-CBTN-SOMATIC'


In [8]:
e_var_gene = df[['hgvsg_variant_code','ens_gene_code']].dropna().drop_duplicates().reset_index(drop=True)
e_var_gene.columns = ['subject','object']

e_var_gene['predicate'] = 'related_to_gene'
e_var_gene = e_var_gene[['subject','predicate','object']]

e_var_gene.sample(5)

Unnamed: 0,subject,predicate,object
5729,HGVSG:chr11.g.17501945G>T,related_to_gene,ENSEMBL:ENSG00000006611
4540,HGVSG:chr3.g.58831780del,related_to_gene,ENSEMBL:ENSG00000163689
245,HGVSG:chr1.g.205727749G>A,related_to_gene,ENSEMBL:ENSG00000069275
6891,HGVSG:chr8.g.142544240del,related_to_gene,ENSEMBL:ENSG00000181790
4328,HGVSG:chr6.g.13486843_13486844del,related_to_gene,ENSEMBL:ENSG00000145990


In [9]:
e_var_cohort = df[['hgvsg_variant_code','cohort_code']].dropna().drop_duplicates().reset_index(drop=True)
e_var_cohort.columns = ['subject','object']
e_var_cohort['predicate'] = 'belongs_to_cohort'
e_var_cohort = e_var_cohort[['subject','predicate','object']]
e_var_cohort.sample(5)

Unnamed: 0,subject,predicate,object
1528,HGVSG:chr3.g.89450218del,belongs_to_cohort,KFCOHORT:SD-BHJXBDQK
6828,HGVSG:chr21.g.42732068A>G,belongs_to_cohort,KFCOHORT:SD-BHJXBDQK
6782,HGVSG:chr1.g.180178825_180178835del,belongs_to_cohort,KFCOHORT:SD-BHJXBDQK
5476,HGVSG:chr6.g.22290254C>A,belongs_to_cohort,KFCOHORT:SD-BHJXBDQK
2111,HGVSG:chr22.g.32498233_32498234del,belongs_to_cohort,KFCOHORT:SD-BHJXBDQK


In [10]:
e_study_cohort = df[['study_code','cohort_code']].dropna().drop_duplicates().reset_index(drop=True)
e_study_cohort.columns = ['subject','object']

e_study_cohort['predicate'] = 'study_has_cohort'
e_study_cohort = e_study_cohort[['subject','predicate','object']]
e_study_cohort

Unnamed: 0,subject,predicate,object
0,KFSTUDY:KF-CBTN-SOMATIC,study_has_cohort,KFCOHORT:SD-BHJXBDQK


In [11]:
q = pl.scan_csv('/home/stearb/U24/data/HSCLO/OWLNETS_edgelist_HSCLO.tsv',separator='\t')
hsclo = q.select(['node_id','lowerbound','upperbound']).collect()
hsclo = hsclo.to_pandas()

chroms = np.unique([i.split(' ')[-1].split('.')[0] for i in hsclo.node_id.astype(str)])
chroms = [i.tolist() for i in chroms]  
chroms.remove('Human_Genome_hg38')
chroms.remove('MtDNA')
chroms

['chr1',
 'chr10',
 'chr11',
 'chr12',
 'chr13',
 'chr14',
 'chr15',
 'chr16',
 'chr17',
 'chr18',
 'chr19',
 'chr2',
 'chr20',
 'chr21',
 'chr22',
 'chr3',
 'chr4',
 'chr5',
 'chr6',
 'chr7',
 'chr8',
 'chr9',
 'chrX',
 'chrY']

In [12]:
e_var_hsclo_MASTER = list()

# Go by chromosome and merge each variant to the correct HSCLO term/code
# most of this is extracting the variant position from the hgvsg_variant_code col
# and saving it to the variant_position col. Then bin on this col.

for CHROM in chroms:
    
    print(CHROM)
    
    # search for 'chr1.' or 'chr2.' if CHROM is either 1 or 2. This prevents matching on 11,12,13, when matching on 'chr1'
    if CHROM == 'chr1': CHROM = 'chr1\.'
    elif CHROM == 'chr2': CHROM == 'chr2\.'
    
    # Select variant col which has the variant chromosomal position for just current CHROM
    dfChrom = df[df['hgvsg_variant_code'].str.contains(CHROM)][['hgvsg_variant_code']].dropna()
    
    # format to extract just numerical position 
    dfChrom['variant_position'] = \
          [i.split('.')[-1].replace('del','').replace('ins','')\
            .replace('A','').replace('C','').replace('T','').replace('G','')\
            .replace('>','') if 'inv' not in i or 'dup' not in i else i for i\
            in dfChrom['hgvsg_variant_code']]

    dfChrom['variant_position'] = dfChrom['variant_position'].replace('dup','',regex=True).replace('inv','',regex=True)
    dfChrom['variant_position'] = [i.split('_')[0] for i in dfChrom['variant_position']]
    dfChrom['variant_position'] = dfChrom['variant_position'].astype(int)
    dfChrom = dfChrom.drop_duplicates().reset_index(drop=True)
    
    # get hsclo codes for current CHROM
    hscloChrom = hsclo[hsclo['node_id'].str.contains(CHROM)].dropna()

    ###### do the binning
    dfChromBins = bin_column(pl.DataFrame(dfChrom),col2bin='variant_position').to_pandas()

    # make lowerbound col to merge in hsclo codes
    dfChromBins['lowerbound'] = [float(i.split(',')[0][1:]) for i in dfChromBins['bins']]

    # merge in hsclo codes 
    dfMerged = pd.merge(dfChromBins,hscloChrom,on='lowerbound',how='left')
    
    # this may introduce an error, if im matching on lowerbound only, as there are multiple resolution
    # levels, so upperbound must also be specified!
    # ...check that the variant_position is b/t lower and upperbound
    print(len(dfMerged) ,len(dfMerged[(dfMerged['variant_position'] > dfMerged['lowerbound']) &\
        (dfMerged['variant_position'] < dfMerged['upperbound'])]))
    
    #assert dfMerged.shape == dfMerged[(dfMerged['variant_position'] > dfMerged['lowerbound']) &\
    #    (dfMerged['variant_position'] < dfMerged['upperbound'])].shape
    
    e_var_hsclo = dfMerged[['hgvsg_variant_code','node_id']].drop_duplicates().dropna().reset_index(drop=True)
    e_var_hsclo.columns = ['subject','object']
    e_var_hsclo['predicate'] = 'has_location'
    e_var_hsclo = e_var_hsclo[['subject','predicate','object']]
    
    # e_var_hsclo is longer than dfChrom bc each variant is mapped to multiple HSCLO resolution levels
    #print(len(e_var_hsclo),len(dfChrom)) 
    
    if len(e_var_hsclo_MASTER) == 0:
        e_var_hsclo_MASTER = e_var_hsclo
    else:
        e_var_hsclo_MASTER = pd.concat([e_var_hsclo_MASTER,e_var_hsclo])
       
e_var_hsclo_MASTER = e_var_hsclo_MASTER.drop_duplicates().dropna().reset_index(drop=True)


chr1
905 904
chr10
398 398
chr11
512 512
chr12
537 535
chr13
150 149
chr14
301 301
chr15
331 331
chr16
338 335
chr17
648 648
chr18
130 129
chr19
602 599
chr2
3244 3243
chr20
225 225
chr21
89 89
chr22
232 232
chr3
588 588
chr4
434 434
chr5
413 412
chr6
503 500
chr7
496 495
chr8
331 329
chr9
378 377
chrX
358 358
chrY
6 6


In [39]:
!mkdir /mnt/isilon/opentargets/U24KG/data/owlnets_nodes_and_edges/CBTN_somatic

In [13]:
edges_all = pd.concat([e_var_gene,e_var_cohort,e_study_cohort,
                       e_var_hsclo_MASTER])   # e_var_prot,e_var_trans, e_trans_prot

edges_all = edges_all.drop_duplicates().dropna().reset_index(drop=True)

edges_all.to_csv('/mnt/isilon/opentargets/U24KG/data/owlnets_nodes_and_edges/CBTN_somatic/OWLNETS_edgelist.txt',
             sep= "\t",index=False)

In [14]:
nodes_all = pd.concat([edges_all['subject'],edges_all['object']])\
                          .drop_duplicates().dropna().reset_index(drop=True)

nodes_all = pd.DataFrame(nodes_all,columns=['node_id'])
nodes_all = fill_missing_cols(nodes_all)

# NO Need to save HSCLO node ids
nodes_all = nodes_all[~nodes_all['node_id'].str.startswith('HSCLO')]\
    .drop_duplicates().dropna(subset=['node_id']).reset_index(drop=True)

nodes_all

Unnamed: 0,node_id,value,lowerbound,node_label,node_dbxrefs,node_synonyms,unit,node_definition,upperbound,node_namespace
0,HGVSG:chr7.g.138981269T>G,,,,,,,,,
1,HGVSG:chrX.g.45069977_45069978del,,,,,,,,,
2,HGVSG:chr1.g.153767777_153767786dup,,,,,,,,,
3,HGVSG:chr2.g.11216160_11216161delinsAA,,,,,,,,,
4,HGVSG:chr2.g.95927379A>G,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
14678,ENSEMBL:ENSG00000061987,,,,,,,,,
14679,ENSEMBL:ENSG00000165192,,,,,,,,,
14680,ENSEMBL:ENSG00000280725,,,,,,,,,
14681,ENSEMBL:ENSG00000132394,,,,,,,,,


In [15]:

nodes_all.to_csv(
    '/mnt/isilon/opentargets/U24KG/data/owlnets_nodes_and_edges/CBTN_somatic/OWLNETS_node_metadata.txt',
             sep= "\t",index=False)