In [None]:
import polars as pl
import os

In [None]:
# Path to the tab-separated file holding the variants to score.
# Replace it with the path to your own variant file.
input_path = "data/testing/testing_data.tsv"

## 1- Load and annotate variants

This first section loads and annotates the set of variants to be scored. The input file must contain at least the following columns:

chr: str, chromosome carrying the variant (1 to 22, X or Y)

pos: int, hg38 genomic position of the variant

ref: str, reference allele at the position

alt: str, alternative allele at the position

For testing purposes, if the NCBoost 2 feature file has not been downloaded, run the next cell (it defines `annotated_path` and `output_path`) and then go directly to section 2.


In [None]:
# Split the input path into its stem and extension, then build the paths of the
# two files this notebook writes next to the input: the annotated table and the
# scored table (both always written as .tsv).
name, extension = os.path.splitext(input_path)
annotated_path = name + '_annotated.tsv'
output_path = name + '_scored.tsv'

In [None]:
# Read the tab-separated variant table. Cells equal to 'NA' are loaded as nulls;
# 'chr' is forced to string (it mixes numeric chromosomes with X/Y) and 'pos'
# to integer.
variants = pl.read_csv(
    input_path,
    separator='\t',
    null_values='NA',
    schema_overrides={'chr': str, 'pos': int},
)

# Preview the first rows.
variants.head()

chr,pos,rsid,ref,alt,label
str,i64,str,str,str,i64
"""1""",7961859,"""null""","""C""","""G""",1
"""1""",11022737,"""rs80356744""","""T""","""C""",1
"""1""",11023351,"""rs387906334""","""G""","""A""",1
"""1""",11790916,"""rs777661576""","""C""","""T""",1
"""1""",13308295,"""rs566581137""","""C""","""A""",0


In [None]:
from src.ncboost_functions import add_ncboost_features

# Annotate the variants from the feature database under data/WG_annotated, then
# discard the 'NCBoost' column that comes back with the features; the score is
# recomputed in section 2.
# NOTE(review): presumably 'NCBoost' holds precomputed scores shipped with the
# feature file - confirm against add_ncboost_features.
variants = add_ncboost_features(
    variants,
    db_path='data/WG_annotated',
).drop('NCBoost')

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


Chromosomes:   0%|          | 0/24 [00:00<?, ?it/s]

In [None]:
# Persist the annotated variants as a tab-separated file with a header row, so
# that section 2 can be run from this file alone.
variants.write_csv(file=annotated_path, separator='\t', include_header=True)

## 2- Score Variants

In [None]:
# Reload the annotated variants written by section 1. 'NA' cells become nulls
# and 'chr' is kept as a string; every other dtype is inferred by polars.
variants = pl.read_csv(
    annotated_path,
    separator='\t',
    null_values='NA',
    schema_overrides={'chr': str},
)
# Preview the first rows.
variants.head()

chr,pos,rsid,ref,alt,label,region,closest_gene_name,closest_gene_ENSG,paml_dnds,slr_dnds,paml_pvalue,paml_bh,gene_age,pLI,zscore_mis,zscore_syn,loeuf,GDI,ncRVIS,ncGERP,RVIS_percentile,pcGERP,CADD_ENSG,CADD_GeneName,GC,CpG,priPhCons,mamPhCons,verPhCons,priPhyloP,mamPhyloP,verPhyloP,bStatistic,GerpRS,GerpRSpval,GerpN,GerpS,Aparent2,ZooPriPhyloP,ZooVerPhyloP,ZooRoCC,ZooUCE,Roulette-FILTER,Roulette-MR,Roulette-AR,CADD,CADD_phred,CDTS,mean_MAF,mean_MAF_afr,mean_MAF_ami,mean_MAF_amr,mean_MAF_asj,mean_MAF_eas,mean_MAF_fin,mean_MAF_mid,mean_MAF_nfe,mean_MAF_sas,ReMM,SpliceAI,UTR3,UTR5,downstream,intergenic,intronic,upstream,partition
str,i64,str,str,str,i64,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64
"""1""",7961859,"""null""","""C""","""G""",1,"""intronic""","""PARK7""","""ENSG00000116288""",0.06659,0.06503,0.2592,0.5241,0.0,0.0013008,0.36438,-0.51419,1.025,67.74927,0.085881,-1.289033,60.090823,45.404095,,,0.781,0.253,0.69,0.0,0.001,0.399,-0.152,-0.334,824.0,,,3.24,0.217,,0.091,0.0,,,"""TFBS""",0.083,0.094,1.042958,10.81,-11.9677,0.004686,0.012429,0.123988,0.015091,0.109764,0.012163,0.081921,0.074301,0.008839,0.025181,0.930087,0.0,0,0,0,0,1,0,4
"""1""",11022737,"""rs80356744""","""T""","""C""",1,"""UTR3""","""TARDBP""","""ENSG00000120948""",0.0279,0.02766,0.006474,0.03594,0.0,1.0,5.3187,0.24967,0.229,2.30437,-0.680325,3.245378,27.418023,13.653017,"""ENSG00000009724""","""MASP2""",0.298,0.0,0.979,1.0,1.0,0.467,3.166,5.032,848.0,9701.77,0.0,6.12,6.12,0.08907,0.999,2.411,,,"""low""",0.02,,1.400023,13.57,-5.50229,0.000184,0.000249,0.0252193,0.0008,0.007777,0.001048,0.009617,0.005102,0.000474,0.000673,0.972025,0.0,1,0,0,0,0,0,5
"""1""",11023351,"""rs387906334""","""G""","""A""",1,"""UTR3""","""TARDBP""","""ENSG00000120948""",0.0279,0.02766,0.006474,0.03594,0.0,1.0,5.3187,0.24967,0.229,2.30437,-0.680325,3.245378,27.418023,13.653017,"""ENSG00000009724""","""MASP2""",0.464,0.12,0.997,1.0,1.0,0.595,0.926,2.587,849.0,9701.77,0.0,6.2,2.63,,1.0,4.134,248.0,,"""high""",1.966,,2.002378,16.76,-12.9149,0.000168,0.000224,0.0252193,0.000746,0.007489,0.000272,0.007299,0.003401,0.000435,0.000442,0.914964,0.02,1,0,0,0,0,0,5
"""1""",11790916,"""rs777661576""","""C""","""T""",1,"""intronic""","""MTHFR""","""ENSG00000177000""",0.09733,0.09315,0.3499,0.6468,0.0,1.0506e-11,2.024,0.35912,0.859,4524.99996,-0.072474,-0.788072,60.710073,36.783405,"""ENSG00000177000""","""MTHFR""",0.57,0.053,0.004,0.0,0.0,0.407,-0.384,-0.372,900.0,,,1.46,-0.585,,0.001,0.43,,,"""TFBS""",0.073,,0.091441,1.519,1.45327,0.00197,0.005657,0.038807,0.0062,0.028183,0.011548,0.018624,0.043537,0.003341,0.015078,0.72465,0.91,0,0,0,0,1,0,6
"""1""",13308295,"""rs566581137""","""C""","""A""",0,"""intronic""","""PRAMEF33""","""ENSG00000237700""",,,,,,0.052995,1.9936,1.5978,1.53,,,,,,"""ENSG00000237700""","""PRAMEF33""",0.483,0.0,0.011,0.0,0.0,0.336,-0.124,-0.241,,,,2.12,0.17,,0.071,0.333,,,"""low""",0.062,,0.328103,3.582,,0.007554,0.012117,0.078504,0.014881,0.028994,0.017286,0.02967,0.057277,0.013875,0.022553,0.341683,,0,0,0,0,1,0,10


In [6]:
from src.ncboost_functions import ncboost_score

# Name of the model set handed to ncboost_score (presumably the folder holding
# the trained models - confirm against ncboost_score). Defined once here and
# passed through, instead of repeating the literal in the call.
model_folder = 'ncboost_models'

# Score the annotated variants; the result carries an added 'NCBoost' column.
# NOTE(review): the displayed result also gains a 'partition_right' column
# duplicating 'partition' - looks like a join artifact inside ncboost_score;
# confirm whether it should be dropped before the table is written out.
variants = ncboost_score(variants, model_name=model_folder)
variants.head()

chr,pos,rsid,ref,alt,label,region,closest_gene_name,closest_gene_ENSG,paml_dnds,slr_dnds,paml_pvalue,paml_bh,gene_age,pLI,zscore_mis,zscore_syn,loeuf,GDI,ncRVIS,ncGERP,RVIS_percentile,pcGERP,CADD_ENSG,CADD_GeneName,GC,CpG,priPhCons,mamPhCons,verPhCons,priPhyloP,mamPhyloP,verPhyloP,bStatistic,GerpRS,GerpRSpval,GerpN,GerpS,Aparent2,ZooPriPhyloP,ZooVerPhyloP,ZooRoCC,ZooUCE,Roulette-FILTER,Roulette-MR,Roulette-AR,CADD,CADD_phred,CDTS,mean_MAF,mean_MAF_afr,mean_MAF_ami,mean_MAF_amr,mean_MAF_asj,mean_MAF_eas,mean_MAF_fin,mean_MAF_mid,mean_MAF_nfe,mean_MAF_sas,ReMM,SpliceAI,UTR3,UTR5,downstream,intergenic,intronic,upstream,partition,partition_right,NCBoost
str,i64,str,str,str,i64,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,f64
"""1""",7961859,"""null""","""C""","""G""",1,"""intronic""","""PARK7""","""ENSG00000116288""",0.06659,0.06503,0.2592,0.5241,0.0,0.0013008,0.36438,-0.51419,1.025,67.74927,0.085881,-1.289033,60.090823,45.404095,,,0.781,0.253,0.69,0.0,0.001,0.399,-0.152,-0.334,824.0,,,3.24,0.217,,0.091,0.0,,,"""TFBS""",0.083,0.094,1.042958,10.81,-11.9677,0.004686,0.012429,0.123988,0.015091,0.109764,0.012163,0.081921,0.074301,0.008839,0.025181,0.930087,0.0,0,0,0,0,1,0,4,4,0.031678
"""1""",11022737,"""rs80356744""","""T""","""C""",1,"""UTR3""","""TARDBP""","""ENSG00000120948""",0.0279,0.02766,0.006474,0.03594,0.0,1.0,5.3187,0.24967,0.229,2.30437,-0.680325,3.245378,27.418023,13.653017,"""ENSG00000009724""","""MASP2""",0.298,0.0,0.979,1.0,1.0,0.467,3.166,5.032,848.0,9701.77,0.0,6.12,6.12,0.08907,0.999,2.411,,,"""low""",0.02,,1.400023,13.57,-5.50229,0.000184,0.000249,0.0252193,0.0008,0.007777,0.001048,0.009617,0.005102,0.000474,0.000673,0.972025,0.0,1,0,0,0,0,0,5,5,0.115872
"""1""",11023351,"""rs387906334""","""G""","""A""",1,"""UTR3""","""TARDBP""","""ENSG00000120948""",0.0279,0.02766,0.006474,0.03594,0.0,1.0,5.3187,0.24967,0.229,2.30437,-0.680325,3.245378,27.418023,13.653017,"""ENSG00000009724""","""MASP2""",0.464,0.12,0.997,1.0,1.0,0.595,0.926,2.587,849.0,9701.77,0.0,6.2,2.63,,1.0,4.134,248.0,,"""high""",1.966,,2.002378,16.76,-12.9149,0.000168,0.000224,0.0252193,0.000746,0.007489,0.000272,0.007299,0.003401,0.000435,0.000442,0.914964,0.02,1,0,0,0,0,0,5,5,0.174726
"""1""",11790916,"""rs777661576""","""C""","""T""",1,"""intronic""","""MTHFR""","""ENSG00000177000""",0.09733,0.09315,0.3499,0.6468,0.0,1.0506e-11,2.024,0.35912,0.859,4524.99996,-0.072474,-0.788072,60.710073,36.783405,"""ENSG00000177000""","""MTHFR""",0.57,0.053,0.004,0.0,0.0,0.407,-0.384,-0.372,900.0,,,1.46,-0.585,,0.001,0.43,,,"""TFBS""",0.073,,0.091441,1.519,1.45327,0.00197,0.005657,0.038807,0.0062,0.028183,0.011548,0.018624,0.043537,0.003341,0.015078,0.72465,0.91,0,0,0,0,1,0,6,6,0.864655
"""1""",13308295,"""rs566581137""","""C""","""A""",0,"""intronic""","""PRAMEF33""","""ENSG00000237700""",,,,,,0.052995,1.9936,1.5978,1.53,,,,,,"""ENSG00000237700""","""PRAMEF33""",0.483,0.0,0.011,0.0,0.0,0.336,-0.124,-0.241,,,,2.12,0.17,,0.071,0.333,,,"""low""",0.062,,0.328103,3.582,,0.007554,0.012117,0.078504,0.014881,0.028994,0.017286,0.02967,0.057277,0.013875,0.022553,0.341683,,0,0,0,0,1,0,10,10,0.00498


In [None]:
# Save the scored variants as a tab-separated file with a header row.
variants.write_csv(file=output_path, separator='\t', include_header=True)