# ASXL1 analysis
Check individuals with heart failure for variants in the ASXL1 gene that are known mutations implicated in CH or in lymphoid and myeloid cancer.

In [40]:
import pandas as pd
import pickle
import numpy as np
import h5py

In [41]:
# Input configuration: set these before running the rest of the notebook.
# NOTE: query_gene_id is left blank on purpose and MUST be filled in; the
# assignment below is not valid Python until a value is supplied.
query_gene_id =  # REQUIRED: gene id of ASXL1 as assigned by the preprocessing pipeline
gt_filename = 'genotypes.h5'  # genotypes as returned by the DeepRVAT preprocessing pipeline
annotation_file = 'annotations.parquet'  # variant annotations as returned by the DeepRVAT annotation pipeline
known_variant_ids_file = 'known_ids.parquet'  # ids of variants that are known mutations implicated in CH or lymphoid and myeloid cancer
burden_file = "asxl1_burdens.parquet"  # ASXL1 burdens extracted for all individuals with heart failure


In [2]:
# Load the ASXL1 burdens of the heart-failure samples and rank them by burden.
# burden_file is set in the configuration cell at the top of the notebook
# (the original call passed no path at all and raised a TypeError).
b = pd.read_parquet(burden_file)  # burdens for samples with heart failure
# Sort with the highest burden first. The first reset_index() moves the old
# index into a column named 'index'; because that name is then taken, the
# second reset_index() stores the new 0..n-1 positions in 'level_0', which is
# kept as the 'rank' column while the old index is dropped.
b = (
    b.sort_values('ASXL1', ascending=False)
    .reset_index()
    .reset_index()
    .drop(columns='index')
    .rename(columns={'level_0': 'rank'})
)

In [4]:
# Variant annotations, indexed by variant id so that filtered rows can be
# turned directly into a list of ids.
annos = pd.read_parquet(annotation_file).set_index('id')

In [42]:
# Ids of the variants annotated to the query gene with UKB_MAF below 0.001.
# annos is indexed by variant id, so the index of the filtered frame is the
# id list we need.
vars_to_keep = annos.query('UKB_MAF < 0.001 & gene_id ==@query_gene_id').index
vars_to_keep = list(vars_to_keep)
# Display how many variants passed the filter.
len(vars_to_keep)

2380

In [13]:
# Read the genotype datasets and the sample ids into memory.
# gt_filename comes from the configuration cell at the top of the notebook;
# it is deliberately not re-assigned here so that a path changed there is
# not silently overridden.
# The `with` block closes the HDF5 handle once the datasets have been copied
# out ([:] materialises each dataset as a NumPy array).
with h5py.File(gt_filename, "r") as gt_file:
    variant_matrix = gt_file["variant_matrix"][:]
    genotype_matrix = gt_file["genotype_matrix"][:]
    # Sample ids are stored as bytes; decode them to str so they can be
    # compared with string sample ids later on.
    samples = np.array(
        [item.decode("utf-8") for item in gt_file["samples"][:]]
    )

In [8]:
# Ids of the known variants, collected into a set for set intersections
# and fast membership tests.
known_ids = set(pd.read_parquet(known_variant_ids_file)['id'])

In [39]:
len(known_ids)  # number of distinct known variant ids (known_ids is a set)

118

In [14]:
# Flag, for every sample in the burden table, whether it carries at least one
# variant that is both in the filtered gene variant list (vars_to_keep) and in
# the set of known variants (known_ids).
samples_oi = [int(i) for i in b['sample_id']]

# Map each sample id of the genotype file to its row in variant_matrix once,
# instead of scanning the whole `samples` array for every sample.
# setdefault keeps the FIRST row of a duplicated id, matching the previous
# np.where(...)[0][0] lookup.
sample_to_row = {}
for row, sample in enumerate(samples):
    sample_to_row.setdefault(str(sample), row)

# The variants of interest do not depend on the sample, so intersect once.
known_vars_oi = set(vars_to_keep).intersection(known_ids)

res_dict = {}
for sample_oi in samples_oi:
    sample_idx = sample_to_row.get(str(sample_oi))
    if sample_idx is None:
        # Previously this surfaced as an opaque IndexError from `[0][0]`.
        raise KeyError(
            f"sample {sample_oi} from the burden table is not present in "
            "the genotype file"
        )
    # True as soon as one variant id in the sample's row is a known variant
    # of interest.
    res_dict[sample_oi] = not known_vars_oi.isdisjoint(variant_matrix[sample_idx])

In [15]:
# Attach the per-sample flag to the burden table. No `on=` is given, so the
# merge joins on the column names the two frames share; how='left' keeps
# every row of b.
# NOTE(review): res_dict keys are Python ints; this assumes b['sample_id']
# has a compatible dtype -- confirm.
has_known_df = b.merge(
    pd.DataFrame(
        {'sample_id': res_dict.keys(), 'has_known_variant': res_dict.values()}
    ),
    how='left',
)


In [28]:
# Export the burdens together with the known-variant flag for plotting with R.
has_known_df.to_parquet(path="burdens_with_has_known.parquet")