In [16]:
import anndata as ad
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

from lab_scripts.utils import utils
utils.change_directory_to_repo()

import pybedtools

In [17]:
# Genes we need to predict
rna_data = ad.read_h5ad('data/official/common/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.manual_formatting.output_rna.h5ad')
usefull_genes = rna_data.var['gene_ids']
usefull_genes = set(usefull_genes.tolist())
del rna_data
usefull_genes

{'ENSG00000129911',
 'ENSG00000105968',
 'ENSG00000170248',
 'ENSG00000163820',
 'ENSG00000011451',
 'ENSG00000150316',
 'ENSG00000084207',
 'ENSG00000099341',
 'ENSG00000232912',
 'ENSG00000164889',
 'ENSG00000172354',
 'ENSG00000163110',
 'ENSG00000137274',
 'ENSG00000178562',
 'ENSG00000120837',
 'ENSG00000140905',
 'ENSG00000137843',
 'ENSG00000160593',
 'ENSG00000072694',
 'ENSG00000250155',
 'ENSG00000124299',
 'ENSG00000164494',
 'ENSG00000176018',
 'ENSG00000090615',
 'ENSG00000133069',
 'ENSG00000108639',
 'ENSG00000064961',
 'ENSG00000149346',
 'ENSG00000167107',
 'ENSG00000172172',
 'ENSG00000135074',
 'ENSG00000235481',
 'ENSG00000198722',
 'ENSG00000264522',
 'ENSG00000152193',
 'ENSG00000102882',
 'ENSG00000245534',
 'ENSG00000071575',
 'ENSG00000187605',
 'ENSG00000198625',
 'ENSG00000225484',
 'ENSG00000159479',
 'ENSG00000034533',
 'ENSG00000198805',
 'ENSG00000158869',
 'ENSG00000126088',
 'ENSG00000127824',
 'ENSG00000255310',
 'ENSG00000189233',
 'ENSG00000165312',


In [18]:
# Parse atac regions
def parse_atac_adata(adata):
    intervals = []
    for line in adata.var.index.tolist():
        line = line.split('-')
        chromosome, start, end = line
        # Skip all regions out of chromosomes
        if not chromosome.startswith('chr'):
            continue
        start = int(start)
        end = int(end)
        intervals.append((chromosome, start, end))
    return pybedtools.BedTool(intervals)

atac_data = ad.read_h5ad('data/official/common/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.manual_formatting.output_mod2.h5ad')
atac_regions = parse_atac_adata(atac_data)
del atac_data
atac_regions.head(5)

chr1	9776	10668
 chr1	180726	181005
 chr1	181117	181803
 chr1	191133	192055
 chr1	267562	268456
 

In [19]:
# Parse gene annotation
def parse_gtf(gtf_file, upstream):
    intervals = []
    with open(gtf_file) as f:
        for line in f:
            if not line.startswith('##'):
                line = line.rstrip('\n').split('\t')

                # Skip features out of chromosomes
                chromosome = line[0]
                if not chromosome.startswith('chr'):
                    continue

                # Skip not genes
                feature = line[2]
                if feature != 'gene':
                    continue

                # Take gene_id (ENSG0000...) 
                annotation = line[-1].split(';')[0]
                gene_id = re.search(r'gene_id \"(.+)\..+\"', annotation).groups(1)[0]

                # Substract 1 because gtf file starts intervals with 1
                start = int(line[3]) - 1
                end = int(line[4]) - 1
                
                strand = line[6]
                if strand == '-':
                    end += upstream
                else:
                    start = max(0, start - upstream)
                
                intervals.append((chromosome, start, end, feature, gene_id))
    return pybedtools.BedTool(intervals)

annotation_path = 'data/annotation.gtf'
genes = parse_gtf(annotation_path, 2000).sort()
genes.head()

chr1	9868	14408	gene	ENSG00000223972
 chr1	14403	31569	gene	ENSG00000227232
 chr1	17368	19435	gene	ENSG00000278267
 chr1	27553	31108	gene	ENSG00000243485
 chr1	28365	30502	gene	ENSG00000284332
 chr1	34553	38080	gene	ENSG00000237613
 chr1	50472	53311	gene	ENSG00000268020
 chr1	55597	64115	gene	ENSG00000240361
 chr1	63418	71584	gene	ENSG00000186092
 chr1	89294	135722	gene	ENSG00000238009
 

In [20]:
# Keep only genes that we are going to predict
selected_genes = genes.filter(lambda interval: interval[-1] in usefull_genes).saveas()
selected_genes.head(5)

chr1	141473	175861	gene	ENSG00000241860
 chr1	776746	810064	gene	ENSG00000237491
 chr1	823137	868201	gene	ENSG00000228794
 chr1	944202	961308	gene	ENSG00000188976
 chr1	958583	965718	gene	ENSG00000187961
 

In [21]:
# Find closest genes
closest_genes = atac_regions.intersect(selected_genes, wa=True, wb=True).sort()

In [22]:
closest_genes.head(5)

chr1	778276	779191	chr1	776746	810064	gene	ENSG00000237491
 chr1	822804	823597	chr1	823137	868201	gene	ENSG00000228794
 chr1	827067	827948	chr1	823137	868201	gene	ENSG00000228794
 chr1	838003	838737	chr1	823137	868201	gene	ENSG00000228794
 chr1	841079	841916	chr1	823137	868201	gene	ENSG00000228794
 

In [23]:
# Create mapping from region name to gene name
region_to_gene = {}
for interval in closest_genes:
    region = interval[0] + '-' + str(interval[1]) + '-' + str(interval[2])
    gene = interval[-1]
    region_to_gene[region] = gene

In [24]:
# Select only regions that are mapped to a gene
atac_data = ad.read_h5ad('data/official/common/openproblems_bmmc_multiome_phase1/openproblems_bmmc_multiome_phase1.manual_formatting.output_mod2.h5ad')
usefull_regions = atac_data.var.index.map(lambda region: region in region_to_gene.keys()).tolist()
len(usefull_regions)

116490

In [25]:
atac_data = atac_data[:, usefull_regions]

In [26]:
atac_data.var['gene_id'] = atac_data.var.index.map(lambda region: region_to_gene[region]).tolist()

Trying to set attribute `.var` of view, copying.


In [33]:
df = atac_data.var['gene_id'].to_frame()

In [34]:
df.to_csv('region_to_gene.csv')

In [36]:
df.index

Index(['chr1-778276-779191', 'chr1-822804-823597', 'chr1-827067-827948',
       'chr1-838003-838737', 'chr1-841079-841916', 'chr1-842497-843414',
       'chr1-844132-844995', 'chr1-857951-858596', 'chr1-865397-866322',
       'chr1-958865-959755',
       ...
       'chrX-155086170-155087034', 'chrX-155094214-155095119',
       'chrX-155148731-155149597', 'chrX-155215991-155216852',
       'chrX-155218526-155219443', 'chrX-155334398-155335238',
       'chrX-155610959-155611700', 'chrX-155612357-155613243',
       'chrX-155632202-155633106', 'chrX-155880870-155881771'],
      dtype='object', length=69534)