In [1]:
from liftover import get_lifter
import scanpy as sc
import pandas as pd
import numpy as np
import anndata
from pybiomart import Server

In [55]:
from intervaltree import Interval, IntervalTree

### Data Upload

In [37]:
Kong_adata = sc.read('/Users/anna.maguza/Desktop/GCA_social_network/data/raw_anndata/Kong_2023/Kong_2023_raw_anndata.h5ad')

### Obtain genomic positions of genes in hg19

In [38]:
# Set up the server
server = Server(host='http://www.ensembl.org')

In [39]:
# Choose the database
dataset = (server.marts['ENSEMBL_MART_ENSEMBL']
                  .datasets['hsapiens_gene_ensembl'])

In [40]:
# Get the genes in your anndata object
genes = Kong_adata.var_names.tolist()

In [41]:
def chunker(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

# Split the gene IDs into chunks of 500 (you can adjust this value as needed)
chunks = list(chunker(genes, 250))

# Create an empty dataframe to store the results
gene_positions = pd.DataFrame()

# Loop through each chunk
for chunk in chunks:
    # Query the database for the positions of the genes in the chunk
    result = dataset.query(attributes=['ensembl_gene_id', 'chromosome_name', 'start_position', 'end_position'],
                           filters={'link_ensembl_gene_id': chunk})
    
    # Append the result to the main dataframe
    gene_positions = pd.concat([gene_positions, result])

# Rename the columns to match the ones in the previous example
gene_positions.columns = ['gene_id', 'chrom', 'start', 'end']


In [42]:
gene_positions.head()

Unnamed: 0,gene_id,chrom,start,end
0,ENSG00000007923,1,6634168,6701924
1,ENSG00000008128,1,1702379,1724357
2,ENSG00000008130,1,1751232,1780457
3,ENSG00000009724,1,11022009,11047239
4,ENSG00000011021,1,11806096,11848079


### Convert genes positions to hg38 using liftover

In [43]:
# Create a liftover object for hg19 to hg38 conversion
converter = get_lifter('hg19', 'hg38')

In [44]:
# A function that converts one coordinate and returns the converted position
def convert_position(row):
    chrom = str(row['chrom'])
    start = int(row['start'])
    end = int(row['end'])
    # Liftover for the start and end coordinates
    new_start = converter.convert_coordinate(chrom, start)
    new_end = converter.convert_coordinate(chrom, end)
    # If liftover is successful (it can fail for some positions), return the new positions
    if new_start and new_end:
        return pd.Series({'new_start': new_start[0][1], 'new_end': new_end[0][1]})
    else:
        return pd.Series({'new_start': None, 'new_end': None})

In [45]:
# Ensure 'chrom' column is of type string
gene_positions['chrom'] = gene_positions['chrom'].astype(str)

# Drop rows where 'chrom' is NaN
gene_positions = gene_positions.dropna(subset=['chrom'])

# Filter out rows where chrom is 'MT' or contains 'HSCHR'
gene_positions = gene_positions[~gene_positions['chrom'].str.contains('MT|HSCHR')]

# Apply the conversion function to all rows (i.e., all genes)
gene_positions[['new_start', 'new_end']] = gene_positions.apply(convert_position, axis=1)

In [46]:
# Apply the conversion function to all rows (i.e., all genes)
gene_positions[['new_start', 'new_end']] = gene_positions.apply(convert_position, axis=1)

In [47]:
gene_positions.head()

Unnamed: 0,gene_id,chrom,start,end,new_start,new_end
0,ENSG00000007923,1,6634168,6701924,6574108.0,6641864.0
1,ENSG00000008128,1,1702379,1724357,1770940.0,1792918.0
2,ENSG00000008130,1,1751232,1780457,1819793.0,1849018.0
3,ENSG00000009724,1,11022009,11047239,10961952.0,10987182.0
4,ENSG00000011021,1,11806096,11848079,11746039.0,11788022.0


### Extract new genes

In [57]:
# First, create an interval tree from your original gene positions
gene_tree = IntervalTree()

for i, row in gene_positions.iterrows():
    interval = Interval(row['start'], row['end'], row['gene_id'])
    gene_tree[interval.begin:interval.end] = interval

In [None]:
# Now, you can query the tree with your new positions to find overlapping genes
overlapping_genes = []

for i, row in gene_positions.iterrows():
    intervals = gene_tree[row['new_start']:row['new_end']]
    for interval in intervals:
        print(interval.data)
        overlapping_genes.append(interval.data)

overlapping_genes = pd.DataFrame(overlapping_genes, columns=['ensembl_gene_id'])

In [61]:
# Create a DataFrame from overlapping_genes
df_overlapping_genes = pd.DataFrame(overlapping_genes, columns=['start', 'end', 'ensembl_gene_id'])

# View the DataFrame
df_overlapping_genes

Unnamed: 0,start,end,ensembl_gene_id
0,6575189,6623362,ENSG00000013288
1,6603642,6604420,ENSG00000254400
2,6579994,6589280,ENSG00000204859
3,6640985,6644541,ENSG00000129235
4,6599239,6633291,ENSG00000037474
...,...,...,...
585865,75375511,75377294,ENSG00000225203
585866,75318076,75405709,ENSG00000177885
585867,75355207,75368612,ENSG00000140400
585868,75293698,75433503,ENSG00000153774


In [54]:
# Initialize an empty DataFrame to store the results
new_gene_names = pd.DataFrame()

# Loop over the rows of gene_positions
for i, row in gene_positions.iterrows():
    # Query the dataset for genes that overlap with new_start and new_end
    results = dataset.query(attributes=['ensembl_gene_id', 'ensembl_gene_id_version', 
                                        'ensembl_transcript_id', 'hgnc_symbol', 'external_gene_name',],
                            filters={'chromosome_name': row['chrom'],
                                     'start': {'lte': row['new_end']},
                                     'end': {'gte': row['new_start']}})
    
    # Append the results to new_gene_names
    new_gene_names = pd.concat([new_gene_names, results])

# Remove duplicate rows, if any
new_gene_names = new_gene_names.drop_duplicates()

BiomartException: Query ERROR: caught BioMart::Exception::Usage: Wrong format value for Start

In [48]:
# Pseudocode for step 1
new_gene_names = query_ensembl_for_gene_names(gene_positions['new_start'], gene_positions['new_end'])

# Pseudocode for step 2
old_to_new_gene_names = dict(zip(gene_positions['gene_name'], new_gene_names))

# Pseudocode for step 3
for old_gene, new_gene in old_to_new_gene_names.items():
    # This assumes 'gene_name' is an attribute of your anndata object.
    # You'll need to adjust this to match the actual structure of your object.
    anndata_object.gene_name[anndata_object.gene_name == old_gene] = new_gene


NameError: name 'query_ensembl_for_gene_names' is not defined