In this notebook we create a mapping between CDS-relative positions and genome-relative positions.
Our MFE z-scores are saved in CDS-relative positions, but we are often asked about specific genomic positions.

## Imports

In [1]:
import pandas as pd
import numpy as np
import os, sys, re
import pickle5 as pickle
import matplotlib.pyplot as plt
import glob
import pyranges as pr
from Bio import SeqIO
from tqdm import tqdm
import json


## Functions

In [2]:
def get_range(cds_part_start:int, cds_part_end:int):
    '''
    List every position covered by one part of the CDS.

    The interval is half-open: cds_part_start is included, cds_part_end is excluded.
    An empty list is returned when cds_part_end <= cds_part_start.
    '''
    positions = [*range(cds_part_start, cds_part_end)]
    return positions
    

## Main

In [9]:
# Use the GTF annotation file for genes in GRCh38 to get the CDS positions of each gene.
annotations_file = "../co_trans_data/Homo_sapiens.GRCh38.104.gtf"

# Reading the GTF through pyranges also *changes coordinates from 1-based to 0-based.
gr = pr.read_gtf(annotations_file)
ano_df = gr.df

# Keep only the CDS rows, and only the columns that are relevant for the mapping.
ano_df = ano_df.loc[
    ano_df['Feature'] == 'CDS',
    ['Start','End','Strand','protein_id','exon_number','gene_id','gene_name'],
].copy()


In [None]:
# We create a mapping between cds positions relative to the cds and relative to the chromosome.
# The keys of the outer dictionary are (gene_id, protein_id) pairs. Each value is a dictionary whose
# keys are the positions relative to the cds and whose values are the positions relative to the chromosome.
# For example, for a gene on the forward strand:
# if the cds covers chromosome positions 300 to 310 (including), i.e. 11 positions,
# then dict[0] = 300 and dict[10] = 310.
# For a gene on the reverse strand covering the same positions, dict[0] = 310 and dict[10] = 300.
cds_to_chrom = {}

# NOTE(review): `genes_dict` is not defined in any of the cells shown in this notebook; it has to be
# loaded before this cell for "Restart & Run All" to work. Only its keys are used here, and they are
# unpacked as (gene_id, protein_id) pairs.
for gene_id, protein_id in genes_dict.keys():

    try:
        # all the cds parts (one row per part) that belong to this gene / protein pair
        this_cds = ano_df[(ano_df['gene_id'] == gene_id) & (ano_df['protein_id'] == protein_id)]
        if this_cds.shape[0] != 0:
            # flatten the chromosome positions of all the parts into one list in a single pass
            # (summing a Series of lists re-copies the accumulated list for every part)
            cds_positions = [
                position
                for part_start, part_end in zip(this_cds['Start'], this_cds['End'])
                for position in get_range(part_start, part_end)
            ]

            # "Strand" is the same for all parts of the CDS, so we can look at the first row
            strand = this_cds.iloc[0]['Strand']

            # Sort explicitly on both strands so the result does not depend on the order of the rows
            # in the annotation: ascending for the forward strand, descending for the reverse strand.
            cds_positions.sort(reverse=(strand == '-'))

            # cds-relative index -> chromosome-relative position
            cds_to_chrom[gene_id, protein_id] = dict(enumerate(cds_positions))

    except Exception as e:
        # best effort: report the failing gene and continue with the rest
        print(f"gene {gene_id} failed with error: {e}")
        
        


## Save the dictionary

In [206]:
# Write the (gene_id, protein_id) -> {cds position: chromosome position} mapping to disk,
# using the highest pickle protocol that the imported pickle module supports.
data_path = "../co_trans_data/cds_to_chrom_dict_with_protein_id.pickle"
with open(data_path, mode='wb') as pickle_file:
    pickle.dump(cds_to_chrom, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
