## Loading up packages

In [2]:
# Let the program know where to find my tools file.
# The appended directory is what makes `import keegan_analysis_tools` (done in the
# imports cell) resolvable.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run on another machine until it is adjusted.
import sys
sys.path.append('/home/keeganfl/Desktop/Work_Fall_2021/Fall_2021_Work_journal/tools/')

In [3]:
# Load up the necessary packages 
from plastid import BAMGenomeArray, GenomeArray, VariableFivePrimeMapFactory, \
                        GTF2_TranscriptAssembler, GFF3_TranscriptAssembler, \
                        Transcript, ThreePrimeMapFactory, CenterMapFactory
import plastid
from plastid.plotting.plots import *
import numpy as np
import os
from Bio import SeqIO
import numpy
import math
import pandas as pd
from scipy import stats
import keegan_analysis_tools as kat
from statsmodels.nonparametric.smoothers_lowess import lowess
import warnings
import matplotlib.pyplot as plt
%matplotlib inline
import csv
from scipy.sparse.linalg import lsqr
from tqdm import tqdm

In [4]:
# Define important paths.
# NOTE(review): every path below is absolute and machine-specific, and each one ends
# with "/" because file names are appended to it by plain string concatenation.

# Directory holding the GTF annotation and the CDS FASTA file.
data_path = "/home/keeganfl/Desktop/Work_Fall_2021/genomes_&_samples/dmel/"
# Directory holding the "<mutation>_<samp_num>_counts.csv" position-count tables.
position_count_path = "/home/keeganfl/Desktop/Work_Fall_2021/data_tables/position_counts_codon_resolution/dmel/"
# Directory holding the "<mutation>_<samp_num>_te_tablefc.csv" TE tables.
te_table_path = '/home/keeganfl/Desktop/Work_Fall_2021/data_tables/TE_tables/dmel/'
# Output directory for the elongation rates.
save_path = "/home/keeganfl/Desktop/Work_Fall_2021/data_tables/elongation_rates/dmel/"
# Output directory for the initiation rates.
save_path2 = "/home/keeganfl/Desktop/Work_Fall_2021/data_tables/initiation_rates/dmel/"
# Output directory for the termination rates.
save_path3 = "/home/keeganfl/Desktop/Work_Fall_2021/data_tables/termination_rates/dmel/"
# File name of the GTF annotation inside data_path.
gtf_name = "Drosophila_melanogaster.BDGP6.32.103.gtf"
# Sample selectors: together they form the "<mutation>_<samp_num>" prefix of every
# input and output file name used in this notebook.
mutation = 'Fmr1'
samp_num = '2'

## Loading up the data files 

In [5]:
# Load the transcript annotations from the GTF file.
# GTF2_TranscriptAssembler returns an iterator, so it is converted to a list while
# the file is still open; the `with` block then closes the handle, which the
# previous bare open() call never did.
with open(data_path + gtf_name) as gtf_file:
    transcripts = list(GTF2_TranscriptAssembler(gtf_file, return_type=Transcript))

In [6]:
# Keep only the transcripts whose 'gene_biotype' attribute is 'protein_coding'.
# The original order of the transcripts is preserved.
protein_coding = [
    tx for tx in transcripts
    if tx.attr['gene_biotype'] == 'protein_coding'
]

In [7]:
# Free some memory by emptying the original transcript list in place.
# The Transcript objects that were copied into protein_coding stay alive because
# that list still references them.
del transcripts[:]

## Loading up the position counts file.

In [8]:
# Load the position counts for the selected sample; the file name is
# "<mutation>_<samp_num>_counts.csv" and it is looked up in position_count_path.
data = kat.load_count_positions(
    "{}_{}_counts.csv".format(mutation, samp_num),
    position_count_path,
)

## Loading up the TE table

In [9]:
# Read the TE table for the selected sample from
# "<te_table_path><mutation>_<samp_num>_te_tablefc.csv".
te_table = pd.read_csv(
    "{}{}_{}_te_tablefc.csv".format(te_table_path, mutation, samp_num)
)

## Getting the codon sequence

In [10]:
# Use SeqIO to load the sequence information for all the CDS regions into a
# dictionary keyed by record id.
# SeqIO.to_dict reads every record into memory, so the file can be closed as soon
# as the dictionary is built; the `with` block does that, whereas the previous bare
# open() call left the handle open.
with open(data_path + "Drosophila_melanogaster.BDGP6.32.cds.all.fa") as cds_fasta:
    genome = SeqIO.to_dict(SeqIO.parse(cds_fasta, "fasta"))

In [11]:
# Collect the CDS sequence (as a plain string) of every protein coding transcript,
# in the same order as protein_coding. Records are looked up in `genome` by the
# transcript's "transcript_id" attribute.
cds_sequence = [
    str(genome[tx.attr["transcript_id"]].seq)
    for tx in protein_coding
]

In [12]:
# Create a dictionary that matches every codon to a number. This will be used later for deconvolution.
# Bases are ranked T, C, G, A at each of the three codon positions and the codon
# index is 16*first + 4*second + third, so 'TTT' -> 0, 'TTC' -> 1, ... 'AAA' -> 63.
cd_dict = {
    first + second + third: 16*hi + 4*mid + lo
    for hi, first in enumerate("TCGA")
    for mid, second in enumerate("TCGA")
    for lo, third in enumerate("TCGA")
}

In [13]:
# Create a list that holds, for each gene, the numeric code (from cd_dict) of each
# codon in its CDS sequence.
codon_seq_list = []
for seq in cds_sequence:
    codon_seq = []
    for start in range(0, len(seq), 3):
        codon = seq[start:start + 3]
        try:
            codon_seq.append(cd_dict[codon])
        except KeyError:
            # Triplets that are not keys of cd_dict (for example a trailing slice
            # shorter than three bases) are skipped, as before. Only KeyError is
            # caught now, so unrelated errors are no longer silently swallowed.
            # NOTE(review): a skipped triplet makes codon_seq shorter than the
            # number of codon positions in seq - confirm downstream code copes.
            continue
    codon_seq_list.append(codon_seq)

## Calculating Elongation rates

In [14]:
# Create a function to obtain a normalized profile (p) of ribosome footprints using the chi adjustment from Sharma et al.
def calculate_p_chi(data_counts, data_te, chi = 0.00289):
    """Return one normalized footprint profile per gene.

    For every gene the counts are shifted by a pseudo-count of 1, divided by
    their total, and then scaled by ``TE_rpkm * chi * (n_positions - 1)``.

    Parameters
    ----------
    data_counts : iterable of array-like
        Per-gene position counts. Each element must support ``element + 1`` and
        element-wise division (e.g. a numpy array).
    data_te : object with a ``TE_rpkm`` attribute
        Table whose ``TE_rpkm`` values are paired one-to-one, in order, with
        ``data_counts`` (e.g. a pandas DataFrame with a ``TE_rpkm`` column).
    chi : float, optional
        Scaling factor multiplied with each TE value. Defaults to 0.00289.

    Returns
    -------
    list
        One normalized profile per gene, in input order. Pairing uses ``zip``,
        so the result is as long as the shorter of the two inputs.
    """
    p_list = []
    # Read the TE values from the data_te argument; the previous version ignored
    # the argument and always used the notebook-level te_table variable.
    for counts, te in zip(data_counts, data_te.TE_rpkm):
        pi = te*chi
        # Pseudo-count of 1 at every position (builds a new object, the caller's
        # counts are left untouched).
        shifted = counts + 1
        total = sum(shifted)
        p = (shifted/total)*float(pi)*(len(shifted)-1)
        p_list.append(p)
    return p_list

In [15]:
# Compute the normalized profile densities for all genes from the position counts
# and the TE table, passing the chi scaling factor explicitly by keyword.
p_list = calculate_p_chi(data, te_table, chi=0.00289)

In [16]:
# determine the 10 codon sliding averages of the particle densities for all genes.
# NOTE(review): the window size is taken from the author's description;
# kat.calculate_pbar is defined in keegan_analysis_tools and is not visible here.
pbar_list = kat.calculate_pbar(p_list)

In [17]:
# determine the scaled elongation rates for all genes.
# NOTE(review): the warning captured below this cell ("invalid value encountered in
# double_scalars") comes from the expression (1-9*pbarx)/(pbarx*(1-pbarx)) in
# keegan_analysis_tools.py; the same excerpt shows a branch that stores 9999
# instead. Check lbar_list for NaN/inf and 9999 values before trusting the output.
lbar_list = kat.calculate_lbar(pbar_list)

---------------------------------------------------------------------------
invalid value encountered in double_scalars
in /home/keeganfl/Desktop/Work_Fall_2021/Fall_2021_Work_journal/tools/keegan_analysis_tools.py, line 411:

409                 lbar_x=9999
410             else:
411                 lbar_x = (1-9*pbarx)/(pbarx*(1-pbarx))
412             lbar.append(lbar_x)
413         lbar_list.append(np.array(lbar))

---------------------------------------------------------------------------


In [18]:
# NOTE(review): this cell looks like leftover debugging. The comparison inside the
# loop is evaluated and its result discarded, so nothing is tested or stored; the
# only lasting effect is that x ends up as len(pbar_list) - 1. Confirm whether a
# count of genes with pbar[0] == 0 was intended, otherwise the cell can be removed.
x = -1
for pbar in pbar_list:
    pbar[0] == 0  # result is not used
    x = x+1

In [19]:
# Calculate the scaled initiation and termination rates, one value per gene.

# Initiation: 1/(1 - 10*pbar[0]), using the first entry of each averaged profile.
# An entry that is exactly 0 is replaced by 0.00001 before applying the formula.
init_r = [
    1/(1-10*0.00001) if averaged[0] == 0 else 1/(1-10*averaged[0])
    for averaged in pbar_list
]

# Termination: 1/p[-1], using the last entry of each normalized profile.
# An entry that is exactly 0 is replaced by 0.00001 to avoid dividing by zero.
term_r = [
    1/0.00001 if profile[-1] == 0 else 1/(profile[-1])
    for profile in p_list
]


In [20]:
# Determine the scaling constant tau for all genes from the scaled elongation rates
# and the numeric codon sequences (note that, according to the author, this will
# utilize all of your computer's processors).
# The progress output captured for this cell reports roughly four minutes of run time.
tau_list = kat.calculate_tau(lbar_list, codon_seq_list)

30710it [04:11, 122.33it/s]


In [23]:
# Calculate the unscaled elongation rates: each gene's scaled rates divided by that
# gene's tau, stored as a plain list of lists so they can be more easily saved.
# The division is done gene by gene. Building np.array(lbar_list, dtype=object)
# yields a 2-D array whenever every profile happens to have the same length, and
# dividing that by the 1-D tau array either fails or broadcasts along the wrong axis.
if len(lbar_list) != len(tau_list):
    raise ValueError(
        "lbar_list and tau_list must have the same length, got "
        + str(len(lbar_list)) + " and " + str(len(tau_list))
    )
elongation_list = [
    (np.asarray(lbar) / tau).tolist()
    for lbar, tau in zip(lbar_list, tau_list)
]

In [127]:
# Calculate the unscaled initiation and termination rates for all genes by dividing
# the scaled rates element-wise by tau.
initiation_list, termination_list = (
    np.array(scaled, dtype=object)/np.array(tau_list, dtype=object)
    for scaled in (init_r, term_r)
)

# Save the unscaled initiation and termination rates as comma-delimited text.
# The output file names carry no extension.
np.savetxt(
    '{}{}_{}_adjusted_initiation_rates'.format(save_path2, mutation, samp_num),
    initiation_list,
    delimiter=",",
)
np.savetxt(
    '{}{}_{}_adjusted_termination_rates'.format(save_path3, mutation, samp_num),
    termination_list,
    delimiter=",",
)

## Save unscaled elongation rates as a CSV

In [24]:
# Create a list that contains all of the gene_ids and transcript_ids of the transcripts
gene_id = []
transcript_id = []

for transcript in protein_coding:
    gene_id.append(transcript.attr["gene_name"])
    transcript_id.append(transcript.attr["transcript_id"])

In [25]:
# Prepend the transcript id and the gene id (in that order) to every row of
# elongation_list.
# This mutates the rows in place, so running the cell twice prepends the ids twice.
for rates, gene, tx_id in zip(elongation_list, gene_id, transcript_id):
    rates[:0] = [tx_id, gene]

In [26]:
# Length used for the positional part of the header, as reported by
# kat.find_max_list for the rows of elongation_list.
# NOTE(review): the rows already start with the two id columns at this point, so
# this length presumably includes them - confirm against find_max_list.
l_tr = kat.find_max_list(elongation_list)

# Header row: the two id labels followed by the position indices 0 .. l_tr-1.
header = ["transcript_id", "gene_id"]
header.extend(range(l_tr))

# Put the header in front of the elongation rows.
elongation_list[:0] = [header]

In [131]:
# Write the header and all elongation rows as comma-separated values.
# newline='' lets the csv module control the line endings itself.
with open(save_path + mutation + '_' + samp_num + '_adjusted_elongation_rates', 'w', newline='') as out_file:
    csv.writer(out_file).writerows(elongation_list)