## Loading up packages

In [1]:
# Let the program know where to find my tools file.
import sys
sys.path.append('/home/keeganfl/Desktop/Work_Fall_2021/Fall_2021_Work_journal/tools/')

In [2]:
# Load up the necessary packages 
from plastid import BAMGenomeArray, GenomeArray, VariableFivePrimeMapFactory, \
                        GTF2_TranscriptAssembler, GFF3_TranscriptAssembler, \
                        Transcript, ThreePrimeMapFactory, CenterMapFactory
import plastid
from plastid.plotting.plots import *
import numpy as np
import os
from Bio import SeqIO
import numpy
import math
import pandas as pd
from scipy import stats
import keegan_analysis_tools as kat
from statsmodels.nonparametric.smoothers_lowess import lowess
import warnings
import matplotlib.pyplot as plt
%matplotlib inline
import csv
from scipy.sparse.linalg import lsqr
from tqdm import tqdm

In [3]:
# Configuration: input/output locations and the sample to process.
# Every directory below ends with "/" because file names are appended to it by plain string concatenation.
# Directory holding the genome annotation (GTF) and the cDNA FASTA file.
data_path = "/home/keeganfl/Desktop/Work_Fall_2021/genomes_&_samples/ecol/"
# Directory holding the per-sample position count tables.
position_count_path = "/home/keeganfl/Desktop/Work_Fall_2021/data_tables/position_counts_codon_resolution/ecol/"
# NOTE(review): te_table_path is not referenced by any of the cells shown here — confirm it is still needed.
te_table_path = '/home/keeganfl/Desktop/Work_Fall_2021/data_tables/TE_tables/ecol/'
# Output directories for the elongation, initiation and termination rate tables.
save_path = "/home/keeganfl/Desktop/Work_Fall_2021/data_tables/elongation_rates/ecol/"
save_path2 = "/home/keeganfl/Desktop/Work_Fall_2021/data_tables/initiation_rates/ecol/"
save_path3 = "/home/keeganfl/Desktop/Work_Fall_2021/data_tables/termination_rates/ecol/"
gtf_name = "Escherichia_coli_str_k_12_substr_mg1655_gca_000005845.ASM584v2.51.gtf"
# mutation and samp_num are combined into the input and output file names (e.g. "efp_2_counts.csv").
mutation = 'efp'
samp_num = '2'

## Loading up the data files 

In [4]:
# Load the transcript annotations from the GTF file.
# GTF2_TranscriptAssembler returns an iterator, so it is converted to a list while
# the file is still open; the `with` block then closes the file handle, which a
# bare open() call would leave open for the rest of the session.
with open(data_path + gtf_name) as gtf_file:
    transcripts = list(GTF2_TranscriptAssembler(gtf_file, return_type=Transcript))

In [5]:
# Keep only the transcripts whose gene_biotype attribute is 'protein_coding'.
protein_coding = [
    tx for tx in transcripts
    if tx.attr['gene_biotype'] == 'protein_coding'
]

In [6]:
# Free some memory by emptying the original transcript list in place.
# The transcripts collected in protein_coding are still referenced by that list and remain usable.
transcripts.clear()

## Define helper functions for loading the data

In [7]:
def load_count_positions(csv_name, csv_path):
    """Load a per-gene position-count table from a CSV file.

    The first row of the file is treated as a header and discarded. In every
    remaining row, cells that parse as numbers are converted to ``int`` (via
    ``int(float(cell))``), empty cells are dropped, the second remaining cell
    is taken as the gene name and everything after it as that gene's counts.

    Parameters
    ----------
    csv_name : str
        File name of the CSV table.
    csv_path : str
        Directory prefix. It is concatenated directly with ``csv_name``, so it
        must end with a path separator.

    Returns
    -------
    data : list of numpy.ndarray
        One array per row, holding the cells from the third position onwards.
    gene : list
        The second cell of each row (the gene name), in file order.
        Both lists are empty when the file has no rows at all.

    Raises
    ------
    IndexError
        If a row has fewer than two non-empty cells.
    """
    def _to_int_if_numeric(cell):
        # Only a failed numeric parse is tolerated ('inf' overflows int());
        # any other error is a real problem and is allowed to propagate.
        try:
            return int(float(cell))
        except (ValueError, OverflowError):
            return cell

    data = []
    gene = []
    with open(csv_path + csv_name, newline='') as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header row; the default keeps an empty file from raising.
        next(reader, None)
        for row in reader:
            # Empty cells are padding; an explicit comparison is used because
            # ''.__ne__(int) returns NotImplemented rather than a boolean.
            cells = [_to_int_if_numeric(cell) for cell in row if cell != '']
            gene.append(cells[1])
            data.append(np.array(cells[2:]))

    return data, gene

## Loading up the position counts file.

In [8]:
# Read the position counts of the selected sample: one count array per gene
# plus the matching list of gene names.
counts_file = mutation + "_" + samp_num + "_counts.csv"
data, genes = load_count_positions(counts_file, position_count_path)

## Load up the TE values

In [9]:
# Load the table of translation efficiency (TE) values.
# NOTE(review): this location is hard-coded instead of being built from te_table_path — confirm which file is intended.
te_table = pd.read_csv('/home/keeganfl/Desktop/Work_Fall_2021/data_tables/ecol_TE_thingy.csv')

## Align the position counts with the TE table

In [10]:
# One-column frame of the gene names returned by the loader. Its default integer
# index follows the order of `genes`, which is also the order of `data`.
gene_names = pd.DataFrame(genes, columns = ['Gene'])

In [11]:
# Left-join the TE table onto the gene names so that genes without a TE entry are kept (with NaN).
# NOTE(review): the resulting row index only stays usable as a position into `data`
# if 'Gene' is unique in te_table — TODO confirm.
inter = pd.merge(gene_names, te_table, how = 'left', on = 'Gene')

In [12]:
# Keep only the genes that received a TE value from the merge. Boolean selection
# preserves the original row labels, which are used afterwards to pick the matching
# entries of the per-gene lists.
fixed_te_table = inter[inter.TE.notna()]

In [13]:
# Select the count arrays of the genes that have a TE value.
fixed_data = [data[row] for row in fixed_te_table.index]

In [14]:
# Row labels of the genes that have a TE value; reused below to select the
# matching codon sequences and saved alongside the rate tables.
master_index = fixed_te_table.index

## Getting the codon sequence

In [15]:
# Use SeqIO to load the sequence records of the cDNA FASTA file into a dictionary
# keyed by record id. to_dict() consumes the whole parser inside the `with` block,
# so the file handle is closed as soon as the records are in memory.
with open(data_path + "Escherichia_coli_str_k_12_substr_mg1655_gca_000005845.ASM584v2.cdna.all.fa") as fasta_file:
    genome = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta"))

In [16]:
# Collect, for every protein-coding transcript, the sequence stored in `genome`
# under its transcript_id (as a plain string).
cds_sequence = [
    str(genome[tx.attr["transcript_id"]].seq)
    for tx in protein_coding
]

In [17]:
# Create a dictionary that matches every codon to a number from 0 to 63.
# This will be used later for deconvolution.
# Bases are ranked T, C, G, A at each position and the third base varies fastest,
# so a codon's number is 16*first + 4*second + third.
codon_base_order = "TCGA"
cd_dict = {
    first + second + third: 16*hi + 4*mid + lo
    for hi, first in enumerate(codon_base_order)
    for mid, second in enumerate(codon_base_order)
    for lo, third in enumerate(codon_base_order)
}

In [18]:
# Create a list that holds, for each gene, the numbers corresponding to its codons.
# Triplets that are not keys of cd_dict (for example an incomplete trailing triplet)
# are left out. This is checked explicitly rather than with a bare `except`, so that
# unrelated errors are no longer silently swallowed.
codon_seq_list = []
for seq in cds_sequence:
    codon_seq = []
    for i in range(0, len(seq), 3):
        codon = seq[i:i+3]
        if codon in cd_dict:
            codon_seq.append(cd_dict[codon])
    codon_seq_list.append(codon_seq)

In [19]:
# Select the codon sequences of the genes that have a TE value.
# NOTE(review): this assumes codon_seq_list is in the same gene order as the
# position counts file — TODO confirm.
fixed_codon_list = [codon_seq_list[row] for row in master_index]

## Calculating Elongation rates

In [20]:
# Create a function to obtain a normalized profile (p) of ribosome footprints using the chi adjustment from sharma et al.
def calculate_p_chi(data_counts, data_te, chi = 0.00289):
    """Normalise per-gene footprint counts into profiles scaled by TE and chi.

    For each gene, 1 is added to every position count, the counts are divided
    by their total, and the result is multiplied by ``TE * chi`` and by the
    number of positions minus one.

    Parameters
    ----------
    data_counts : sequence of array-like
        Position counts, one entry per gene.
    data_te : object with a ``TE`` attribute
        Table (e.g. a DataFrame with a ``TE`` column) whose rows are in the
        same order as ``data_counts``. The values are taken from this
        argument, not from any module-level table.
    chi : float, optional
        Scaling constant applied to every TE value.

    Returns
    -------
    list of numpy.ndarray
        One normalised profile per gene, in input order.
    """
    p_list = []
    for counts, te in zip(data_counts, data_te.TE):
        pi = te*chi
        # Add 1 to every position so that no position has a zero count.
        padded = np.asarray(counts) + 1
        total = padded.sum()
        p = (padded/total)*float(pi)*(len(padded)-1)
        p_list.append(p)
    return p_list

In [21]:
# Compute the normalized profile of every gene that has a TE value,
# overriding the function's default chi.
p_list = calculate_p_chi(
    data_counts=fixed_data,
    data_te=fixed_te_table,
    chi=0.00299,
)

In [22]:
# Smooth the normalized profile of every gene with kat.calculate_pbar.
# (Described by the author as a 10 codon sliding average of the particle densities;
# the helper lives in keegan_analysis_tools and is not shown here.)
pbar_list = kat.calculate_pbar(p_list)

In [23]:
# Determine the scaled elongation rates for all genes from the smoothed densities.
# (kat.calculate_lbar is defined in keegan_analysis_tools and is not shown here.)
lbar_list = kat.calculate_lbar(pbar_list)

In [24]:
# NOTE(review): this loop has no lasting effect on pbar_list. `pbar[0] == 0` is a
# comparison whose result is discarded (not an assignment), and x merely ends up as
# len(pbar_list) - 1. Looks like leftover debugging — confirm and remove.
x = -1
for pbar in pbar_list:
    pbar[0] == 0
    x = x+1

In [25]:
# Calculate the scaled initiation rates from the first smoothed density of each
# gene. A first density of exactly 0 is replaced by 0.00001.
init_r = [
    1/(1-10*0.00001) if pbar[0] == 0 else 1/(1-10*pbar[0])
    for pbar in pbar_list
]

# Calculate the scaled termination rates from the last density of each gene.
# A last density of exactly 0 is replaced by 0.00001 to avoid dividing by zero.
term_r = [
    1/0.00001 if p[-1] == 0 else 1/(p[-1])
    for p in p_list
]


In [26]:
# Determine the scaling constant tau for all genes from the scaled elongation rates
# and the codon sequences of the same genes.
# (Author's note: this will utilize all of your computer's processors.)
tau_list = kat.calculate_tau(lbar_list, fixed_codon_list)

1779it [00:13, 135.04it/s]


In [27]:
# Calculate the unscaled elongation rates: the scaled rates of each gene divided
# by that gene's tau. Both lists are wrapped in object arrays for the division.
lbar_arr = np.array(lbar_list, dtype=object)
tau_arr = np.array(tau_list, dtype=object)

# Convert the result back to plain nested lists so it can be more easily saved.
elongation_list = [elon.tolist() for elon in (lbar_arr/tau_arr).tolist()]

In [28]:
# Calculate the unscaled initiation and termination rates for all genes by
# dividing the scaled rates by each gene's tau.
tau_arr = np.array(tau_list, dtype =object)
initiation_list = np.array(init_r, dtype=object)/tau_arr
termination_list = np.array(term_r, dtype=object)/tau_arr

# Save the unscaled initiation and termination rates as comma-delimited text.
sample_tag = mutation + '_' + samp_num
np.savetxt(save_path2 + sample_tag + '_adjusted_initiation_rates', initiation_list, delimiter = ",")
np.savetxt(save_path3 + sample_tag + '_adjusted_termination_rates', termination_list, delimiter = ",")
# NOTE(review): unlike the two names above there is no '_' before 'master_index', and the
# file is written to the termination folder — confirm that readers expect this name.
np.savetxt(save_path3 + sample_tag + 'master_index', master_index, delimiter = ",")

## Save unscaled elongation rates as a csv

In [29]:
# Gene names of the genes that have a TE value. reset_index() renumbers the rows
# from 0 so that the names can be looked up by position (only gene names are
# taken here; no transcript ids are involved).
gene_id = fixed_te_table.reset_index().Gene


In [30]:
# Put each gene's name in front of its list of elongation rates.
# The lists are modified in place, so running this cell a second time would
# insert the names twice.
for rates, name in zip(elongation_list, gene_id):
    rates.insert(0, name)

In [31]:
# Ask kat.find_max_list for the size of the longest row; the author uses it as
# the number of position columns in the header.
l_tr = kat.find_max_list(elongation_list)

# Build a header with a label for the gene ID followed by numbers that index the
# positions, and place it in front of the rows.
header = ["gene_id", *range(l_tr)]
elongation_list.insert(0, header)

In [32]:
# Write the header and the per-gene elongation rates out as a csv.
elongation_file = save_path + mutation + '_' + samp_num + '_adjusted_elongation_rates'
with open(elongation_file, 'w', newline='') as out_file:
    csv.writer(out_file).writerows(elongation_list)

In [35]:
# Spot check: display the TE table row of rpoB.
fixed_te_table[fixed_te_table.Gene == "rpoB"]

Unnamed: 0.1,Gene,Unnamed: 0,MOPS_complete,mRNAs_cell,proteins_mRNA,proteins_min_mRNA,length(aa),ribosomes_mRNA,RPKM,TE,length(bp)
3838,rpoB,38.0,16156.0,37.512542,430.682622,20.03175,1342.0,29.869565,1502.0,0.83,4026.0


In [36]:
# Spot check: display the TE table row of cysQ.
fixed_te_table[fixed_te_table.Gene == "cysQ"]

Unnamed: 0.1,Gene,Unnamed: 0,MOPS_complete,mRNAs_cell,proteins_mRNA,proteins_min_mRNA,length(aa),ribosomes_mRNA,RPKM,TE,length(bp)
4061,cysQ,1054.0,902.0,1.148853,785.131031,36.517722,246.0,9.981511,46.0,1.5,738.0
