## Loading up packages

In [1]:
# import plastid
# data structure for mapping read alignments to genomic positions
from plastid import BAMGenomeArray, VariableFivePrimeMapFactory, \
                        GTF2_TranscriptAssembler, Transcript, ThreePrimeMapFactory
import numpy as np
import numpy
import pandas as pd
import warnings
import csv
from scipy.sparse.linalg import lsqr

In [2]:
save_path = "/Users/keeganflanagan/Desktop/Khanh_position/Eggtart/"

# Loading up the csv file.

In [3]:
# Read in data row by row.
data = []
with open("toy_data.csv", newline = '') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        data.append(row)

In [4]:
# Pop out the header row.
blank=data.pop(0)

In [5]:
# convert everything to an integer if possible. 
for i,ii in zip(data, range(len(data))):
    for j,jj in zip(i, range(len(i))):
        try:
            x = int(j)
            data[ii][jj] = x
        except:
            pass

In [6]:
# Remove empty space
for i,ii in zip(data, range(len(data))):
    x = list(filter(('').__ne__, i))
    data[ii] = x

In [7]:
# Convert lists to np.arrays
for i,ii in zip(data, range(len(data))):
    data[ii] = np.array(data[ii][2:])

## Perform the calculations

In [8]:
# Create a function to obtain a normalized profile (p) of ribosome footprints.
def calculate_p(data):
    p_list=[]
    for i in data:
        M = sum(i)
        p = i/M
        p_list.append(p)
    return(p_list)

# Note that I am assuming here that by number of transcripts they meant number of reads.

In [9]:
p_list = calculate_p(data)

In [10]:
# Calculate the smoothed density vector pbar for xth entry with length n-9
def calculate_pbar(p_list):
    pbar_list=[]
    for p in p_list:
        x=0
        pbar=[]
        for px in p:
            pbar_x = 0.1*sum(p[x:x+10]) #it is x+10 not x+9 because python does not include the final index.
            pbar.append(pbar_x)
            x = x+1
            if x  == len(p)-9:
                break
        pbar_list.append(np.array(pbar))
    return(pbar_list)



In [11]:
pbar_list=calculate_pbar(p_list)

In [15]:
# calculate the smoothed, scaled elongation rate lambda bar 
def calculate_lbar(pbar_list):
    lbar_list=[]
    for pbar in pbar_list:
        lbar = []
        for pbarx in pbar:
            if pbarx == 0:
                lbar_x=9999
            else:
                lbar_x = (1-9*pbarx)/(pbarx*(1-pbarx))
            lbar.append(lbar_x)
        lbar_list.append(np.array(lbar))
    return(lbar_list)
            

In [16]:
lbar_list=calculate_lbar(pbar_list)

In [17]:
# calculate the scaled initiation and termination rates

init_r= []
for pbar in pbar_list:
    if pbar[0] == 0:
        init_r.append(1/(1-10*0.00001))
    else:
        init_r.append(1/(1-10*pbar[0]))

term_r = []
for p in p_list:
    if p[-1] ==0:
        term_r.append(1/0.00001)
    else:
        term_r.append(1/(p[-1]))


In [18]:
# deconvolve the smooth scaled elongation rates to calculate 
# the scaled codon specific elongation rates
def calculate_tau (lbar_list,p_list,L=1):
    tau_list = []
    for Lam,p in zip(lbar_list,p_list):
        b = 10*Lam
        A = np.zeros((len(Lam),len(p)))
        x=0
        for row in A:
            row[x:x+10].fill(1)
            x = x+1
        if L != 1:
            b = b[L-1:]
            A = A[L-1:] # Must double check that this is proper for A. 
        
        test=lsqr(A,b)
        Ci = test[0]
        tau = Ci.mean()
        tau_list.append(tau)
    return(tau_list)

In [19]:
tau_list=calculate_tau(lbar_list,p_list,L=1)

In [20]:
prod_r = 1/tau

NameError: name 'tau' is not defined

In [64]:
# Computing the unscaled rates
elongation = lbar_list[0]/tau_list[0]
termination = term_r[0]
initiation = init_r[0]/tau_list[0]

In [65]:
initiation

0.016669853627482297

In [260]:
with open(save_path + 'keeg_gene_test.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(np.atleast_2d(elongation).T)

How it egg tart calculating the initiation and termination rates? at the moment it produces numbers totally different from what I calculated. Am I supposed to enter in what I calculated? Am I supposed to attach these to the start and end of the elongation file?

# Brought together into a single function

In [111]:
def determine_rates(data, L = 1):
    elongation_list = []
    termination_list = []
    initiation_list = []

    for i in data:
        # obtain a normalized profile (p) of ribosome footprints.
        M = sum(i)
        p = i/M 
    
        # Calculate the smoothed density vector pbar for xth entry with length n-9
        x=0
        pbar=[]
        for px in p:
            pbar_x = 0.1*sum(p[x:x+10]) #it is x+10 not x+9 because python does not include the final index.
            pbar.append(pbar_x)
            x = x+1
            if x  == len(p)-9:
                break
        pbar = np.array(pbar)
        
        # calculate the smoothed, scaled elongation rate lambda bar 
        lbar = []
        for pbarx in pbar:
            if pbarx == 0:
                lbar_x=9999
            else:
                lbar_x = (1-9*pbarx)/(pbarx*(1-pbarx))

            lbar.append(lbar_x)
        lbar = np.array(lbar)
        
        # Calculate the scaled elongation and initiation rates
        if pbar[0] == 0:
            sc_init = 1/(1-10*0.00001)
        else:
            sc_init = 1/(1-10*pbar[0])
        if p[-1] ==0:
            sc_term = 1/0.00001
        else:
            sc_term = 1/(p[-1])
            
    # deconvolve the smooth scaled elongation rates to calculate 
    # the scaled codon specific elongation rates
        b = 10*lbar
        A = np.zeros((len(lbar),len(p)))
        x=0
        for row in A:
            row[x:x+10].fill(1)
            x = x+1
        if L != 1:
            b = b[L-1:]
            A = A[L-1:] # Must double check that this is proper for A. 
        
        test=lsqr(A,b)
        Ci = test[0]
        tau = Ci.mean()
        
        elongation = lbar/tau
        termination = sc_term # This makes no sense, it always ends up being huge... maybe it is just len(p)/M?
        initiation = sc_init/tau
        
        elongation_list.append(elongation)
        termination_list.append(termination)
        initiation_list.append(initiation)
    return(elongation_list,termination_list,initiation_list)
    

In [112]:
el,tl,il = determine_rates(data)

In [115]:
il

[0.016669853627482297, 0.01865871865832989, 0.029289236953547824]