## Step 1:
Let the program know where to find your tools file.

In [3]:
import sys, os
# Put the "Tools" directory that sits next to the current working directory's
# parent on the import path, so modules stored there can be imported below.
_tools_dir = os.path.join(os.path.realpath(".."), "Tools")
sys.path.append(_tools_dir)

## Step 2:
Load up all of the necessary packages

In [4]:
# Loading up required packages
from plastid import BAMGenomeArray, VariableFivePrimeMapFactory, \
                        GTF2_TranscriptAssembler, GFF3_TranscriptAssembler, \
                        Transcript, ThreePrimeMapFactory, CenterMapFactory
from plastid.plotting.plots import *
import numpy as np
import os
from Bio import SeqIO
import numpy
import math
import pandas as pd
from scipy import stats
import utilities as utils
from statsmodels.nonparametric.smoothers_lowess import lowess
import warnings
import matplotlib.pyplot as plt
%matplotlib inline
import csv
from scipy.sparse.linalg import lsqr

## Step 3:
Define the paths to our reference genome and annotation files, as well as the path to our data: the Autism Spectrum Disorder (ASD) genes identified by SFARI, the top 200 genes whose regulation is altered by FMRP as determined by RiboDiff, the list of genes and their associated TPM values (used to select for genes with sufficient TPM), and the list of primary transcript isoforms provided by APPRIS.

In [7]:
# Directories used by the rest of the notebook. Each value ends with "/"
# because later cells build file names by plain string concatenation
# (for example ``reference_path + "mm10.refGene.gtf"``).
reference_path, data_path, save_path = (
    "/home/keeganfl/Desktop/Work_Fall_2021/Protocol_test/genome/mouse/",  # genome annotation
    "/home/keeganfl/Desktop/measuring_binding_domains/data/",  # input tables
    "/home/keeganfl/Desktop/measuring_binding_domains/Extra_tables/TE_tables/",  # output tables
)

## Step 4: 
Load the transcript annotation file so that we can find the coding sequence (CDS) coordinates of each gene.

In [9]:
# Load the transcript information.
# The GTF is opened inside a ``with`` block so the file handle is closed as
# soon as every transcript has been read into the list, instead of staying
# open for the lifetime of the kernel.
with open(reference_path + "mm10.refGene.gtf") as gtf_file:
    transcripts = list(GTF2_TranscriptAssembler(gtf_file, return_type=Transcript))

## Step 5:
Create a table that lists all genes and their associated lengths. 

### Right now I am using the CDS length instead of the full transcript length; I think that is correct.

In [58]:
# Collect, for every transcript that has both a gene name and a CDS, the
# gene name, the gene id and the CDS length (cds_end - cds_start).
lengths_cds = []
gene_name = []
gene_id = []
for tran in transcripts:
    # Skip transcripts whose annotation carries no "gene_name" attribute.
    try:
        name = tran.attr["gene_name"]
    except KeyError:
        continue
    # The subtraction raises TypeError when the CDS boundaries are not
    # numeric (presumably None for non-coding transcripts -- confirm against
    # the plastid documentation); such transcripts are skipped.
    try:
        cds_length = tran.cds_end - tran.cds_start
    except TypeError:
        continue
    # Look up every field before appending anything, so the three lists can
    # never drift out of step with one another.
    identifier = tran.attr["gene_id"]
    lengths_cds.append(cds_length)
    gene_name.append(name)
    gene_id.append(identifier)

# Building the frame from named columns also gives a well-formed (empty)
# table when no transcript qualifies, instead of failing while assigning
# column labels to a frame with zero columns.
all_genes = pd.DataFrame(
    {"gene_name": gene_name, "gene_id": gene_id, "lengths_cds": lengths_cds}
)

## Step 6:
Load up the list of APPRIS genes. This list includes the transcript IDs of all of the primary isoforms, and will be used to select a single transcript for each gene.

In [59]:
# Read the APPRIS table. Column names are supplied explicitly, so pandas
# treats every line of the file as data. The two columns that are not needed
# downstream ("???" and "rank") are discarded straight away.
prin_trans = pd.read_csv(
    data_path + "apris_mouse_data.txt",
    names=["gene_name", "???", "gene_id", "rank"],
).drop(columns=["???", "rank"])

## Step 7:
Merge the list of APPRIS genes with the list of all gene lengths in order to remove the non-primary isoforms.

In [61]:
# Join the gene-length table to the APPRIS table on gene name, then:
#   1. keep only rows whose gene name is present in both tables,
#   2. keep the first row per gene name (several transcripts of one gene can
#      survive the join),
#   3. drop the merge-indicator helper column.
# NOTE(review): the join key is "gene_name" only, so the APPRIS isoform ID
# (column "gene_id" of prin_trans) is not used to choose which transcript's
# CDS length is kept -- confirm that this is intended.
prime_genes = (
    pd.merge(all_genes, prin_trans, how="left", on="gene_name", indicator=True)
    .loc[lambda merged: merged._merge == "both"]
    .drop_duplicates(subset="gene_name", keep="first")
    .drop(columns=["_merge"])
)

## Step 8:
Load up the list of counts from each sample and then merge it with the list of primary gene lengths.

In [73]:
# Read the table of per-sample read counts.
mouse_counts = pd.read_csv(data_path + "counts/mmus_counts_SIM.csv")

# Attach the counts to the gene-length table. Only genes found in both
# tables are kept; the count table identifies genes in its "Entry" column.
len_counts = prime_genes.merge(
    mouse_counts,
    how="inner",
    left_on="gene_name",
    right_on="Entry",
)

## Step 9: 
Calculate TPM for each column using the equation 
<br />
$TPM = 10^6 \times \frac{\text{reads mapped to the transcript} \,/\, \text{transcript length}}{\sum \left(\text{reads mapped to each transcript} \,/\, \text{each transcript's length}\right)}$
<br />
Then add them as new columns to the pandas dataframe.

In [74]:
# Length-normalise the raw counts: for each count column add a "<name>_rpk"
# column holding counts / CDS length. No 1e3 factor is applied here; any
# constant factor would cancel out when TPM is computed from these values.
# NOTE(review): the count columns are selected by position (5 to 18), which
# silently depends on the column layout produced by the merges above.
count_columns = len_counts.columns[5:19]
for sample in count_columns:
    len_counts[f"{sample}_rpk"] = len_counts[sample].div(len_counts["lengths_cds"])

In [75]:
# Turn each "<name>_rpk" column into a "<name>_tpm" column by dividing it by
# its own column total expressed in millions.
for sample in len_counts.columns[5:19]:
    sample_rpk = len_counts[f"{sample}_rpk"]
    per_million = sum(sample_rpk) / 1e6
    len_counts[f"{sample}_tpm"] = sample_rpk / per_million

In [76]:
# The RPK columns were only an intermediate step towards TPM, so remove them.
# Note that running this cell a second time raises a KeyError, because the
# columns are already gone after the first run.
rpk_columns = [
    'RbCtlR1_rpk', 'RbCtlR2_rpk', 'RbCtlR3_rpk',
    'RbTrtR1_rpk', 'RbTrtR2_rpk', 'RbTrtR3_rpk',
    'RnaCtlR1_rpk', 'RnaCtlR2_rpk', 'RnaCtlR3_rpk', 'RnaCtlR4_rpk',
    'RnaTrtR1_rpk', 'RnaTrtR2_rpk', 'RnaTrtR3_rpk', 'RnaTrtR4_rpk',
]
len_counts.drop(columns=rpk_columns, inplace=True)

## Step 10:
Calculate the average TPM for the mutant (Trt) and control (Ctl) RNA-seq (Rna) and ribosome-profiling (Rb) samples, and then calculate the translation efficiency (TE) from those averages.

In [77]:
# Row-wise mean TPM across the replicates of each condition. Keys are the
# new column names, values are the replicate columns that are averaged:
# ribosome profiling control / treated, then RNA sequencing control / treated.
replicate_groups = {
    'RbCtl_avg_tpm': ['RbCtlR1_tpm', 'RbCtlR2_tpm', 'RbCtlR3_tpm'],
    'RbTrt_avg_tpm': ['RbTrtR1_tpm', 'RbTrtR2_tpm', 'RbTrtR3_tpm'],
    'RnaCtl_avg_tpm': ['RnaCtlR1_tpm', 'RnaCtlR2_tpm', 'RnaCtlR3_tpm', 'RnaCtlR4_tpm'],
    'RnaTrt_avg_tpm': ['RnaTrtR1_tpm', 'RnaTrtR2_tpm', 'RnaTrtR3_tpm', 'RnaTrtR4_tpm'],
}
for avg_column, replicate_columns in replicate_groups.items():
    len_counts[avg_column] = len_counts[replicate_columns].mean(axis=1)

In [78]:
# Translation efficiency per condition = average ribosome-profiling TPM
# divided by average RNA-seq TPM; the fold change is treated over control.
# Genes whose averages are all zero end up as NaN (0 / 0).
len_counts['Ctl_TE'] = len_counts['RbCtl_avg_tpm'].div(len_counts['RnaCtl_avg_tpm'])
len_counts['Trt_TE'] = len_counts['RbTrt_avg_tpm'].div(len_counts['RnaTrt_avg_tpm'])
len_counts['FldChng_TE'] = len_counts['Trt_TE'].div(len_counts['Ctl_TE'])

## Step 11
Drop unnecessary columns and save the CSV.

In [79]:
# Remove the helper columns that are not wanted in the exported table, then
# write the result (without the row index) to the output directory.
# Re-running this cell raises a KeyError because the columns are dropped
# in place on the first run.
unneeded_columns = ["gene_name", "gene_id_x", "lengths_cds"]
len_counts.drop(columns=unneeded_columns, inplace=True)

output_file = save_path + "TE_mouse.csv"
len_counts.to_csv(output_file, index=False)

In [80]:
# Show the final table as the cell output (pandas abbreviates the middle
# rows and columns of a large frame with "...").
len_counts

Unnamed: 0,gene_id_y,Entry,RbCtlR1,RbCtlR2,RbCtlR3,RbTrtR1,RbTrtR2,RbTrtR3,RnaCtlR1,RnaCtlR2,...,RnaTrtR2_tpm,RnaTrtR3_tpm,RnaTrtR4_tpm,RbCtl_avg_tpm,RbTrt_avg_tpm,RnaCtl_avg_tpm,RnaTrt_avg_tpm,Ctl_TE,Trt_TE,FldChng_TE
0,XM_006495550.3,Xkr4,138,202,195,151,183,137,1870,1910,...,55.921464,57.927163,55.148683,23.310738,23.564042,58.021414,56.614057,0.401761,0.416222,1.035995
1,NM_011283.2,Rp1,1,6,5,3,1,2,3,4,...,0.051003,0.021386,0.038781,0.292023,0.181681,0.083707,0.056736,3.488617,3.202203,0.917901
2,XM_011238364.2,Sox17,9,11,16,10,15,19,39,67,...,3.768338,1.446195,4.516580,3.464439,4.879884,3.555081,3.411406,0.974503,1.430461,1.467887
3,NM_001177658.1,Mrpl15,220,257,304,269,252,282,511,495,...,54.767556,49.896480,49.080406,108.263079,127.759845,52.791752,53.382329,2.050757,2.393298,1.167031
4,NM_008866.2,Lypla1,111,153,150,107,128,136,665,780,...,92.114362,97.057403,88.757785,59.521904,61.150223,75.740772,91.160124,0.785863,0.670800,0.853584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19952,NM_201530.2,Sly,0,0,0,0,0,0,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,,
19953,NM_001017393.3,Gm21943,0,0,0,0,0,0,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,,
19954,XM_017318833.1,LOC108168625,0,0,0,0,0,0,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,,
19955,NM_001160135.2,Gm20806,0,0,0,0,0,0,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,,
