# Count Arrays for Woolstenhulme E.coli data

This notebook contains code for working with arrays of read counts at each transcript position. In the cells below, count arrays stored at nucleotide resolution are loaded, summed into codon-resolution bins, and saved as a CSV file. 

Note that depending on the number of samples you are using, this notebook may use up a large amount of RAM (about 2-4 GB per sample). 

## loading packages

In [34]:
# Let the program know where to find your tools file.
import sys
sys.path.append('/home/keeganfl/Desktop/Work_Fall_2021/Fall_2021_Work_journal/tools/')

In [35]:
# import plastid
# data structure for mapping read alignments to genomic positions
from plastid import BAMGenomeArray, VariableFivePrimeMapFactory, \
                        GTF2_TranscriptAssembler, GFF3_TranscriptAssembler, \
                        Transcript, ThreePrimeMapFactory
import numpy as np
import os
from Bio import SeqIO
import numpy
import math
import pandas as pd
from plastid.plotting.plots import *
from scipy import stats
from scipy.stats import kstwo
import keegan_analysis_tools as kat
from statsmodels.nonparametric.smoothers_lowess import lowess
import warnings
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
%matplotlib inline
import csv
from scipy.sparse.linalg import lsqr
from tqdm import tqdm
from multiprocess import Pool
import copy
import seaborn as sns

In [197]:
# Input/output locations and sample selection used by the cells below.
# Directory holding the genome annotation (the GTF file is read from here).
data_path = '/home/keeganfl/Desktop/Work_Fall_2021/genomes_&_samples/ecol/'
# Output directory for the codon-resolution count table written at the end.
save_path = "/home/keeganfl/Desktop/Work_Fall_2021/data_tables/position_counts_codon_resolution/ecol/"
# Input directory from which the count table is loaded.
csv_path = '/home/keeganfl/Desktop/Work_Fall_2021/data_tables/position_counts_bp_resolution/ecol/'
# NOTE(review): p_site_path is not referenced by any later cell visible in
# this notebook - confirm whether it is still needed.
p_site_path = '/home/keeganfl/Desktop/Work_Fall_2021/data_tables/p-site_offsets/ecol/'
# Sample selection: together these pick the input file
# (<mutation>_RPF_<samp_num>_counts.csv) and name the output file
# (<mutation>_<samp_num>_counts.csv).
mutation = 'control'
samp_num = '5'

In [3]:
def variable_threeprime_map_function(alignments,segment,p_offsets):
    '''
    Map read alignments to the location of the ribosomal P-site, counting
    the offset from the 3' end of each read. The offset to use for each read
    length is taken from a table generated using RiboWaltz.

    alignments:
        Iterable of read alignments, passed to the function by a
        BAMGenomeArray created by plastid. Each read must expose a
        `positions` sequence holding its aligned genomic positions.

    segment:
        The genomic segment being counted, passed to the function by a
        BAMGenomeArray created by plastid. It must provide `strand`,
        `start`, `end` and support `len()`.

    p_offsets:
        A pandas dataframe (or any mapping of column name to sequence) with
        a "length" column and a "p_offset" column, following this template.
            length          p_offset
             28              12
             29              12
             30              13
            ...             ...
        If a read length is listed more than once, the last row wins.

    returns:
        A tuple `(reads_out, count_array)`. `reads_out` lists the reads whose
        P-site falls inside `segment`; `count_array` is a numpy array of
        length `len(segment)` holding the number of P-sites at each position.

    raises:
        ValueError if `segment.strand` is not one of "+", "-" or ".".
    '''
    strand = segment.strand
    if strand not in ("+", ".", "-"):
        # Previously this surfaced as an UnboundLocalError on the first
        # read that matched a length in the offset table.
        raise ValueError("Unsupported strand %r; expected '+', '-' or '.'" % (strand,))

    # Build the length -> offset lookup once instead of rescanning the whole
    # table for every read. This also guarantees that a read is counted at
    # most once even if the table lists its length more than once.
    offset_by_length = dict(zip(p_offsets["length"], p_offsets["p_offset"]))

    reads_out = []
    count_array = numpy.zeros(len(segment))
    for read in alignments:
        positions = read.positions
        read_length = len(positions)
        if read_length not in offset_by_length:
            continue # no offset is defined for reads of this length.
        offset = offset_by_length[read_length]

        if strand == "-":
            # minus strand: count the offset from the other end of the read
            p_site = positions[offset]
        else:
            # plus strand or unstranded: count the offset 3' to 5'
            p_site = positions[-offset - 1]

        # Only keep reads whose P-site lies inside the requested segment.
        if segment.start <= p_site < segment.end:
            reads_out.append(read)
            count_array[p_site - segment.start] += 1
    return reads_out, count_array

In [4]:
def VariableThreePrimeMapFactory(p_offsets):
    '''
    Build a mapping function that has the P-site offset table baked in.

    BamGenome array objects only pass the alignments and segment arguments
    to a mapping function, so the offsets cannot be handed over directly.
    The function returned here accepts just those two arguments and forwards
    them, together with `p_offsets`, to variable_threeprime_map_function.
    '''
    def map_with_offsets(alignments,segment):
        # p_offsets is captured from the enclosing call.
        return variable_threeprime_map_function(alignments, segment, p_offsets)

    return map_with_offsets

In [5]:
# Create a function that finds the proteins I need. 
def find_transcript(gene,transcripts, count_vectors):
    '''
    Find the protein-coding transcript that belongs to a gene.

    gene:
        The gene name to look for (compared against attr['gene_name']).

    transcripts:
        A list of transcripts; each must expose an `attr` dictionary.
        Only entries whose attr['transcript_biotype'] is 'protein_coding'
        are considered. Entries lacking either attribute are skipped.

    count_vectors:
        A sequence of count vectors, index-aligned with `transcripts`.

    returns:
        A tuple `(transcript, count_vector, index)`.

    raises:
        ValueError if no protein-coding transcript matches `gene`.

    This function is still a work in progress: when several transcripts
    match the gene name, the last one in the list is returned.
    '''
    match_index = None
    # enumerate gives the true position of each transcript. The previous
    # transcripts.index(i) lookup returned the first element comparing
    # *equal* to the match, which could be a different transcript, and it
    # rescanned the list on every hit.
    for position, candidate in enumerate(transcripts):
        attrs = candidate.attr
        if attrs.get('transcript_biotype') != 'protein_coding':
            continue
        if attrs.get('gene_name') == gene:
            match_index = position

    if match_index is None:
        # Previously this case failed with an opaque UnboundLocalError.
        raise ValueError("No protein_coding transcript found for gene %r" % (gene,))

    return transcripts[match_index], count_vectors[match_index], match_index

In [6]:
def find_max_list(list):
    ''' 
    Return the length of the longest list/array found in a list of lists. 
    '''
    # Measure every element lazily and keep the largest length.
    return max(map(len, list))

## Loading up the data files 

In [4]:
# Load the transcript annotations from the GTF file.
# GTF2_TranscriptAssembler returns an iterator, so it is converted to a list
# inside the `with` block, i.e. while the file is still open. The `with`
# statement then closes the file handle, which was previously left open.
with open(data_path + "Escherichia_coli_str_k_12_substr_mg1655_gca_000005845.ASM584v2.51.gtf") as gtf_file:
    transcripts = list(GTF2_TranscriptAssembler(gtf_file,return_type=Transcript))

In [5]:
# Keep only the protein-coding transcripts from the transcripts list.
protein_coding = [
    tr for tr in transcripts
    if tr.attr['transcript_biotype'] == 'protein_coding'
]

In [6]:
# Empty the full transcript list now that the protein-coding entries have been
# collected in `protein_coding` (presumably to release the other transcripts
# from memory - confirm). Because the list is emptied in place, re-running the
# filtering cell after this point produces an empty `protein_coding` list
# unless the GTF file is loaded again first.
transcripts.clear()

## Load up the count vectors

In [198]:
# Load the count positions for the selected sample; the file name is built
# from the mutation label and the sample number.
counts_file = mutation+"_RPF_"+samp_num+"_counts.csv"
count_vectors = kat.load_count_positions(counts_file, csv_path)

## Alter the count vectors to look at the codons over the cds region.

In [199]:
# Collapse each count vector from nucleotide resolution to codon resolution
# by summing consecutive groups of three positions (a trailing group shorter
# than three positions is summed on its own).
codon_counts = [
    np.add.reduceat(vec, np.arange(0, len(vec), 3))
    for vec in count_vectors
]

In [200]:
# Turn each codon-count array into a plain Python list, in place, so the
# rows can be extended with ID columns and written out with the csv module.
for idx, counts in enumerate(codon_counts):
    codon_counts[idx] = counts.tolist()

## Save as a csv

In [201]:
# Collect the gene name and transcript ID of every protein-coding transcript,
# in the same order as the protein_coding list.
gene_id = [tr.attr["gene_name"] for tr in protein_coding]
transcript_id = [tr.attr["transcript_id"] for tr in protein_coding]

In [202]:
# Prepend the transcript ID and gene ID (in that order) to every row of
# codon counts. Rows and IDs are paired purely by position.
# NOTE(review): zip stops at the shorter input, so a length mismatch between
# codon_counts and the ID lists would go unnoticed - confirm they line up.
for row, tr_id, g_id in zip(codon_counts, transcript_id, gene_id):
    row[:0] = [tr_id, g_id]

In [203]:
# Labels for the two ID columns that precede the counts in every row.
id_columns = ["transcript_id","gene_id"]

# Calculate the longest cds region in our list of counts. Every row already
# starts with its transcript and gene ID, so those leading entries are
# subtracted from the longest row length; otherwise the header would contain
# two position columns more than any row has.
# (assumes kat.find_max_list returns the length of the longest row - confirm)
l_tr = kat.find_max_list(codon_counts) - len(id_columns)

# Define a header that includes labels for the transcript and gene ID as 
# well as numbers that index the cds region position.
header = id_columns + list(range(l_tr))

# insert that header into our counts list. 
codon_counts.insert(0,header)

In [204]:
# Write every row of codon_counts to the output CSV for this sample.
out_file = save_path + mutation + "_" + samp_num + '_counts.csv'
with open(out_file, 'w', newline='') as handle:
    csv.writer(handle).writerows(codon_counts)