In [3]:
import sys
# Add a local tools directory to the module search path before the imports below.
# NOTE(review): the project-local modules imported in the next cell
# (`keegan_analysis_tools`, `Loess`) presumably live in this directory -- confirm.
# The absolute path is specific to one machine.
sys.path.append('/home/keeganfl/Desktop/Work_Fall_2021/Fall_2021_Work_journal/tools/')

In [4]:
# import plastid
# data structure for mapping read alignments to genomic positions
from plastid import BAMGenomeArray, VariableFivePrimeMapFactory, \
                        GTF2_TranscriptAssembler, GFF3_TranscriptAssembler, \
                        Transcript, ThreePrimeMapFactory
import numpy as np
from Bio import SeqIO
import numpy
import Loess as lo
import pandas as pd
from scipy import stats
import keegan_analysis_tools as kat
from statsmodels.nonparametric.smoothers_lowess import lowess
import warnings
import matplotlib.pyplot as plt
%matplotlib inline
import csv
import os
from scipy.sparse.linalg import lsqr

In [19]:
# Locations of the input and output files used throughout this notebook.
# File name of the genome annotation (GTF); it is joined onto data_path when opened.
gtf_name = "Escherichia_coli_str_k_12_substr_mg1655_gca_000005845.ASM584v2.51.gtf"
# Directory that holds the annotation file.
data_path = '/home/keeganfl/Desktop/Work_Fall_2021/genomes_&_samples/ecol/'
# Directory of the per-sample position-count CSV files.
csv_path = '/home/keeganfl/Desktop/Work_Fall_2021/data_tables/position_counts/ecol/'
# Directory the summed tables are written to (currently the same folder as csv_path).
save_path = '/home/keeganfl/Desktop/Work_Fall_2021/data_tables/position_counts/ecol/'

In [20]:
# Get the names of all the count files that may be combined.
# The directory comes from csv_path (set in the configuration cell) instead of a
# repeated literal, and the listing is sorted because os.listdir returns entries
# in arbitrary order.
all_counts = sorted(os.listdir(csv_path))

In [21]:
# Split the count files into the mutant ("efp...") and control ("control...") groups.
# The summed tables written at the end of this notebook go into the same folder
# that was listed, and 'control_summed_counts.csv' also starts with "control".
# Skip those outputs so that a re-run does not add a previous result in as if it
# were another sample.
SUMMED_SUFFIX = "_summed_counts.csv"
sample_counts = [name for name in all_counts if not name.endswith(SUMMED_SUFFIX)]
mutant_counts = [name for name in sample_counts if name.startswith("efp")]
control_counts = [name for name in sample_counts if name.startswith("control")]

In [22]:
# Display the control file names selected above.
control_counts

['control_RPF_5_counts.csv',
 'control_RPF_2_counts.csv',
 'control_RPF_4_counts.csv',
 'control_RPF_3_counts.csv',
 'control_RPF_1_counts.csv']

Note to self: If you ever want a good example of something that could be run with multiprocessing, this would be it. 

In [23]:
def get_summed_array(counts, count_dir='/home/keeganfl/Desktop/Work_Fall_2021/data_tables/position_counts/ecol/'):
    """Load several position-count tables and add them together row by row.

    Parameters
    ----------
    counts : list of str
        File names of the count tables to load.
    count_dir : str, optional
        Directory passed to ``kat.load_count_positions`` along with each file
        name. Defaults to the location that was previously hard-coded here.

    Returns
    -------
    list of list
        One plain Python list per row, holding the summed values.
        NOTE(review): assumes every table has the same rows in the same order
        and that ``kat.load_count_positions`` returns one numeric numpy array
        per row -- confirm against that helper.

    Raises
    ------
    ValueError
        If ``counts`` is empty, because there is nothing to sum.
    """
    if not counts:
        # sum([]) is the integer 0, which used to fail later with an obscure
        # AttributeError; fail early with a clear message instead.
        raise ValueError("get_summed_array needs at least one count file; got none.")

    # Wrap each loaded table in an object array so that adding two tables adds
    # them row by row, even when rows differ in length.
    array_list = [
        np.array(kat.load_count_positions(c, count_dir), dtype=object)
        for c in counts
    ]

    # Sum all of the count arrays together.
    summed_array = sum(array_list)

    # Convert everything back to lists so the data tables are easy to build.
    if summed_array.ndim > 1:
        # When every row has the same length numpy builds a 2-D array, and
        # tolist() already returns fully nested lists.
        return summed_array.tolist()
    # Otherwise each element is its own per-row array and is converted on its own.
    return [row.tolist() for row in summed_array]


In [24]:
# Sum the position counts across every file in mutant_counts.
mutant_summed_array = get_summed_array(mutant_counts)

In [25]:
# Sum the position counts across every file in control_counts.
control_summed_array = get_summed_array(control_counts)

In [26]:
# Load the transcript annotations from the GTF file.
# GTF2_TranscriptAssembler returns an iterator, so it is converted to a list
# while the file is still open. The with-block then closes the handle, which the
# bare open() call used to leave open.
with open(data_path + gtf_name) as gtf_file:
    transcripts = list(GTF2_TranscriptAssembler(gtf_file, return_type=Transcript))

In [27]:
# Keep only the transcripts whose biotype attribute marks them as protein coding.
protein_coding = [
    tx for tx in transcripts
    if tx.attr['transcript_biotype'] == 'protein_coding'
]

In [28]:
# Free some memory by emptying the original transcript list. The protein_coding
# list built in the previous cell keeps its own references to the selected transcripts.
# NOTE: after this runs `transcripts` is empty, so the filtering cell cannot be
# re-run without reloading the GTF file first.
transcripts.clear()

In [29]:
def load_array_as_csv(transcripts, summed_array):
    """Build a CSV-ready table by labelling each count row with its transcript.

    Despite the name, nothing is loaded or written here. The function returns
    a new list of rows that a CSV writer can consume. The inputs are left
    untouched, so calling it again on the same data gives the same result.

    Parameters
    ----------
    transcripts : list
        Transcript objects whose ``attr`` mapping provides ``"transcript_id"``
        and ``"gene_name"``.
    summed_array : list of list
        One row of counts per transcript, in the same order as ``transcripts``.

    Returns
    -------
    list of list
        A header row followed by one row per transcript, laid out as
        ``[transcript_id, gene_name, count_0, count_1, ...]``. The second
        header label is ``"gene_id"`` although the column holds ``gene_name``.

    Raises
    ------
    ValueError
        If the number of count rows differs from the number of transcripts.
        Pairing them anyway would leave rows unlabelled or mislabelled.
    """
    if len(transcripts) != len(summed_array):
        raise ValueError(
            "Cannot label counts: got %d transcripts but %d count rows."
            % (len(transcripts), len(summed_array))
        )

    # Put the transcript id and gene name in front of a copy of each count row.
    labelled_rows = [
        [tx.attr["transcript_id"], tx.attr["gene_name"], *row]
        for tx, row in zip(transcripts, summed_array)
    ]

    # Measured on the labelled rows, exactly as before, so the header width is
    # unchanged. NOTE(review): kat.find_max_list presumably returns the length
    # of the longest row -- confirm against that helper.
    l_tr = kat.find_max_list(labelled_rows)

    # Header: labels for the two id columns, then numbers that index the
    # position within the cds region.
    header = ["transcript_id", "gene_id"] + list(range(l_tr))

    return [header] + labelled_rows

In [30]:
# Attach the transcript/gene labels and a header row to each summed table.
# NOTE: each result is assigned back to the name that was passed in, so running
# this cell a second time feeds an already-labelled table back into the function.
mutant_summed_array = load_array_as_csv(protein_coding, mutant_summed_array)
control_summed_array = load_array_as_csv(protein_coding, control_summed_array)

In [31]:
# Write each labelled table to its own CSV file inside save_path.
for out_name, table in (
    ('mutant_summed_counts.csv', mutant_summed_array),
    ('control_summed_counts.csv', control_summed_array),
):
    # newline='' lets the csv module control line endings itself.
    with open(save_path + out_name, 'w', newline='') as out_file:
        csv.writer(out_file).writerows(table)