# Summed Position Counts

This notebook creates a table of position counts in which all of the samples are summed together rather than kept as separate samples.

**TODO:** This notebook needs considerable reworking.

In [4]:
# Loading up the path to my tools file.
# (Presumably the directory that holds keegan_analysis_tools -- confirm.)
import sys

tools_path = '/home/keeganfl/Desktop/Work_Fall_2021/Fall_2021_Work_journal/tools/'
# Only append when missing, so re-running this cell does not stack duplicate
# entries onto sys.path.
if tools_path not in sys.path:
    sys.path.append(tools_path)

In [5]:
# Loading up required packages
from plastid import BAMGenomeArray, VariableFivePrimeMapFactory, \
                        GTF2_TranscriptAssembler, GFF3_TranscriptAssembler, \
                        Transcript, ThreePrimeMapFactory
import numpy as np
from Bio import SeqIO
import numpy
import pandas as pd
from scipy import stats
import keegan_analysis_tools as kat
import matplotlib.pyplot as plt
%matplotlib inline
import csv
import os
from scipy.sparse.linalg import lsqr

In [6]:
# Define the path to important files.
# Every location is derived from a single project root, so relocating the
# project only requires editing `work_root`.
work_root = '/home/keeganfl/Desktop/Work_Fall_2021/'
# Directory that holds the GTF annotation named in `gtf_name`.
data_path = work_root + 'genomes_&_samples/ecol/'
# Directory of the position count tables.
csv_path = work_root + 'data_tables/position_counts/ecol/'
# The summed tables are saved into the same directory as the count tables.
save_path = csv_path
gtf_name = "Escherichia_coli_str_k_12_substr_mg1655_gca_000005845.ASM584v2.51.gtf"

## Load up the transcript data

In [7]:
# load the transcript annotations from the GTF file.
# GTF2_TranscriptAssembler returns an iterator, so here we convert it to a list.
# The list is built inside the `with` block so the GTF file handle is closed as
# soon as every transcript has been read, instead of staying open in the kernel.
with open(data_path + gtf_name) as gtf_file:
    transcripts = list(GTF2_TranscriptAssembler(gtf_file, return_type=Transcript))

In [8]:
# Keep only the transcripts whose biotype attribute is 'protein_coding'.
protein_coding = [
    tx for tx in transcripts
    if tx.attr['transcript_biotype'] == 'protein_coding'
]

In [9]:
# Empty the original transcript list in place to free memory; the transcripts
# collected in `protein_coding` stay alive through that list.
del transcripts[:]

## Create the summed samples

In [10]:
# get a list of all the files you want to sum together.
# Reuses csv_path from the configuration cell rather than repeating the
# directory, and sorts the names because os.listdir returns them in arbitrary order.
all_samples = sorted(os.listdir(csv_path))

In [11]:
# Split up the files you want to sum together into mutants and controls.
# The summed tables this notebook saves land in the same directory it reads
# from, and 'control_summed_counts.csv' also starts with "control". Outputs of
# an earlier run are therefore skipped; otherwise a re-run would add the old
# control sum back into the new one.
summed_suffix = '_summed_counts.csv'
mutant_samples = []
control_samples = []
for sample in all_samples:
    if sample.endswith(summed_suffix):
        continue
    if sample.startswith("efp"):
        mutant_samples.append(sample)
    elif sample.startswith("control"):
        control_samples.append(sample)

In [12]:
# Create a function that will load up the necessary count arrays and sum them together
def get_summed_array(samples, count_path='/home/keeganfl/Desktop/Work_Fall_2021/data_tables/position_counts/ecol/'):
    """Load the position counts of every sample and add them together.

    Parameters
    ----------
    samples : sequence of str
        Sample names, each passed unchanged to ``kat.load_count_positions``.
    count_path : str, optional
        Directory handed to ``kat.load_count_positions``. Defaults to the
        directory this function previously had hard-coded.

    Returns
    -------
    list of list
        One plain Python list per count vector, holding the position-wise sum
        over all samples (presumably one vector per transcript -- confirm
        against ``kat.load_count_positions``).

    Raises
    ------
    ValueError
        If ``samples`` is empty, or if the samples do not all contain the same
        number of count vectors.
    """
    samples = list(samples)
    if not samples:
        # Previously this surfaced as an AttributeError from calling .tolist() on 0.
        raise ValueError("get_summed_array needs at least one sample to sum")

    per_sample = [kat.load_count_positions(name, count_path) for name in samples]

    # Every sample has to describe the same set of count vectors, otherwise the
    # element-wise sum below would pair up unrelated rows.
    expected = len(per_sample[0])
    for name, counts in zip(samples, per_sample):
        if len(counts) != expected:
            raise ValueError(
                "sample %r holds %d count vectors, expected %d"
                % (name, len(counts), expected)
            )

    # Sum one vector at a time instead of packing ragged rows into an object
    # array; this also works when every row happens to have the same length.
    summed_counts = []
    for vectors in zip(*per_sample):
        total = np.asarray(vectors[0])
        for vector in vectors[1:]:
            # `+` allocates a new array, so the loaded inputs are never modified.
            total = total + np.asarray(vector)
        # Convert back to a list so the data table is easy to build.
        summed_counts.append(total.tolist())
    return summed_counts

In [13]:
# Build one summed count table per condition: mutants first, then controls.
mutant_summed_array = get_summed_array(samples=mutant_samples)
control_summed_array = get_summed_array(samples=control_samples)

## Alter the summed arrays to be saved as CSVs

In [14]:
def load_array_as_csv(transcripts, summed_array):
    """Lay out summed counts as rows that ``csv.writer.writerows`` can save.

    Parameters
    ----------
    transcripts : iterable
        Transcript objects whose ``attr`` mapping provides "transcript_id" and
        "gene_name"; must be in the same order as ``summed_array``.
    summed_array : list of list
        One list of counts per transcript. It is not modified.

    Returns
    -------
    list of list
        A new table: a header row followed by one row per transcript of the
        form ``[transcript_id, gene_name, count_0, count_1, ...]``.

    Raises
    ------
    ValueError
        If the number of transcripts differs from the number of count rows.
    """
    transcripts = list(transcripts)
    if len(transcripts) != len(summed_array):
        # zip() would silently drop the surplus and leave rows without ids.
        raise ValueError(
            "got %d transcripts but %d rows of counts"
            % (len(transcripts), len(summed_array))
        )

    # Build fresh rows instead of inserting into the caller's lists, so calling
    # this twice on the same data cannot prefix the ids twice.
    rows = []
    for transcript, counts in zip(transcripts, summed_array):
        gene_name = transcript.attr["gene_name"]
        transcript_id = transcript.attr["transcript_id"]
        rows.append([transcript_id, gene_name] + list(counts))

    # NOTE(review): this is measured on rows that already carry the two id
    # columns, exactly as before. If kat.find_max_list returns the longest row
    # length, the header ends up two position columns wider than the longest
    # count vector -- confirm whether downstream readers rely on that.
    l_tr = kat.find_max_list(rows)

    # Header labels for the two id columns followed by the position indices.
    # NOTE(review): the "gene_id" column is filled from the gene_name attribute.
    header = ["transcript_id", "gene_id"] + list(range(l_tr))

    return [header] + rows

In [15]:
# Attach ids and a header to each summed table so it can be written as a CSV.
mutant_summed_array = load_array_as_csv(transcripts=protein_coding,
                                        summed_array=mutant_summed_array)
control_summed_array = load_array_as_csv(transcripts=protein_coding,
                                         summed_array=control_summed_array)

In [16]:
# Write each summed table to its own CSV file inside save_path.
for csv_name, table in (('mutant_summed_counts.csv', mutant_summed_array),
                        ('control_summed_counts.csv', control_summed_array)):
    with open(save_path + csv_name, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(table)