# Count Arrays for Greenblatt and Spradling D. melanogaster data

## Loading packages

In [122]:
# Let the program know where to find your tools file.
import sys
sys.path.append('/home/keeganfl/Desktop/Work_Fall_2021/Fall_2021_Work_journal/tools/')

In [123]:
# import plastid
# data structure for mapping read alignments to genomic positions
from plastid import BAMGenomeArray, VariableFivePrimeMapFactory, \
                        GTF2_TranscriptAssembler, GFF3_TranscriptAssembler, \
                        Transcript, ThreePrimeMapFactory
import numpy as np
import os
from Bio import SeqIO
import numpy
import math
import pandas as pd
from plastid.plotting.plots import *
from scipy import stats
from scipy.stats import kstwo
import keegan_analysis_tools as kat
from statsmodels.nonparametric.smoothers_lowess import lowess
import warnings
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
%matplotlib inline
import csv
from scipy.sparse.linalg import lsqr
from tqdm import tqdm
from multiprocess import Pool
import copy
import seaborn as sns

In [150]:
# Define the paths to our input/output directories and pick the sample to process.
# Directory holding the genome annotation (the GTF file is read from here).
data_path = "/home/keeganfl/Desktop/Work_Fall_2021/genomes_&_samples/dmel/"
# Output directory for the codon-resolution count table written at the end.
save_path = "/home/keeganfl/Desktop/Work_Fall_2021/data_tables/position_counts_codon_resolution/dmel/"
# Input directory holding the base-pair-resolution count CSVs.
csv_path = '/home/keeganfl/Desktop/Work_Fall_2021/data_tables/position_counts_bp_resolution/dmel/'
# Directory of P-site offsets (not referenced by the cells visible here).
p_site_path = "/home/keeganfl/Desktop/Work_Fall_2021/data_tables/p-site_offsets/dmel/"
# Sample selectors; both are spliced into the input and output file names.
mutation = 'Fmr1'
samp_num = '3'

In [9]:
# Load the transcript annotations from the GTF file.
# GTF2_TranscriptAssembler returns an iterator, so here we convert it to a list.
# The list is built inside the `with` block so the file is still open while the
# iterator is consumed, and is closed as soon as loading finishes.
with open(data_path + "Drosophila_melanogaster.BDGP6.32.103.gtf") as gtf_file:
    transcripts = list(GTF2_TranscriptAssembler(gtf_file, return_type=Transcript))

In [10]:
# Keep only the transcripts whose biotype attribute is 'protein_coding'.
protein_coding = [
    tx for tx in transcripts
    if tx.attr['transcript_biotype'] == 'protein_coding'
]

In [11]:
# Empty the original transcript list in place to free some memory; the
# transcripts referenced from protein_coding are unaffected.
del transcripts[:]

# Load up the count vectors

In [151]:
# Read the position counts for the selected sample from csv_path.
counts_file = f"{mutation}_RPF_{samp_num}_counts.csv"
count_vectors = kat.load_count_positions(counts_file, csv_path)

## Alter the count vectors to look at the codons over the cds region.

In [152]:
# Damnit damnit damnit, I ended up putting them through the cds thingy twice.

In [153]:
# Collapse every nucleotide-resolution count vector to codon resolution:
# np.add.reduceat sums the values between successive start indices 0, 3, 6, ...
# (a trailing group shorter than three positions is summed as-is).
codon_counts = [
    np.add.reduceat(vector, np.arange(0, len(vector), 3))
    for vector in count_vectors
]

In [154]:
# Convert each codon-count array to a plain Python list so it can be more
# easily saved. Slice assignment keeps updating codon_counts in place.
codon_counts[:] = [count.tolist() for count in codon_counts]

## Save as a csv

In [155]:
# Collect the identifiers of every protein-coding transcript, in list order.
# NOTE(review): gene_id is filled from the "gene_name" attribute rather than
# "gene_id" — confirm this is the intended value for the "gene_id" column.
gene_id = [tx.attr["gene_name"] for tx in protein_coding]
transcript_id = [tx.attr["transcript_id"] for tx in protein_coding]

In [156]:
# Prepend the transcript ID and gene ID to each row of codon_counts, giving
# rows of the form [transcript_id, gene_id, count_0, count_1, ...].
# zip() stops at the shortest input, so a length mismatch would silently leave
# rows unlabelled; surface that with a warning instead of hiding it.
if len(codon_counts) != len(gene_id):
    warnings.warn(
        "codon_counts has %d rows but %d transcript IDs were collected; "
        "only the first %d rows will be labelled."
        % (len(codon_counts), len(gene_id), min(len(codon_counts), len(gene_id)))
    )

for row, tx_id, g_id in zip(codon_counts, transcript_id, gene_id):
    # Slice assignment inserts both IDs at the front of the row in place.
    row[:0] = [tx_id, g_id]

In [157]:
# Longest row in codon_counts, as reported by kat.find_max_list.
# NOTE(review): the rows already carry the two ID columns at this point, so if
# find_max_list returns the longest row length, the header ends up with two
# more position labels than the longest row needs — confirm.
l_tr = kat.find_max_list(codon_counts)

# Header row: the two ID labels followed by the position indices 0..l_tr-1.
header = ["transcript_id", "gene_id", *range(l_tr)]

# Put the header in front of all the count rows (in place).
codon_counts[:0] = [header]

In [158]:
# Write the header and all count rows to <save_path><mutation>_<samp_num>_counts.csv.
out_file = f"{save_path}{mutation}_{samp_num}_counts.csv"
with open(out_file, 'w', newline='') as handle:
    csv.writer(handle).writerows(codon_counts)