# Quantifying poly(A) sites as junction counts relative to 5' TE splice junction


The 'throw it out there' idea is to see whether this is reliable, i.e. just considering read counts across poly(A) sites (accounting for a minimum overhang).

In [None]:
import os
import pandas as pd
import pysam
os.getcwd()

paqr_rel_usages_path = "../../data/relative_usages.filtered.tsv"
rel_usages_header_path = "../../data/relative_usages.header.out"
m323k_wt_1_bam_path = "/home/sam/cluster/TDP43_RNA/TDP_F210I_M323K/M323K/New_adult_brain/processed/M323K_WT_1/M323K_WT_1_unique_rg_fixed.bam"
m323k_hom_1_bam_path = "/home/sam/cluster/TDP43_RNA/TDP_F210I_M323K/M323K/New_adult_brain/processed/M323K_HOM_1/M323K_HOM_1_unique_rg_fixed.bam"

# Column names are not taken from the TSV: they are built below from a fixed
# list plus the separate header file and then assigned wholesale. With the
# pandas default (header inferred from the first line) the first data row
# would be consumed as column names and silently lost on that assignment,
# so the table is read with header=None.
# NOTE(review): assumes the TSV itself has no header line - confirm.
rel_usages = pd.read_csv(paqr_rel_usages_path, sep="\t", header=None)

# One sample name per line
with open(rel_usages_header_path) as inpt:
    sample_names = [line.rstrip() for line in inpt]

#First 10 columns are the same in relative usages df
colnames_rel_usages = [
    "chr",
    "cluster_start",
    "cluster_end",
    "site_id",
    "score",
    "strand",
    "n_along_exon",
    "total_sites_on_exon",
    "paqr_exon_id",
    "gene_id",  # this is technically transcript_id but for now I'll leave it...
]

#Rest of columns are samples names in order found in relative usages output df
colnames_rel_usages.extend(sample_names)

print(colnames_rel_usages)


#Because samples are paired according to config - can appear multiple times in df...
sample_names = set(sample_names)
print(sample_names)

# Raises ValueError if the number of names does not match the number of
# columns in the table, which doubles as a sanity check on the header file.
rel_usages.columns = colnames_rel_usages
print(rel_usages)

In [None]:

#grouped = rel_usages.groupby("gene_id")
#print(grouped)

#for a, b in grouped:
#    print(a)
#    print(b["site_id"].to_list())

    
    
def polyASite_id_to_coordinate_tuple(paqr_df, site_id_colname = "site_id", group_col = "gene_id"):
    '''
    Map every poly(A) site ID to a single-base genomic interval, per group.

    Site IDs are expected to be ':'-separated with the chromosome in the
    first field and the site coordinate in the third field,
    e.g. 'chr11:+:55110898:TE'.

    The coordinate in the ID is assumed to be 1-based. The returned interval
    is 0-based, half-open and covers exactly that base:
    (coordinate - 1, coordinate).

    Parameters
    ----------
    paqr_df : pandas.DataFrame
        Table containing at least `site_id_colname` and `group_col`.
    site_id_colname : str
        Name of the column holding the site IDs.
    group_col : str
        Name of the column to group sites by.

    Returns
    -------
    dict
        Nested dict of {<group_col_key>: {site_id: (chr, start, end)}}

    Raises
    ------
    IndexError
        If a site ID has fewer than three ':'-separated fields.
    ValueError
        If the coordinate field of a site ID is not an integer.
    '''

    out_dict = {}

    for group_name, group in paqr_df.groupby(group_col):

        nested_dict = {}

        for site in group[site_id_colname].to_list():
            #'chr11:+:55110898:TE'
            fields = site.split(':')
            position = int(fields[2])

            # 1-based position p  ->  0-based half-open interval [p - 1, p)
            nested_dict[site] = (fields[0], position - 1, position)

        out_dict[group_name] = nested_dict

    return out_dict


polya_jnc_coords = polyASite_id_to_coordinate_tuple(rel_usages)
#print(polya_jnc_coords)


### For testing, keep only the first five transcripts (in dict order)
small_polya_jnc_coords = dict(list(polya_jnc_coords.items())[:5])
#print(small_polya_jnc_coords)

Now that I've got my PAQR-inferred / PolyASite poly(A) sites, I want to try and count the number of reads that span these positions - 'junction reads'.

As I will want to quantify these junctions relative to the splice junction at the 5' end of the terminal exon, I have to treat the poly(A) junction alignments as if they were a splice junction (where sequence is disjointly aligned to the genome).

STAR (and likely other splice-aware aligners) has a --alignSJDBoverhangMin parameter, which requires that a putative junction read 'overhangs' the junction by at least x nt (x = 3 by default) in order for it to be assigned to the junction.


Reads aligned to genome at poly(A) sites aren't subject to this parameter, so junction counts would be inflated relative to the 5' splice junction


Using pysam, for each read crossing the poly(A) site, check whether the start or end of the read alignment falls within 3 nt of the poly(A) site coordinate:

abs(polya_site - alignment_start) > 3 and abs(alignment_end - polya_site) > 3 = valid poly(A) site junction read





In [None]:
#WT 1

wt_1 = pysam.AlignmentFile(m323k_wt_1_bam_path, "rb")

# {transcript: {poly(A) site id: junction read count}}
jnc_counts_dict = {}

for transcript, polya_dict in small_polya_jnc_coords.items():

    # junction read counts for every poly(A) site in this transcript
    per_site_counts = {}

    for polya_site, coords_tuple in polya_dict.items():
        site_start = coords_tuple[1]

        # Count the reads overlapping the site whose alignment start
        # (reference_start) and alignment end (reference_end) are both
        # more than 3 nt away from the site coordinate.
        per_site_counts[polya_site] = sum(
            1
            for read in wt_1.fetch(coords_tuple[0], site_start, coords_tuple[2])
            if abs(site_start - read.reference_start) > 3
            and abs(read.reference_end - site_start) > 3
        )

    jnc_counts_dict[transcript] = per_site_counts


print(jnc_counts_dict)


def is_polya_junction_read(read_entry,polya_start, sj_overhang):
    '''
    Decide whether an aligned read counts as a 'junction read' for a poly(A) site.

    A read qualifies when both ends of its alignment lie more than
    `sj_overhang` nt away from the poly(A) site coordinate, i.e.
    abs(polya_start - reference_start) > sj_overhang and
    abs(reference_end - polya_start) > sj_overhang.

    Parameters
    ----------
    read_entry :
        Object exposing `reference_start` and `reference_end`
        (e.g. a read yielded by pysam's fetch()).
    polya_start : int
        Coordinate of the poly(A) site.
    sj_overhang : int
        Distance that both alignment ends must exceed.

    Returns
    -------
    bool
        True if the read is a valid poly(A) junction read, False otherwise.
        Reads without alignment coordinates are never junction reads.
    '''

    #0-based leftmost reference coordinate of the aligned sequence
    align_start = read_entry.reference_start

    # Reference_end points to one past the last aligned residue.
    # It is None if not available (read is unmapped or no cigar alignment present).
    align_end = read_entry.reference_end

    # Without both coordinates the distances below cannot be computed;
    # reject the read instead of failing with a TypeError on None arithmetic.
    if align_start is None or align_end is None:
        return False

    return (abs(polya_start - align_start) > sj_overhang
            and abs(align_end - polya_start) > sj_overhang)
    

    
#Try a more compartmentalised version of count_dict
jnc_counts_dict_2 = {}

for transcript, polya_dict in small_polya_jnc_coords.items():

    tr_polya_counts = {}

    for polya_site, coords_tuple in polya_dict.items():
        # reads overlapping the poly(A) site interval
        overlapping_reads = wt_1.fetch(coords_tuple[0], coords_tuple[1], coords_tuple[2])

        # True counts as 1, so the sum is the number of junction reads
        tr_polya_counts[polya_site] = sum(
            is_polya_junction_read(read, coords_tuple[1], 3)
            for read in overlapping_reads
        )

    jnc_counts_dict_2[transcript] = tr_polya_counts


print(jnc_counts_dict_2)

wt_1.close()

def get_polya_junction_counts_dict(bam_path, jnc_coords_dict, sj_overhang):
    '''
    Count poly(A) 'junction reads' in a BAM file for every poly(A) site.

    For each site, all reads returned by fetch() for the site's interval are
    tested with is_polya_junction_read() and the passing reads are counted.

    Parameters
    ----------
    bam_path : str
        Path to the BAM file (opened in "rb" mode).
    jnc_coords_dict : dict
        Nested dict of {transcript: {site_id: (chr, start, end)}}.
    sj_overhang : int
        Overhang threshold passed on to is_polya_junction_read().

    Returns
    -------
    dict
        Nested dict of {transcript: {site_id: junction read count}}.
    '''

    #final output dict of {tr: {site_id: jnc_counts}}
    counts_dict = {}

    # Context manager: the BAM handle is closed even if fetch() or the
    # junction test raises part-way through.
    with pysam.AlignmentFile(bam_path, "rb") as bam:

        for transcript, polya_coords_dict in jnc_coords_dict.items():

            polya_coords_counts = {}

            for polya_site, coords_tuple in polya_coords_dict.items():
                chrom, start, end = coords_tuple[:3]

                # True counts as 1, so the sum is the number of junction reads
                polya_coords_counts[polya_site] = sum(
                    is_polya_junction_read(read, start, sj_overhang)
                    for read in bam.fetch(chrom, start, end)
                )

            counts_dict[transcript] = polya_coords_counts

    return counts_dict

# Junction read counts for the test transcripts in one WT and one HOM sample
# (overhang threshold of 3 for both)
wt_1_polya_jnc_counts, hom_1_polya_jnc_counts = (
    get_polya_junction_counts_dict(bam_path, small_polya_jnc_coords, 3)
    for bam_path in (m323k_wt_1_bam_path, m323k_hom_1_bam_path)
)

# Number of transcripts counted per sample, one per line
print(len(wt_1_polya_jnc_counts), len(hom_1_polya_jnc_counts), sep="\n")

# The count dicts themselves, one per line
print(wt_1_polya_jnc_counts, hom_1_polya_jnc_counts, sep="\n")