# TODO:

fig. 1

Stamoulakatou, E., Pinoli, P., Ceri, S., & Piro, R. M. (2020). Impact of mutational signatures on microRNA and their response elements. Pacific Symposium on Biocomputing. Pacific Symposium on Biocomputing, 25, 250–261. 


- Let the DNA nucleotide alphabet be represented by the set $A = \{a, c, g, t\}$

- nucleotide triplets $t = \langle a_1, a_2, a_3 \rangle \in A^3$

- single nucleotide variants are described by 96 possible somatic mutation types $m = \langle a_1, [a_2 \to a_4], a_3 \rangle$

- $P_g(t)$: the empirical probability (i.e., frequency) of observing the triplet $t$ in the given genome $g$ (human assembly)

such that: $\sum_{t \in A^3} P_g(t) = 1$

- s: the mutational signature associated with a specific mutational process


# Code

In [None]:
# importing dependencies
from Bio import SeqIO
import pandas as pd

from itertools import product

In [None]:
# import & parse fasta

# Read every record of the FASTA file and keep each one as a plain string.
with open ("../data/bcl2_canonical_3utr.fa") as f:
    # SeqIO.parse yields the records lazily, so it must be consumed while
    # the file handle is still open
    records = SeqIO.parse(f, "fasta")
    
    transcripts = [str(rec.seq) for rec in records]
    
    # string containing whole sequence
    # NOTE(review): if the file holds more than one record, joining them
    # creates triplets that span record boundaries once the sequence is cut
    # into overlapping windows -- confirm the file contains a single record.
    sequence = "".join(transcripts)

# total number of characters in the concatenated sequence
len(sequence)

In [None]:
# function that finds triplets

def find_triplets(sequence):
    """Return every overlapping window of three items in ``sequence``.

    Windows are taken left to right with a step of one, so an input of
    length ``n`` yields ``n - 2`` triplets. Inputs shorter than three items
    yield an empty list.
    """
    window_count = len(sequence) - 2
    return [sequence[start : start + 3] for start in range(window_count)]

# generate the overlapping triplets of the parsed sequence
triplets = find_triplets(sequence)
# number of triplets produced
len(triplets)

# last triplet of the sequence (raises IndexError if no triplet was produced)
triplets[-1]

In [None]:
# the four DNA bases and all 64 ordered triplets that can be built from them
# (later used as the keys of the triplet:frequency dict)
alphabet = ["A", "C", "G", "T"]
triplet_combinations = ["".join(bases) for bases in product(alphabet, repeat=3)]



# functions that take complement of a triplet
def complement(n):
    """Return the complementary base of the single nucleotide ``n``.

    ``"A"`` maps to ``"T"``, ``"C"`` to ``"G"`` and ``"G"`` to ``"C"``.
    Every other value (``"T"``, but also anything unexpected) maps to ``"A"``.
    """
    for base, partner in (("A", "T"), ("C", "G"), ("G", "C")):
        if n == base:
            return partner
    # fall-through case: "T" and any value not listed above
    return "A"
    
def complement_of_triplet(triplet):
    """Return ``triplet`` with every character replaced by its complement.

    The character order is kept; the result is not reversed.
    """
    complemented = map(complement, triplet)
    return "".join(complemented)
    

# Keep exactly one triplet from each complementary pair (the premise being
# that both members of a pair occur with the same frequency).
# NOTE(review): strand symmetry is usually expressed through the reverse
# complement -- confirm that the plain complement is what is intended here.
#
# The loop walks a snapshot of the list. Removing items from the very list
# that is being iterated shifts the remaining items left, so the element
# right after each removal would be skipped and never examined.
for triplet in tuple(triplet_combinations):
    # The complement is still in the list only while the pair is intact, so
    # the first member encountered is dropped and the second one is kept.
    if complement_of_triplet(triplet) in triplet_combinations:
        triplet_combinations.remove(triplet)



# build the triplet:frequency dict with one entry per remaining triplet;
# every value starts at 0
observation_probability = dict.fromkeys(triplet_combinations, 0)
print(observation_probability)



In [None]:
# spot-check: value stored for the triplet "TTG" (raises KeyError if "TTG"
# is not one of the keys)
observation_probability["TTG"]

In [None]:
def get_triplet_probability(triplet):
    """Return the value stored in ``observation_probability`` for ``triplet``.

    A triplet that is not a key of the dict is looked up under its
    complement instead. A ``KeyError`` propagates if the complement is not
    a key either.
    """
    if triplet not in observation_probability:
        # fall back to the complementary triplet as the lookup key
        triplet = complement_of_triplet(triplet)
    return observation_probability[triplet]



# value stored for the first triplet of the sequence
probs = get_triplet_probability(triplets[0]) 

print(probs)

# print the stored value of every triplet, in sequence order
for triplet in triplets:
    print(get_triplet_probability(triplet))


# misc. codes

In [None]:
def complement(n):
    """Return the complementary base of the single nucleotide ``n``.

    ``"A"`` maps to ``"T"``, ``"C"`` to ``"G"`` and ``"G"`` to ``"C"``.
    Every other value (``"T"``, but also anything unexpected) maps to ``"A"``.
    """
    for base, partner in (("A", "T"), ("C", "G"), ("G", "C")):
        if n == base:
            return partner
    # fall-through case: "T" and any value not listed above
    return "A"

In [None]:
def reverse_complement(t):
    """Return the reverse complement of ``t``.

    The input is read back to front and every character is replaced by its
    complement.
    """
    backwards = t[::-1]
    return "".join(map(complement, backwards))