In [1]:
from utils import *
from tools import *
import pandas as pd

In [2]:
# Input files for the TP53 analysis (transcript ENST00000420246.2).

# FASTA file holding the transcript sequence.
seq_file = "../sequences/tp53/Homo_sapiens_ENST00000420246_2_sequence.fa"

# TargetScan 8.0 "predicted targeting details" export for the same transcript.
canonical_result_file = "../sequences/tp53/TargetScan_8.0_ENST00000420246.2_predicted_targeting_details.txt"


In [3]:
# Read the transcript sequence from the FASTA file configured above.
# NOTE(review): import_fasta is not defined in this notebook; presumably it
# comes from the `utils` / `tools` star imports -- confirm.
sequence = import_fasta(seq_file)

# Load the tab-separated miRNA table (read_table defaults to sep="\t").
targetscan_df = pd.read_table("../data/supplementary_files/targetscan.tsv")

# Preview the first rows only.
targetscan_df.head()

Unnamed: 0,name,sequence,seed,conservation,accession
0,hsa-let-7a-5p,AACUAUACAACCUACUACCUCA,UACCUC,2,MIMAT0000062
1,hsa-let-7b-5p,AACCACACAACCUACUACCUCA,UACCUC,2,MIMAT0000063
2,hsa-let-7c-5p,AACCAUACAACCUACUACCUCA,UACCUC,2,MIMAT0000064
3,hsa-let-7d-5p,AACUAUGCAACCUACUACCUCU,UACCUC,2,MIMAT0000065
4,hsa-let-7e-5p,AACUAUACAACCUCCUACCUCA,UACCUC,2,MIMAT0000066


In [4]:
# Search the transcript for miRNA matches, then keep only matches whose start
# is greater than 30: the target window built later in this notebook slices
# from `start - 30`, so this keeps that lower bound positive.
results_df = find_matches(
    sequence, targetscan_df, ignore_first_15_nucleotides=True
).loc[lambda matches: matches["start"] > 30]

# Number of matches that survive the filter.
len(results_df)

646

In [5]:
def mrna_to_dna(string):
    """Return ``string`` with every "U" replaced by "T".

    Only the uppercase character "U" is substituted; every other character
    (including lowercase "u") is passed through unchanged. The input is
    iterated item by item and the result is always a ``str``.
    """
    dna_chars = []
    for base in string:
        if base == "U":
            dna_chars.append("T")
        else:
            dna_chars.append(base)
    return "".join(dna_chars)

In [6]:
# Display the whole transcript with U replaced by T (the full string is echoed).
mrna_to_dna(sequence)

'AGAGAGCATGAAAATGGTTCTATGACTTTGCCTGATACAGATGCTACTTGACTTACGATGGTGTTACTTCCTGATAAACTCGTCGTAAGTTGAAAATATTATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTGACATTCTCCACTTCTTGTTCCCCACTGACAGCCTCCCACCCCCATCTCTCCCTCCCCTGCCATTTTGGGTTTTGGGTCTTTGAACCCTTGCTTGCAATAGGTGTGCGTCAGAAGCACCCAGGACTTCCATTTGCTTTGTCCCGGGGCTCCACTGAACAAGTTGGCCTGCACTGGTGTTTTGTTGTGGGGAGGAGGATGGGGAGTAGGACATACCAGCTTAGATTTTAAGGTTTTTACTGTGAGGGATGTTTGGGAGATGTAAGAAATGTTCTTGCAGTTAAGGGTTAGTTTACAATCAGCCACATTCTAGGTAGGGGCCCACTTCACCGTACTAACCAGGGAAGCTGTCCCTCACTGTTGAATTTTCTCTAACTTCAAGGCCCATATCTGTGAAATGCTGGCATTTGCACCTACCTCACAGAGTGCATTGTGAGGGTTAATGAAATAATGTACATCTGGCCTTGAAACCACCTTTTATTACATGGGGTCTAGAACTTGACCCCCTTGAGGGTGCTTGTTCCCTCTCCCTGTTGGTCGGTGGGTTGGTAGTTTCTACAGTTGGGCAGCTGGTTAGGTAGAGGGAGTTGTCAAGTCTCTGCTGGCCCAGCCAAACCCTGTCTGACAACCTCTTGGTGAACCTTAGTACCTAAAAGGAAATCTCACCCCATCCCACACCCTGGAGGATTTCATCTCTTGTATATGATGATCTGG

In [7]:
# Build one (miRNA, target window) pair per match in results_df.
starts = results_df["start"].values
ends = results_df["end"].values
mirna_seqs = results_df["mirna_sequence"].values.tolist()

# Reverse each miRNA string, then pass it through uracil_to_thymine.
# NOTE(review): uracil_to_thymine is not defined in this notebook; presumably
# it comes from the `utils` / `tools` star imports and performs the same
# U -> T substitution as mrna_to_dna -- confirm.
seqq = [uracil_to_thymine(mirna[::-1]) for mirna in mirna_seqs]

# Convert the transcript once; the original re-ran the conversion for every
# match inside the comprehension, which is loop-invariant work.
dna_sequence = mrna_to_dna(sequence)

# Target window: 30 characters before the match start through 15 after its end.
seqs = [dna_sequence[start - 30:end + 15] for start, end in zip(starts, ends)]

df = pd.DataFrame(list(zip(seqq, seqs)), columns=["mirna_sequence", "sequence"])

In [8]:
# Record the lengths of both sequences *before* truncation as new columns.
df["mrna_lens"] = df["sequence"].str.len()
df["mirna_lens"] = df["mirna_sequence"].str.len()

# Keep only pairs long enough to be cut to the fixed sizes below.
# .copy() makes the result an independent frame, so the column assignments
# that follow do not write into a slice (avoids SettingWithCopyWarning).
long_enough = (df["mirna_lens"] >= 20) & (df["mrna_lens"] >= 50)
df = df[long_enough].copy()

# Truncate to fixed widths: first 20 characters of the miRNA string and
# first 50 characters of the target window.
df["mirna_sequence"] = df["mirna_sequence"].str.slice(0, 20)
df["sequence"] = df["sequence"].str.slice(0, 50)

In [9]:
# Inspect the filtered pairs; the length columns hold the pre-truncation lengths.
df

Unnamed: 0,mirna_sequence,sequence,mrna_lens,mirna_lens
0,AGACATAAGAGGAAACGGAC,GCATGAAAATGGTTCTATGACTTTGCCTGATACAGATGCTACTTGA...,51,23
1,TTTCATCGACATGGTAAACG,ATGGTTCTATGACTTTGCCTGATACAGATGCTACTTGACTTACGAT...,51,20
2,AGTTCATCAAAGTACTATTT,TGGTTCTATGACTTTGCCTGATACAGATGCTACTTGACTTACGATG...,52,22
4,AAGTTCATTAGGTCCTATCC,GGTTCTATGACTTTGCCTGATACAGATGCTACTTGACTTACGATGG...,51,22
5,AAGTTCATTAAGTCCTATCC,GGTTCTATGACTTTGCCTGATACAGATGCTACTTGACTTACGATGG...,51,21
...,...,...,...,...
637,GTTTCAAATTCTAGGAACTT,TTTTTATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCACCT...,51,22
638,AGTCGTTTGTAAATAACACA,TATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCACCTGTGT...,51,22
639,TCCGTCGCCCCACATCACCT,ATCGATCTCTTATTTTACAATAAAACTTTGCTGCCACCTGTGTGTC...,51,22
640,ATCCACCGCGGCCTCCTCAG,GATCTCTTATTTTACAATAAAACTTTGCTGCCACCTGTGTGTCTGA...,50,23


In [10]:
# Keep only the two sequence columns and write them to pred.tsv as a
# tab-separated file without a header row or index column.
results = df.loc[:, ["mirna_sequence", "sequence"]]
results.to_csv("pred.tsv", sep="\t", header=False, index=False)

In [11]:
# Display the two-column frame that was written to pred.tsv.
results

Unnamed: 0,mirna_sequence,sequence
0,AGACATAAGAGGAAACGGAC,GCATGAAAATGGTTCTATGACTTTGCCTGATACAGATGCTACTTGA...
1,TTTCATCGACATGGTAAACG,ATGGTTCTATGACTTTGCCTGATACAGATGCTACTTGACTTACGAT...
2,AGTTCATCAAAGTACTATTT,TGGTTCTATGACTTTGCCTGATACAGATGCTACTTGACTTACGATG...
4,AAGTTCATTAGGTCCTATCC,GGTTCTATGACTTTGCCTGATACAGATGCTACTTGACTTACGATGG...
5,AAGTTCATTAAGTCCTATCC,GGTTCTATGACTTTGCCTGATACAGATGCTACTTGACTTACGATGG...
...,...,...
637,GTTTCAAATTCTAGGAACTT,TTTTTATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCACCT...
638,AGTCGTTTGTAAATAACACA,TATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCACCTGTGT...
639,TCCGTCGCCCCACATCACCT,ATCGATCTCTTATTTTACAATAAAACTTTGCTGCCACCTGTGTGTC...
640,ATCCACCGCGGCCTCCTCAG,GATCTCTTATTTTACAATAAAACTTTGCTGCCACCTGTGTGTCTGA...
