In [3]:
# importing dependencies
from Bio import SeqIO
import pandas as pd
from collections import Counter


# importing main sequence



In [4]:
# Load the target FASTA and keep every record's sequence as a plain string.
# NOTE(review): record_count is set to 0 here and never updated in the cells
# visible in this notebook — confirm whether anything still relies on it.
record_count = 0
with open("../../data/bcl2_3utr.fasta") as f:
    transcripts = []
    for rec in SeqIO.parse(f, "fasta"):
        transcripts.append(str(rec.seq))

# One string holding all transcripts back to back, in file order.
# NOTE(review): the records are joined with no separator, so a k-mer window
# slid over `sequence` can straddle the junction between two transcripts —
# confirm this is intended.
sequence = "".join(transcripts)


# importing miRNA sequence database

In [5]:
# Load the mature miRNA database and derive the candidate seed k-mers.

identifiers = []
lengths = []
sequences = []
cDNA = []

with open("../../data/mature_miRNAs.fa") as fasta_file:
    for seq_record in SeqIO.parse(fasta_file, "fasta"):
        identifiers.append(seq_record.id)
        lengths.append(len(seq_record.seq))
        sequences.append(str(seq_record.seq))

        # back_transcribe() changes U's into T's, complement() swaps each base
        # for its partner without reversing the strand.
        # NOTE(review): the stored string is therefore the complement in the
        # miRNA's own left-to-right order, not the reverse complement. The
        # k-mers sliced from it below are later matched verbatim against the
        # target sequence — confirm this orientation is the intended one.
        cDNA.append(str(seq_record.seq.back_transcribe().complement()))

# (start, stop) slice of cDNA_seq behind each derived column; the dict order
# is the order in which the columns are appended to the frame.
seed_windows = {
    "MRE": (1, 9),
    "7mer1": (1, 8),
    "7mer2": (2, 9),
    "6mer1": (1, 7),
    "6mer2": (2, 8),
    "6mer3": (3, 9),
}

# one row per FASTA record
mirna_df = pd.DataFrame(
    {"name": identifiers, "miRNA_seq": sequences, "cDNA_seq": cDNA}
)

# Removing non-human entries: keep names that begin with the "hsa-" prefix
# (anchored, so "hsa" appearing later in a name does not count).
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments below cannot raise SettingWithCopyWarning.
mirna_df = mirna_df[mirna_df["name"].str.startswith("hsa-")].copy()

# fixing index numbers: rows are renumbered 1..n after the filter
mirna_df.index = range(1, len(mirna_df) + 1)

# deriving the MRE (8-mer), 7mer and 6mer columns from cDNA_seq
for column_name, (start, stop) in seed_windows.items():
    mirna_df[column_name] = mirna_df["cDNA_seq"].str[start:stop]


mirna_df.head()

Unnamed: 0,name,miRNA_seq,cDNA_seq,MRE,7mer1,7mer2,6mer1,6mer2,6mer3
1,hsa-let-7a-5p,UGAGGUAGUAGGUUGUAUAGUU,ACTCCATCATCCAACATATCAA,CTCCATCA,CTCCATC,TCCATCA,CTCCAT,TCCATC,CCATCA
2,hsa-let-7a-3p,CUAUACAAUCUACUGUCUUUC,GATATGTTAGATGACAGAAAG,ATATGTTA,ATATGTT,TATGTTA,ATATGT,TATGTT,ATGTTA
3,hsa-let-7a-2-3p,CUGUACAGCCUCCUAGCUUUCC,GACATGTCGGAGGATCGAAAGG,ACATGTCG,ACATGTC,CATGTCG,ACATGT,CATGTC,ATGTCG
4,hsa-let-7b-5p,UGAGGUAGUAGGUUGUGUGGUU,ACTCCATCATCCAACACACCAA,CTCCATCA,CTCCATC,TCCATCA,CTCCAT,TCCATC,CCATCA
5,hsa-let-7b-3p,CUAUACAACCUACUGCCUUCCC,GATATGTTGGATGACGGAAGGG,ATATGTTG,ATATGTT,TATGTTG,ATATGT,TATGTT,ATGTTG


# Exploratory data analysis (EDA)

In [6]:
# Duplicate check on the miRNA table: total row count, then for each listed
# column the number of rows that repeat an earlier row's value.

print(f"Total number of miRNA sequences: {len(mirna_df)}")

# (label to print, column to inspect)
duplicate_checks = [
    ("Duplicated MRE sequences: ", "MRE"),
    ("Duplicated miRNA sequences: ", "miRNA_seq"),
    ("Duplicated IDs: ", "name"),
]
for label, column in duplicate_checks:
    print(label, mirna_df[column].duplicated().sum())

mirna_df.head()

Total number of miRNA sequences: 2656
Duplicated MRE sequences:  298
Duplicated miRNA sequences:  24
Duplicated IDs:  0


Unnamed: 0,name,miRNA_seq,cDNA_seq,MRE,7mer1,7mer2,6mer1,6mer2,6mer3
1,hsa-let-7a-5p,UGAGGUAGUAGGUUGUAUAGUU,ACTCCATCATCCAACATATCAA,CTCCATCA,CTCCATC,TCCATCA,CTCCAT,TCCATC,CCATCA
2,hsa-let-7a-3p,CUAUACAAUCUACUGUCUUUC,GATATGTTAGATGACAGAAAG,ATATGTTA,ATATGTT,TATGTTA,ATATGT,TATGTT,ATGTTA
3,hsa-let-7a-2-3p,CUGUACAGCCUCCUAGCUUUCC,GACATGTCGGAGGATCGAAAGG,ACATGTCG,ACATGTC,CATGTCG,ACATGT,CATGTC,ATGTCG
4,hsa-let-7b-5p,UGAGGUAGUAGGUUGUGUGGUU,ACTCCATCATCCAACACACCAA,CTCCATCA,CTCCATC,TCCATCA,CTCCAT,TCCATC,CCATCA
5,hsa-let-7b-3p,CUAUACAACCUACUGCCUUCCC,GATATGTTGGATGACGGAAGGG,ATATGTTG,ATATGTT,TATGTTG,ATATGT,TATGTT,ATGTTG


In [7]:
# Summary statistics for the concatenated target sequence

sequence_length = len(sequence)
print(f"The length of target sequence: {sequence_length}")
print(f"Total number of transcripts in the file: {len(transcripts)}")

# per-base counts, reported in the order A, T, C, G
for base in "ATCG":
    print(f"Total number of {base}: {sequence.count(base)}")

# GC ratio = share of positions that are G or C
gc_total = sequence.count('G') + sequence.count('C')
print(f"GC ratio:  {gc_total / sequence_length}" )


The length of target sequence: 229908
Total number of transcripts in the file: 9
Total number of A: 63162
Total number of T: 69170
Total number of C: 47588
Total number of G: 49988
GC ratio:  0.4244132435582929


# required functions

In [8]:
# sliding-window generator producing every k-mer of a sequence

def generate_kmers(sequence, k=6):
    """Lazily yield each length-``k`` window of ``sequence``, left to right.

    Consecutive windows overlap and start one position apart. Nothing is
    yielded when ``sequence`` is shorter than ``k``.

    Parameters
    ----------
    sequence : sliceable with a length (e.g. str)
        The sequence to cut into windows.
    k : int, default 6
        Window length.

    Yields
    ------
    Slices ``sequence[i:i + k]`` for i = 0 .. len(sequence) - k.
    """
    window_count = len(sequence) - k + 1
    start = 0
    while start < window_count:
        yield sequence[start:start + k]
        start += 1

In [9]:
# function to find seed sequences in kmers
def find_seed_in_kmers(kmer_generator, seeds):
    """Count how often each known seed occurs in a stream of k-mers.

    Parameters
    ----------
    kmer_generator : iterable
        K-mers to scan; iterated exactly once.
    seeds : container
        Anything that supports ``in`` (dict, set, list, tuple, ...). It is
        only used for membership tests and is never modified, so no
        defensive copy is taken.

    Returns
    -------
    dict
        Maps each seed that occurred at least once to its number of
        occurrences, ordered from most to least frequent. Seeds with equal
        counts keep the order in which they were first seen. Seeds that never
        occur are absent.
    """
    # Stream the matching k-mers straight into the Counter instead of first
    # materialising them in a list.
    hits = Counter(kmer for kmer in kmer_generator if kmer in seeds)

    # most_common() sorts by count, descending, with a stable sort.
    return dict(hits.most_common())

In [10]:
# Pool every seed-derived k-mer column of mirna_df into one lookup table.

columns_to_merge = ["MRE", "7mer1", "7mer2", "6mer1", "6mer2", "6mer3"]

# Maps each distinct k-mer to a placeholder hit value of 0. Columns are
# visited in the order listed above; a k-mer that appears more than once
# collapses into a single key.
seed_kmers = dict.fromkeys(
    (kmer for column in columns_to_merge for kmer in mirna_df[column].tolist()),
    0,
)

# results

In [11]:
# Slide an 8-nt window over the target sequence and tally the windows that
# are known seed k-mers; `seeds` maps k-mer -> number of hits, most hits first.
seeds = find_seed_in_kmers(generate_kmers(sequence, 8), seed_kmers)


# Report every seed with at least one hit, together with its miRNA(s).
for kmer, count in seeds.items():
    if count == 0:
        continue
    # names of the miRNAs whose MRE column is exactly this 8-mer
    has_this_mre = mirna_df["MRE"] == kmer
    matching_mirnas = mirna_df.loc[has_this_mre, "name"].values
    print(f"Seed: {kmer},    {count} matches,    matching miRNA(s): {matching_mirnas}")

Seed: TGTGTGTG,    142 matches,    matching miRNA(s): ['hsa-miR-8485']
Seed: CACACACA,    81 matches,    matching miRNA(s): ['hsa-miR-6867-5p']
Seed: ATGTGTGT,    32 matches,    matching miRNA(s): ['hsa-miR-3941']
Seed: TTTTCATT,    31 matches,    matching miRNA(s): ['hsa-miR-548n']
Seed: ATTAAAAA,    30 matches,    matching miRNA(s): ['hsa-miR-4775']
Seed: CCTGCCTC,    30 matches,    matching miRNA(s): ['hsa-miR-184']
Seed: TTTATTAT,    29 matches,    matching miRNA(s): ['hsa-miR-5692a']
Seed: TTTCTCCT,    27 matches,    matching miRNA(s): ['hsa-miR-583']
Seed: GGGAGGAA,    26 matches,    matching miRNA(s): ['hsa-miR-4667-3p']
Seed: TTGTTTGT,    25 matches,    matching miRNA(s): ['hsa-miR-495-3p' 'hsa-miR-5688']
Seed: CCTTTTTT,    25 matches,    matching miRNA(s): ['hsa-miR-3148']
Seed: GGGAAAAA,    24 matches,    matching miRNA(s): ['hsa-miR-6832-3p']
Seed: GAGAAAAG,    24 matches,    matching miRNA(s): ['hsa-miR-130a-5p']
Seed: TTTCCAAA,    24 matches,    matching miRNA(s): ['hsa-mi

## **top 3 miRNAs by match size:**





In [12]:
# 8-mer scan of the target sequence, printed as "kmer: count - miRNA names".
seeds = find_seed_in_kmers(generate_kmers(sequence, 8), seed_kmers)

for kmer, count in seeds.items():
    if count == 0:
        continue
    # miRNAs whose MRE column equals this 8-mer
    possible_sequences = mirna_df.loc[mirna_df["MRE"].eq(kmer), "name"].values
    print(f"{kmer}: {count} - {possible_sequences}")

TGTGTGTG: 142 - ['hsa-miR-8485']
CACACACA: 81 - ['hsa-miR-6867-5p']
ATGTGTGT: 32 - ['hsa-miR-3941']
TTTTCATT: 31 - ['hsa-miR-548n']
ATTAAAAA: 30 - ['hsa-miR-4775']
CCTGCCTC: 30 - ['hsa-miR-184']
TTTATTAT: 29 - ['hsa-miR-5692a']
TTTCTCCT: 27 - ['hsa-miR-583']
GGGAGGAA: 26 - ['hsa-miR-4667-3p']
TTGTTTGT: 25 - ['hsa-miR-495-3p' 'hsa-miR-5688']
CCTTTTTT: 25 - ['hsa-miR-3148']
GGGAAAAA: 24 - ['hsa-miR-6832-3p']
GAGAAAAG: 24 - ['hsa-miR-130a-5p']
TTTCCAAA: 24 - ['hsa-miR-12135']
CCTGACCT: 24 - ['hsa-miR-4515']
TTTCATTT: 23 - ['hsa-miR-559']
AGAGAAAG: 23 - ['hsa-miR-4753-3p']
CCTCCTCC: 22 - ['hsa-miR-11181-3p']
TTTCATTA: 22 - ['hsa-miR-548b-5p' 'hsa-miR-548a-5p' 'hsa-miR-548c-5p' 'hsa-miR-548d-5p'
 'hsa-miR-548j-5p' 'hsa-miR-548h-5p' 'hsa-miR-548i' 'hsa-miR-548y'
 'hsa-miR-548o-5p' 'hsa-miR-548ab' 'hsa-miR-548ad-5p' 'hsa-miR-548ae-5p'
 'hsa-miR-548am-5p' 'hsa-miR-548ap-5p' 'hsa-miR-548aq-5p'
 'hsa-miR-548ar-5p' 'hsa-miR-548as-5p' 'hsa-miR-548au-5p'
 'hsa-miR-548ay-5p']
ATAATAAT: 22 - ['hsa-mi

In [14]:
## ERROR
# NOTE(review): the marker above was originally "HATA" (Turkish for "error").
# What the author considered wrong with this cell is not recorded — confirm
# before relying on the 7-mer results.

# Slide a 7-nt window over the target sequence and tally the windows that are
# known seed k-mers.
seeds = find_seed_in_kmers(generate_kmers(sequence, 7), seed_kmers)

for kmer, count in seeds.items():
    if count == 0:
        continue
    # a miRNA is listed when either of its two 7-mer columns equals this k-mer
    in_either_7mer = mirna_df[["7mer1", "7mer2"]].eq(kmer).any(axis=1)
    possible_sequences = mirna_df.loc[in_either_7mer, "name"].values
    print(f"{kmer}: {count} - {possible_sequences}")

TTTTTTT: 467 - ['hsa-miR-3613-3p']
TGTGTGT: 256 - ['hsa-miR-3941' 'hsa-miR-8485']
GTGTGTG: 178 - ['hsa-miR-8485']
TTTTAAA: 136 - ['hsa-miR-4282']
TTAAAAA: 125 - ['hsa-miR-4775']
ACACACA: 117 - ['hsa-miR-6867-5p']
TTGTTTT: 115 - ['hsa-miR-3065-5p']
CTTTTTT: 98 - ['hsa-miR-3148']
CACACAC: 94 - ['hsa-miR-6867-5p']
AAAAGAA: 91 - ['hsa-miR-627-3p']
AGAGAGA: 89 - ['hsa-miR-4768-5p' 'hsa-miR-6833-3p' 'hsa-miR-6873-3p']
TTTTCAT: 82 - ['hsa-miR-548n']
AGAAAGA: 74 - ['hsa-miR-6830-3p']
TATATAT: 72 - ['hsa-miR-1277-5p' 'hsa-miR-5011-5p']
AGAGAAA: 72 - ['hsa-miR-4753-3p']
TTTCATT: 71 - ['hsa-miR-559' 'hsa-miR-548b-5p' 'hsa-miR-548a-5p' 'hsa-miR-548c-5p'
 'hsa-miR-548d-5p' 'hsa-miR-548j-5p' 'hsa-miR-548n' 'hsa-miR-548h-5p'
 'hsa-miR-548i' 'hsa-miR-548w' 'hsa-miR-548y' 'hsa-miR-548o-5p'
 'hsa-miR-548ab' 'hsa-miR-548ad-5p' 'hsa-miR-548ae-5p' 'hsa-miR-548ak'
 'hsa-miR-548am-5p' 'hsa-miR-548ap-5p' 'hsa-miR-548aq-5p'
 'hsa-miR-548ar-5p' 'hsa-miR-548as-5p' 'hsa-miR-548au-5p'
 'hsa-miR-548ay-5p' 'hsa-miR-

In [15]:
# Save the concatenated target sequence as plain text in the current working
# directory; the file holds the sequence only, with no trailing newline.

with open("sequence.txt", mode="w") as export:
    print(sequence, end="", file=export)