In [2]:
# importing dependencies
from Bio import SeqIO
import pandas as pd
from collections import Counter


# Importing main sequence

Write the relative path of your FASTA file in the cell below.


In [3]:
# Load the target FASTA file and collect every record as a plain string.

with open("../../data/bcl2_3utr.fasta") as f:
    records = SeqIO.parse(f, "fasta")

    # Consume the parser while the file handle is still open.
    transcripts = list(str(rec.seq) for rec in records)

# Single string holding all transcripts joined end-to-end.
# NOTE(review): joining without a separator lets k-mer windows span the
# junction between two transcripts - confirm this is intended.
sequence = "".join(transcripts)


# Importing miRNA sequence database

Download `mature.fa` from the [miRBase database](https://www.mirbase.org/ftp.shtml) and point the cell below to its location.

In [4]:
# Parse the mature-miRNA FASTA database into parallel, per-record lists.


with open("../../data/mature_miRNAs.fa") as fasta_file:
    identifiers, lengths, sequences, cDNA = [], [], [], []

    for seq_record in SeqIO.parse(fasta_file, "fasta"):
        mirna_seq = seq_record.seq

        identifiers.append(seq_record.id)
        lengths.append(len(mirna_seq))
        sequences.append(str(mirna_seq))

        # back_transcribe() turns U into T; complement() then swaps every base
        # for its pairing partner while keeping the original base order.
        # NOTE(review): complement() does not reverse the strand, so this string
        # reads in the opposite direction to the target - confirm that the
        # k-mer search downstream expects this orientation.
        cDNA.append(str(mirna_seq.back_transcribe().complement()))


# Combine the parallel lists into one DataFrame (one row per mature miRNA).
mirna_df = pd.DataFrame(
    list(zip(identifiers, sequences, cDNA)),
    columns=["name", "miRNA_seq", "cDNA_seq"],
)

# Keep human entries only: miRBase IDs carry the species code as a prefix.
# .copy() detaches the filtered frame so the column assignments below operate
# on an independent object instead of a view of the unfiltered frame.
mirna_df = mirna_df[mirna_df["name"].str.startswith("hsa-")].copy()

# Renumber the rows from 1 after filtering.
mirna_df.index = range(1, len(mirna_df) + 1)


# MRE: positions 2-9 of the complemented sequence.
mirna_df["MRE"] = mirna_df["cDNA_seq"].str[1:9]

# k-mer columns, all sliced from the start of the complemented sequence.
mirna_df["8mer"] = mirna_df["cDNA_seq"].str[:8]
mirna_df["7mer-A1"] = mirna_df["cDNA_seq"].str[:7]
mirna_df["7mer-m8"] = mirna_df["cDNA_seq"].str[1:8]
mirna_df["6mer"] = mirna_df["cDNA_seq"].str[1:7]
mirna_df["offset_6mer"] = mirna_df["cDNA_seq"].str[2:8]


# Blank out 8mer and 7mer-A1 values that do not start with an A.
# The result is assigned back to the column explicitly: calling
# replace(..., inplace=True) on a selected column is chained assignment and
# does not reliably update the parent frame.
for anchored_column in ("8mer", "7mer-A1"):
    starts_with_a = mirna_df[anchored_column].str.startswith("A", na=False)
    mirna_df[anchored_column] = mirna_df[anchored_column].where(starts_with_a)


mirna_df.head()

Unnamed: 0,name,miRNA_seq,cDNA_seq,MRE,8mer,7mer-A1,7mer-m8,6mer,offset_6mer
1,hsa-let-7a-5p,UGAGGUAGUAGGUUGUAUAGUU,ACTCCATCATCCAACATATCAA,CTCCATCA,ACTCCATC,ACTCCAT,CTCCATC,CTCCAT,TCCATC
2,hsa-let-7a-3p,CUAUACAAUCUACUGUCUUUC,GATATGTTAGATGACAGAAAG,ATATGTTA,,,ATATGTT,ATATGT,TATGTT
3,hsa-let-7a-2-3p,CUGUACAGCCUCCUAGCUUUCC,GACATGTCGGAGGATCGAAAGG,ACATGTCG,,,ACATGTC,ACATGT,CATGTC
4,hsa-let-7b-5p,UGAGGUAGUAGGUUGUGUGGUU,ACTCCATCATCCAACACACCAA,CTCCATCA,ACTCCATC,ACTCCAT,CTCCATC,CTCCAT,TCCATC
5,hsa-let-7b-3p,CUAUACAACCUACUGCCUUCCC,GATATGTTGGATGACGGAAGGG,ATATGTTG,,,ATATGTT,ATATGT,TATGTT


# Exploratory Data Analysis

In [5]:
# Report how many miRNAs were loaded and how many values repeat per column.

print(f"Total number of miRNA sequences: {len(mirna_df)}")

# message -> column whose repeated values are counted
duplicate_checks = {
    "Duplicated MRE sequences: ": "MRE",
    "Duplicated miRNA sequences: ": "miRNA_seq",
    "Duplicated IDs: ": "name",
}
for message, column in duplicate_checks.items():
    print(message, mirna_df[column].duplicated().sum())

mirna_df.head()

Total number of miRNA sequences: 2656
Duplicated MRE sequences:  298
Duplicated miRNA sequences:  24
Duplicated IDs:  0


Unnamed: 0,name,miRNA_seq,cDNA_seq,MRE,8mer,7mer-A1,7mer-m8,6mer,offset_6mer
1,hsa-let-7a-5p,UGAGGUAGUAGGUUGUAUAGUU,ACTCCATCATCCAACATATCAA,CTCCATCA,ACTCCATC,ACTCCAT,CTCCATC,CTCCAT,TCCATC
2,hsa-let-7a-3p,CUAUACAAUCUACUGUCUUUC,GATATGTTAGATGACAGAAAG,ATATGTTA,,,ATATGTT,ATATGT,TATGTT
3,hsa-let-7a-2-3p,CUGUACAGCCUCCUAGCUUUCC,GACATGTCGGAGGATCGAAAGG,ACATGTCG,,,ACATGTC,ACATGT,CATGTC
4,hsa-let-7b-5p,UGAGGUAGUAGGUUGUGUGGUU,ACTCCATCATCCAACACACCAA,CTCCATCA,ACTCCATC,ACTCCAT,CTCCATC,CTCCAT,TCCATC
5,hsa-let-7b-3p,CUAUACAACCUACUGCCUUCCC,GATATGTTGGATGACGGAAGGG,ATATGTTG,,,ATATGTT,ATATGT,TATGTT


In [6]:
# Basic composition statistics of the imported target sequence.

print(f"The length of target sequence: {len(sequence)}")
print(f"Total number of transcripts in the file: {len(transcripts)}")

# One line per base, in the order A, T, C, G.
for base in "ATCG":
    print(f"Total number of {base}: {sequence.count(base)}")

# Fraction of the sequence made up of G or C.
gc_total = sequence.count('G') + sequence.count('C')
print(f"GC ratio:  {gc_total / len(sequence)}" )


The length of target sequence: 229908
Total number of transcripts in the file: 9
Total number of A: 63162
Total number of T: 69170
Total number of C: 47588
Total number of G: 49988
GC ratio:  0.4244132435582929


# required functions

In [7]:
# Generator that slides a window of width k over a sequence.

def generate_kmers(sequence, k=6):
    """Lazily yield every overlapping k-length slice of `sequence`, left to right.

    Parameters
    ----------
    sequence : sliceable sequence (e.g. str)
        The sequence to scan.
    k : int, default 6
        Window width.

    Yields
    ------
    Slices ``sequence[i:i + k]`` for ``i`` from 0 to ``len(sequence) - k``.
    Nothing is yielded when ``sequence`` is shorter than ``k``.
    """
    window_count = len(sequence) - k + 1
    yield from (sequence[start:start + k] for start in range(window_count))

In [8]:
# Count how often each known seed occurs among a stream of k-mers.
def find_seed_in_kmers(kmer_generator, seeds):
    """Tally the k-mers from `kmer_generator` that are members of `seeds`.

    Parameters
    ----------
    kmer_generator : iterable
        Stream of k-mers to inspect; it is consumed once.
    seeds : container with a ``.copy()`` method (e.g. dict)
        Collection of seed sequences used for the membership test. It is
        copied first, so the caller's object is never touched.

    Returns
    -------
    dict
        ``{kmer: hit_count}`` for every k-mer found in `seeds`, ordered from
        most to fewest hits. Ties keep the order of first appearance.
    """
    # Work on a private copy of the seed collection.
    known_seeds = seeds.copy()

    hits = Counter()
    for kmer in kmer_generator:
        if kmer in known_seeds:
            hits[kmer] += 1

    # most_common() returns (kmer, count) pairs sorted by descending count.
    return dict(hits.most_common())

In [9]:
# Build the lookup of all candidate seed k-mers to search for.

columns_to_merge = ["MRE", "7mer-A1", "7mer-m8", "6mer"]

candidate_kmers = []
for column in columns_to_merge:
    # dropna(): columns that were filtered for a leading A contain missing
    # values, which must not end up as keys of the seed lookup.
    candidate_kmers.extend(mirna_df[column].dropna().tolist())

# Final dict of every distinct candidate k-mer with a hit count of 0.
seed_kmers = dict.fromkeys(candidate_kmers, 0)

# Results

The cell below computes and prints the 8-mer matches.


In [14]:
# Slide an 8-nt window over the imported sequence (generate_kmers) and count
# the windows that equal a candidate seed (find_seed_in_kmers).
seeds = find_seed_in_kmers(generate_kmers(sequence, k=8), seed_kmers)


# Print one line per seed that has at least one hit.
for kmer, count in seeds.items():
    if count == 0:
        continue
    # Names of the miRNAs whose MRE equals this seed.
    mre_matches = mirna_df["MRE"] == kmer
    matching_mirnas = mirna_df.loc[mre_matches, "name"].values
    print(f"Seed: {kmer},    {count} matches,    matching miRNA(s): {matching_mirnas}")

Seed: TGTGTGTG,    142 matches,    matching miRNA(s): ['hsa-miR-8485']
Seed: CACACACA,    81 matches,    matching miRNA(s): ['hsa-miR-6867-5p']
Seed: ATGTGTGT,    32 matches,    matching miRNA(s): ['hsa-miR-3941']
Seed: TTTTCATT,    31 matches,    matching miRNA(s): ['hsa-miR-548n']
Seed: ATTAAAAA,    30 matches,    matching miRNA(s): ['hsa-miR-4775']
Seed: CCTGCCTC,    30 matches,    matching miRNA(s): ['hsa-miR-184']
Seed: TTTATTAT,    29 matches,    matching miRNA(s): ['hsa-miR-5692a']
Seed: TTTCTCCT,    27 matches,    matching miRNA(s): ['hsa-miR-583']
Seed: GGGAGGAA,    26 matches,    matching miRNA(s): ['hsa-miR-4667-3p']
Seed: TTGTTTGT,    25 matches,    matching miRNA(s): ['hsa-miR-495-3p' 'hsa-miR-5688']
Seed: CCTTTTTT,    25 matches,    matching miRNA(s): ['hsa-miR-3148']
Seed: GGGAAAAA,    24 matches,    matching miRNA(s): ['hsa-miR-6832-3p']
Seed: GAGAAAAG,    24 matches,    matching miRNA(s): ['hsa-miR-130a-5p']
Seed: TTTCCAAA,    24 matches,    matching miRNA(s): ['hsa-mi

The cell below computes and prints the 7-mer matches.

In [11]:
# Count candidate seeds among all 7-nt windows of the sequence, most hits first.

seeds = find_seed_in_kmers(generate_kmers(sequence, 7), seed_kmers)

for kmer, count in seeds.items():
    if count == 0:
        continue
    # A miRNA qualifies when either of its 7-mer columns equals the k-mer.
    is_a1_match = mirna_df['7mer-A1'] == kmer
    is_m8_match = mirna_df['7mer-m8'] == kmer
    possible_sequences = mirna_df.loc[is_a1_match | is_m8_match, "name"].values
    print(f"{kmer}: {count} - {possible_sequences}")

TGTGTGT: 256 - ['hsa-miR-8485']
TTTTAAA: 136 - ['hsa-miR-4282']
ACACACA: 117 - ['hsa-miR-6867-5p']
CACACAC: 94 - ['hsa-miR-6867-5p']
AGAGAGA: 89 - ['hsa-miR-6817-3p' 'hsa-miR-6873-3p' 'hsa-miR-7110-3p']
TTTTCAT: 82 - ['hsa-miR-548n']
ATATTTT: 74 - ['hsa-miR-3163']
ATTTCTT: 73 - ['hsa-miR-3133']
TATATAT: 72 - ['hsa-miR-5011-5p']
AGAGAAA: 72 - ['hsa-miR-4753-3p']
TTTCATT: 71 - ['hsa-miR-559' 'hsa-miR-548b-5p' 'hsa-miR-548a-5p' 'hsa-miR-548c-5p'
 'hsa-miR-548d-5p' 'hsa-miR-548j-5p' 'hsa-miR-548h-5p' 'hsa-miR-548i'
 'hsa-miR-548w' 'hsa-miR-548y' 'hsa-miR-548o-5p' 'hsa-miR-548ab'
 'hsa-miR-548ad-5p' 'hsa-miR-548ae-5p' 'hsa-miR-548ak' 'hsa-miR-548am-5p'
 'hsa-miR-548ap-5p' 'hsa-miR-548aq-5p' 'hsa-miR-548ar-5p'
 'hsa-miR-548as-5p' 'hsa-miR-548au-5p' 'hsa-miR-548ay-5p'
 'hsa-miR-548bb-5p']
TTTCTCT: 71 - ['hsa-miR-4311']
TTTTTCA: 70 - ['hsa-miR-12136']
GAGAGAG: 70 - ['hsa-miR-7110-3p']
AATATTT: 68 - ['hsa-miR-340-5p']
TATTTTA: 66 - ['hsa-miR-3163']
TGTTGTT: 65 - ['hsa-miR-3529-3p']
TTGTATT: 65 

# experiments

These temporary cells are not part of the main analysis. They exist only for experimentation and will be deleted later.

In [12]:
# Experiment for anchor-A filtering: blank out 7mer-A1 values that do not
# start with an A. The filtered column is assigned back explicitly because
# replace(..., inplace=True) on a selected column is chained assignment and
# does not reliably update the parent frame. na=False keeps values that are
# already missing as missing instead of producing a non-boolean mask.
mirna_df["7mer-A1"] = mirna_df["7mer-A1"].where(
    mirna_df["7mer-A1"].str.startswith("A", na=False)
)

mirna_df.head()

Unnamed: 0,name,miRNA_seq,cDNA_seq,MRE,8mer,7mer-A1,7mer-m8,6mer,offset_6mer
1,hsa-let-7a-5p,UGAGGUAGUAGGUUGUAUAGUU,ACTCCATCATCCAACATATCAA,CTCCATCA,ACTCCATC,ACTCCAT,CTCCATC,CTCCAT,TCCATC
2,hsa-let-7a-3p,CUAUACAAUCUACUGUCUUUC,GATATGTTAGATGACAGAAAG,ATATGTTA,,,ATATGTT,ATATGT,TATGTT
3,hsa-let-7a-2-3p,CUGUACAGCCUCCUAGCUUUCC,GACATGTCGGAGGATCGAAAGG,ACATGTCG,,,ACATGTC,ACATGT,CATGTC
4,hsa-let-7b-5p,UGAGGUAGUAGGUUGUGUGGUU,ACTCCATCATCCAACACACCAA,CTCCATCA,ACTCCATC,ACTCCAT,CTCCATC,CTCCAT,TCCATC
5,hsa-let-7b-3p,CUAUACAACCUACUGCCUUCCC,GATATGTTGGATGACGGAAGGG,ATATGTTG,,,ATATGTT,ATATGT,TATGTT


In [13]:
# Dump the concatenated sequence string into a text file in the working directory.

with open("sequence.txt", mode="w") as out_file:
    out_file.write(sequence)