In [3]:
# importing dependencies
from Bio import SeqIO
import pandas as pd
from collections import Counter


# Importing main sequence

write the relative path of your fasta file below


In [4]:
# import & parse fasta

with open ("../../data/bcl2_canonical_3utr.fa") as f:
    records = SeqIO.parse(f, "fasta")
    
    transcripts = [str(rec.seq) for rec in records]
    
    # string containing whole sequence
    sequence = "".join(transcripts)


# Importing miRNA sequence database

download "mature.fa" from [miRBase Database](https://www.mirbase.org/ftp.shtml) and use in the cell below.

In [5]:
# opening miRNA sequence database


with open("../../data/mature_miRNAs.fa") as fasta_file:
    identifiers = []
    lengths = []
    sequences = []
    cDNA = []

    for seq_record in SeqIO.parse(fasta_file, "fasta"):
        identifiers.append(seq_record.id)
        lengths.append(len(seq_record.seq))
        sequences.append(str(seq_record.seq))

        #back_transcribe() changes U's into T's, complement() takes the complement
        temp = seq_record.seq.back_transcribe()
        temp = temp.complement()
        cDNA.append(str(temp))


# zipping lists into a pandas df
mirna_df = pd.DataFrame(list(zip(identifiers, sequences, cDNA)), columns = ["name", "miRNA_seq", "cDNA_seq"])

# removing non-human entries
mirna_df = mirna_df[mirna_df["name"].str.contains("hsa")]

# fixing index numbers
mirna_df.index = range(1, len(mirna_df)+1)


# deriving MRE sequences and adding as a new column
mirna_df["MRE"] = mirna_df["cDNA_seq"].str[1:9]

# deriving kmer columns
mirna_df["8mer"] = mirna_df["cDNA_seq"].str[:8]
mirna_df["7mer-A1"] = mirna_df["cDNA_seq"].str[:7]
mirna_df["7mer-m8"] = mirna_df["cDNA_seq"].str[1:8]
mirna_df["6mer"] = mirna_df["cDNA_seq"].str[1:7]
mirna_df["offset_6mer"] = mirna_df["cDNA_seq"].str[2:8]


# dropping 8mer and 7mer-A8 values that doesn't start with an A
mirna_df["8mer"].replace({"^[C,T,G]": None}, regex=True, inplace=True)
mirna_df["7mer-A1"].replace({"^[C,T,G]": None}, regex=True, inplace=True)


mirna_df.head()

Unnamed: 0,name,miRNA_seq,cDNA_seq,MRE,8mer,7mer-A1,7mer-m8,6mer,offset_6mer
1,hsa-let-7a-5p,UGAGGUAGUAGGUUGUAUAGUU,ACTCCATCATCCAACATATCAA,CTCCATCA,ACTCCATC,ACTCCAT,CTCCATC,CTCCAT,TCCATC
2,hsa-let-7a-3p,CUAUACAAUCUACUGUCUUUC,GATATGTTAGATGACAGAAAG,ATATGTTA,,,ATATGTT,ATATGT,TATGTT
3,hsa-let-7a-2-3p,CUGUACAGCCUCCUAGCUUUCC,GACATGTCGGAGGATCGAAAGG,ACATGTCG,,,ACATGTC,ACATGT,CATGTC
4,hsa-let-7b-5p,UGAGGUAGUAGGUUGUGUGGUU,ACTCCATCATCCAACACACCAA,CTCCATCA,ACTCCATC,ACTCCAT,CTCCATC,CTCCAT,TCCATC
5,hsa-let-7b-3p,CUAUACAACCUACUGCCUUCCC,GATATGTTGGATGACGGAAGGG,ATATGTTG,,,ATATGTT,ATATGT,TATGTT


# Exploratory Data Analysis

In [6]:
#checking miRNAs for duplicates

print(f"Total number of miRNA sequences: {len(mirna_df)}")
print("Duplicated MRE sequences: ", mirna_df["MRE"].duplicated().sum())
print("Duplicated miRNA sequences: ", mirna_df["miRNA_seq"].duplicated().sum())
print("Duplicated IDs: ", mirna_df["name"].duplicated().sum())
mirna_df.head()

Total number of miRNA sequences: 2656
Duplicated MRE sequences:  298
Duplicated miRNA sequences:  24
Duplicated IDs:  0


Unnamed: 0,name,miRNA_seq,cDNA_seq,MRE,8mer,7mer-A1,7mer-m8,6mer,offset_6mer
1,hsa-let-7a-5p,UGAGGUAGUAGGUUGUAUAGUU,ACTCCATCATCCAACATATCAA,CTCCATCA,ACTCCATC,ACTCCAT,CTCCATC,CTCCAT,TCCATC
2,hsa-let-7a-3p,CUAUACAAUCUACUGUCUUUC,GATATGTTAGATGACAGAAAG,ATATGTTA,,,ATATGTT,ATATGT,TATGTT
3,hsa-let-7a-2-3p,CUGUACAGCCUCCUAGCUUUCC,GACATGTCGGAGGATCGAAAGG,ACATGTCG,,,ACATGTC,ACATGT,CATGTC
4,hsa-let-7b-5p,UGAGGUAGUAGGUUGUGUGGUU,ACTCCATCATCCAACACACCAA,CTCCATCA,ACTCCATC,ACTCCAT,CTCCATC,CTCCAT,TCCATC
5,hsa-let-7b-3p,CUAUACAACCUACUGCCUUCCC,GATATGTTGGATGACGGAAGGG,ATATGTTG,,,ATATGTT,ATATGT,TATGTT


In [7]:
# exploring the imported sequence

print(f"The length of target sequence: {len(sequence)}")
print(f"Total number of transcripts in the file: {len(transcripts)}")
print(f"Total number of A: {sequence.count('A')}")
print(f"Total number of T: {sequence.count('T')}")
print(f"Total number of C: {sequence.count('C')}")
print(f"Total number of G: {sequence.count('G')}")
print(f"GC ratio:  {(sequence.count('G') + sequence.count('C')) / len(sequence)}" )


The length of target sequence: 5279
Total number of transcripts in the file: 1
Total number of A: 1550
Total number of T: 1588
Total number of C: 1001
Total number of G: 1140
GC ratio:  0.4055692365978405


# required functions

In [8]:
# generator function to find kmers of k size:

def generate_kmers(sequence, k=6):
    for nucleotide in range(len(sequence) - k + 1):
        yield sequence[nucleotide: nucleotide + k]

In [9]:
# function to find seed sequences in kmers
def find_seed_in_kmers(kmer_generator, seeds):
    
    # copying seeds dict into the function
    temp = seeds.copy()
    
    # counter that iterates through kmers to find matches
    counter = Counter([kmer for kmer in kmer_generator if kmer in temp])

    # returns dict, sorted by the number of matches            
    return dict(sorted(counter.items(), key = lambda item: item[1], reverse=True))

In [10]:
# creating a dict that holds all possible kmers to iterate on

columns_to_merge = ["8mer", "7mer-A1", "7mer-m8", "6mer"]
temp = []
for column in columns_to_merge:
    temp.extend(mirna_df[column].tolist())

# final dict containing all possible kmers and their corresponding hit values (currently 0)
seed_kmers = {k: 0 for k in temp}

# Results

the cell below computes and prints 8mer matches


In [11]:
# generate_kmers will find kmers in the imported fasta
# find_seed_in_kmers will find matches and counts them 
seeds = find_seed_in_kmers(generate_kmers(sequence, k=8), seed_kmers)


# for loop that prints only the seeds with at least one hit
for kmer, count in seeds.items():
    if count != 0:
        # fetches miRNA names having that specific seed sequence
        matching_mirnas = mirna_df.loc[(mirna_df["8mer"] == kmer)]["name"].values
        print(f"Seed: {kmer},    {count} matches,    matching miRNA(s): {matching_mirnas}")

Seed: ACACACAC,    5 matches,    matching miRNA(s): ['hsa-miR-6867-5p']
Seed: ATGCATCT,    2 matches,    matching miRNA(s): ['hsa-miR-1277-3p']
Seed: ACATTGTC,    2 matches,    matching miRNA(s): ['hsa-miR-194-5p']
Seed: ATTGTCAG,    1 matches,    matching miRNA(s): ['hsa-miR-212-3p' 'hsa-miR-132-3p']
Seed: ACATCACA,    1 matches,    matching miRNA(s): ['hsa-miR-142-3p']
Seed: AACGTCGA,    1 matches,    matching miRNA(s): ['hsa-miR-1301-3p' 'hsa-miR-5047']
Seed: AAAGGGAA,    1 matches,    matching miRNA(s): ['hsa-miR-4755-5p' 'hsa-miR-5006-3p']
Seed: AAGGGAAA,    1 matches,    matching miRNA(s): ['hsa-miR-204-5p' 'hsa-miR-211-5p']
Seed: ACTCCTGT,    1 matches,    matching miRNA(s): ['hsa-miR-5004-5p']
Seed: ACCATCTG,    1 matches,    matching miRNA(s): ['hsa-miR-379-5p']
Seed: AGGGTTCC,    1 matches,    matching miRNA(s): ['hsa-miR-6814-5p']
Seed: AGAAGAGA,    1 matches,    matching miRNA(s): ['hsa-miR-942-5p']
Seed: AGAGACTC,    1 matches,    matching miRNA(s): ['hsa-miR-3921' 'hsa-mi

the cell below computes 7mer matches

In [12]:
# top matching 7mers

seeds = find_seed_in_kmers(generate_kmers(sequence, 7), seed_kmers)

for kmer, count in seeds.items():
    if count != 0:
        possible_sequences = mirna_df.loc[(mirna_df['7mer-A1'] == kmer) | (mirna_df['7mer-m8'] == kmer)]["name"].values
        print(f"{kmer}: {count} - {possible_sequences}")

ACACACA: 8 - ['hsa-miR-6867-5p']
CACACAC: 6 - ['hsa-miR-6867-5p']
ACAGAAT: 5 - ['hsa-miR-550a-3p']
TGTTGTT: 4 - ['hsa-miR-3529-3p']
GAAGTGA: 4 - ['hsa-miR-6837-3p']
GGGCCCT: 4 - ['hsa-miR-1247-3p']
CATTGTC: 3 - ['hsa-miR-194-5p']
ATTGTCA: 3 - ['hsa-miR-212-3p' 'hsa-miR-132-3p']
GGGGAAG: 3 - ['hsa-miR-3127-3p' 'hsa-miR-6756-3p']
AACAGAG: 3 - ['hsa-miR-6818-3p']
AGAGGGA: 3 - ['hsa-miR-4287' 'hsa-miR-4685-3p' 'hsa-miR-6867-3p']
AAGGACA: 3 - ['hsa-miR-7152-5p']
AGAACCT: 3 - ['hsa-miR-432-5p']
CCCTCCA: 3 - ['hsa-miR-6799-5p']
ACTGTAG: 3 - ['hsa-miR-489-3p']
AAAGAAG: 3 - ['hsa-miR-4659a-3p' 'hsa-miR-4659b-3p']
AAACCTG: 3 - ['hsa-miR-4520-2-3p' 'hsa-miR-7850-5p']
TGCATCT: 3 - ['hsa-miR-1277-3p']
TTGTATT: 3 - ['hsa-miR-4803']
ATTCTGA: 3 - ['hsa-miR-499a-5p']
GACCTTT: 3 - ['hsa-miR-875-3p']
TCAGGGA: 3 - ['hsa-miR-2114-5p']
CTTTTGT: 3 - ['hsa-miR-570-3p']
AGTAAAT: 3 - ['hsa-miR-4729' 'hsa-miR-5696']
ATGAATG: 3 - ['hsa-miR-9898']
GATCTTT: 3 - ['hsa-miR-6508-5p' 'hsa-miR-8067']
TTTATTA: 3 - ['hsa-

In [17]:
# top matching 7mer-A1s

seeds = find_seed_in_kmers(generate_kmers(sequence, 7), seed_kmers)

for kmer, count in seeds.items():
    if count != 0:
        possible_sequences = mirna_df.loc[(mirna_df['7mer-A1'] == kmer)]["name"].values
        print(f"{kmer}: {count} - {possible_sequences}")

ACACACA: 8 - ['hsa-miR-6867-5p']
CACACAC: 6 - []
ACAGAAT: 5 - ['hsa-miR-550a-3p']
TGTTGTT: 4 - []
GAAGTGA: 4 - []
GGGCCCT: 4 - []
CATTGTC: 3 - []
ATTGTCA: 3 - ['hsa-miR-212-3p' 'hsa-miR-132-3p']
GGGGAAG: 3 - []
AACAGAG: 3 - ['hsa-miR-6818-3p']
AGAGGGA: 3 - ['hsa-miR-4287' 'hsa-miR-4685-3p']
AAGGACA: 3 - []
AGAACCT: 3 - ['hsa-miR-432-5p']
CCCTCCA: 3 - []
ACTGTAG: 3 - []
AAAGAAG: 3 - ['hsa-miR-4659a-3p' 'hsa-miR-4659b-3p']
AAACCTG: 3 - ['hsa-miR-4520-2-3p']
TGCATCT: 3 - []
TTGTATT: 3 - []
ATTCTGA: 3 - []
GACCTTT: 3 - []
TCAGGGA: 3 - []
CTTTTGT: 3 - []
AGTAAAT: 3 - ['hsa-miR-4729']
ATGAATG: 3 - ['hsa-miR-9898']
GATCTTT: 3 - []
TTTATTA: 3 - []
ATCAAGA: 3 - ['hsa-miR-5584-3p']
TTTTAAA: 3 - []
CCCCAAA: 2 - []
CAAACAA: 2 - []
AGTGATG: 2 - ['hsa-miR-4999-3p']
AATAACA: 2 - ['hsa-miR-5701']
ACATTAT: 2 - []
TAAGACA: 2 - []
CTCCACC: 2 - []
CCACCTG: 2 - []
ACCTGGA: 2 - ['hsa-miR-6861-3p']
CTTTCCA: 2 - []
CTGAAGA: 2 - []
AAAGGAC: 2 - ['hsa-miR-7152-5p']
AAGGACC: 2 - ['hsa-miR-6783-3p']
GGACCTG: 2 - 

In [18]:
# top matching 7mer-m8's

seeds = find_seed_in_kmers(generate_kmers(sequence, 7), seed_kmers)

for kmer, count in seeds.items():
    if count != 0:
        possible_sequences = mirna_df.loc[(mirna_df['7mer-m8'] == kmer)]["name"].values
        print(f"{kmer}: {count} - {possible_sequences}")

ACACACA: 8 - []
CACACAC: 6 - ['hsa-miR-6867-5p']
ACAGAAT: 5 - []
TGTTGTT: 4 - ['hsa-miR-3529-3p']
GAAGTGA: 4 - ['hsa-miR-6837-3p']
GGGCCCT: 4 - ['hsa-miR-1247-3p']
CATTGTC: 3 - ['hsa-miR-194-5p']
ATTGTCA: 3 - []
GGGGAAG: 3 - ['hsa-miR-3127-3p' 'hsa-miR-6756-3p']
AACAGAG: 3 - []
AGAGGGA: 3 - ['hsa-miR-6867-3p']
AAGGACA: 3 - ['hsa-miR-7152-5p']
AGAACCT: 3 - []
CCCTCCA: 3 - ['hsa-miR-6799-5p']
ACTGTAG: 3 - ['hsa-miR-489-3p']
AAAGAAG: 3 - []
AAACCTG: 3 - ['hsa-miR-7850-5p']
TGCATCT: 3 - ['hsa-miR-1277-3p']
TTGTATT: 3 - ['hsa-miR-4803']
ATTCTGA: 3 - ['hsa-miR-499a-5p']
GACCTTT: 3 - ['hsa-miR-875-3p']
TCAGGGA: 3 - ['hsa-miR-2114-5p']
CTTTTGT: 3 - ['hsa-miR-570-3p']
AGTAAAT: 3 - ['hsa-miR-5696']
ATGAATG: 3 - []
GATCTTT: 3 - ['hsa-miR-6508-5p' 'hsa-miR-8067']
TTTATTA: 3 - ['hsa-miR-5692a']
ATCAAGA: 3 - []
TTTTAAA: 3 - ['hsa-miR-4282']
CCCCAAA: 2 - ['hsa-miR-193b-5p']
CAAACAA: 2 - ['hsa-miR-8068']
AGTGATG: 2 - []
AATAACA: 2 - []
ACATTAT: 2 - ['hsa-miR-2054']
TAAGACA: 2 - ['hsa-miR-4778-5p']
CTC

# experiments

These temporary cells are not a part of the main code. They're here to test stuff & will be deleted later.

In [13]:
# code that drops 7mers that doesn't start with an A (experiment for anchor A filtering)
mirna_df["7mer-A1"].replace({"^[C,T,G]": None}, regex=True, inplace=True)

mirna_df.head()

Unnamed: 0,name,miRNA_seq,cDNA_seq,MRE,8mer,7mer-A1,7mer-m8,6mer,offset_6mer
1,hsa-let-7a-5p,UGAGGUAGUAGGUUGUAUAGUU,ACTCCATCATCCAACATATCAA,CTCCATCA,ACTCCATC,ACTCCAT,CTCCATC,CTCCAT,TCCATC
2,hsa-let-7a-3p,CUAUACAAUCUACUGUCUUUC,GATATGTTAGATGACAGAAAG,ATATGTTA,,,ATATGTT,ATATGT,TATGTT
3,hsa-let-7a-2-3p,CUGUACAGCCUCCUAGCUUUCC,GACATGTCGGAGGATCGAAAGG,ACATGTCG,,,ACATGTC,ACATGT,CATGTC
4,hsa-let-7b-5p,UGAGGUAGUAGGUUGUGUGGUU,ACTCCATCATCCAACACACCAA,CTCCATCA,ACTCCATC,ACTCCAT,CTCCATC,CTCCAT,TCCATC
5,hsa-let-7b-3p,CUAUACAACCUACUGCCUUCCC,GATATGTTGGATGACGGAAGGG,ATATGTTG,,,ATATGTT,ATATGT,TATGTT


In [14]:
# writes whole sequence string to a file

with open("sequence.txt", "w") as export:
    export.write(sequence)

In [15]:
# find & print specific miRNA from the db


mirna_df.loc[mirna_df['name'] == "hsa-miR-6867-5p"]


Unnamed: 0,name,miRNA_seq,cDNA_seq,MRE,8mer,7mer-A1,7mer-m8,6mer,offset_6mer
2403,hsa-miR-6867-5p,UGUGUGUGUAGAGGAAGAAGGGA,ACACACACATCTCCTTCTTCCCT,CACACACA,ACACACAC,ACACACA,CACACAC,CACACA,ACACAC


In [16]:
# checks if these two sequences are similar

query1 = "UGUGUGUGUAGAGGAAGAAGGGA"
query2 = "UGUGUGUGUAGAGGAAGAAGGGA"

def check_sequence_similarity(seq1, seq2):
    
    if seq1 == seq2:
        print(seq1 + "\n" + seq2)
        print("Same.")
        
    else:
        print(seq1 + "\n" + seq2)
        print("Different.")
        
check_sequence_similarity(query1, query2)

UGUGUGUGUAGAGGAAGAAGGGA
UGUGUGUGUAGAGGAAGAAGGGA
Same.
