In [None]:
# importing dependencies
from Bio import SeqIO
import pandas as pd
from collections import Counter


# Importing main sequence

Write the relative path of your FASTA file in the cell below.


In [None]:
# import & parse fasta

with open("../../data/bcl2_3utr.fasta") as f:
    records = SeqIO.parse(f, "fasta")

    # Upper-case every record: the k-mer matching and the per-base counts in
    # the later cells compare against upper-case letters, so a soft-masked
    # (lower-case) FASTA would otherwise silently produce no hits.
    transcripts = [str(rec.seq).upper() for rec in records]

# string containing whole sequence
# NOTE(review): records are joined without a separator, so when the file holds
# more than one record, k-mers can span the junction between two transcripts.
# Confirm the input is a single record or that this is intended.
sequence = "".join(transcripts)


# Importing miRNA sequence database

Download "mature.fa" from the [miRBase Database](https://www.mirbase.org/ftp.shtml) and point the path in the cell below to it.

In [None]:
# opening miRNA sequence database

identifiers = []
lengths = []
sequences = []
cDNA = []

with open("../../data/mature_miRNAs.fa") as fasta_file:
    for seq_record in SeqIO.parse(fasta_file, "fasta"):
        identifiers.append(seq_record.id)
        lengths.append(len(seq_record.seq))
        sequences.append(str(seq_record.seq))

        # back_transcribe() changes U's into T's, complement() takes the complement
        # NOTE(review): only the complement is taken, not the reverse complement,
        # so cDNA_seq is read in the opposite direction to the miRNA's binding
        # partner. Confirm the target sequence is oriented to match before
        # trusting the hits.
        cDNA.append(str(seq_record.seq.back_transcribe().complement()))


# building the pandas df, one column per parsed list
mirna_df = pd.DataFrame(
    {"name": identifiers, "miRNA_seq": sequences, "cDNA_seq": cDNA}
)

# removing non-human entries
# .copy() makes the subset an independent frame, so the column assignments
# below never write into a view of the unfiltered table.
mirna_df = mirna_df[mirna_df["name"].str.contains("hsa")].copy()

# fixing index numbers
mirna_df.index = range(1, len(mirna_df) + 1)


# deriving MRE sequences and adding as a new column
mirna_df["MRE"] = mirna_df["cDNA_seq"].str[1:9]

# deriving kmer columns
mirna_df["8mer"] = mirna_df["cDNA_seq"].str[0:8]
mirna_df["7mer-A1"] = mirna_df["cDNA_seq"].str[0:7]
mirna_df["7mer-m8"] = mirna_df["cDNA_seq"].str[1:8]
mirna_df["6mer"] = mirna_df["cDNA_seq"].str[1:7]

# dropping 8mer and 7mer-A1 values that don't start with an A
# The filtered column is assigned back explicitly: an in-place replace() on a
# column selected with mirna_df[...] can act on a temporary object and leave
# the frame itself unchanged. Values that fail the test become missing (NaN).
for anchored_column in ("8mer", "7mer-A1"):
    starts_with_a = mirna_df[anchored_column].str.startswith("A", na=False)
    mirna_df[anchored_column] = mirna_df[anchored_column].where(starts_with_a)


mirna_df.head()

# Exploratory Data Analysis

In [None]:
#checking miRNAs for duplicates

print(f"Total number of miRNA sequences: {len(mirna_df)}")

# (label, column) pairs: one duplicate count is printed per pair, in this order
duplicate_checks = (
    ("Duplicated MRE sequences: ", "MRE"),
    ("Duplicated miRNA sequences: ", "miRNA_seq"),
    ("Duplicated IDs: ", "name"),
)
for label, column_name in duplicate_checks:
    print(label, mirna_df[column_name].duplicated().sum())

mirna_df.head()

In [None]:
# exploring the imported sequence

# Count every character in one pass instead of rescanning the sequence per base;
# a base that never occurs is reported as 0.
base_counts = Counter(sequence)

print(f"The length of target sequence: {len(sequence)}")
print(f"Total number of transcripts in the file: {len(transcripts)}")
for base in "ATCG":
    print(f"Total number of {base}: {base_counts[base]}")

# An empty sequence would raise ZeroDivisionError; report NaN instead.
if sequence:
    gc_ratio = (base_counts["G"] + base_counts["C"]) / len(sequence)
else:
    gc_ratio = float("nan")
print(f"GC ratio:  {gc_ratio}")


# Required functions

In [None]:
# generator function to find kmers of k size:

def generate_kmers(sequence, k=6):
    """Yield every overlapping substring of length ``k`` from ``sequence``.

    Windows are produced left to right, advancing one position at a time.
    Nothing is yielded when ``sequence`` is shorter than ``k``.

    Args:
        sequence: Any sliceable sequence with a length (typically a string).
        k: Window size; must be at least 1.

    Yields:
        Slices ``sequence[i:i + k]`` for each valid start index ``i``.

    Raises:
        ValueError: If ``k`` is smaller than 1. Because this is a generator,
            the error surfaces when iteration starts, not at call time.
    """
    # Without this guard k == 0 yields empty strings and a negative k yields
    # meaningless slices instead of failing.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    for start in range(len(sequence) - k + 1):
        yield sequence[start:start + k]

In [None]:
# function to find seed sequences in kmers
def find_seed_in_kmers(kmer_generator, seeds):
    """Count how often each known seed occurs among the generated k-mers.

    Args:
        kmer_generator: Iterable of k-mers to scan; it is consumed once.
        seeds: Collection of seed k-mers to look for. A dict (keys are used),
            set, frozenset, list or tuple is accepted. It is never modified.

    Returns:
        dict mapping each seed that occurred at least once to its number of
        occurrences, ordered from most to least frequent. Seeds with equal
        counts keep the order in which they were first seen.
    """
    # Dicts and sets already give constant-time membership tests; any other
    # iterable is converted once so each k-mer lookup does not rescan it.
    # No defensive copy is needed because seeds is only read.
    if isinstance(seeds, (dict, set, frozenset)):
        known_seeds = seeds
    else:
        known_seeds = frozenset(seeds)

    hits = Counter(kmer for kmer in kmer_generator if kmer in known_seeds)

    # most_common() sorts by descending count and is stable for ties.
    return dict(hits.most_common())

In [None]:
# creating a dict that holds all possible kmers to iterate on

columns_to_merge = ["MRE", "7mer-A1", "7mer-m8", "6mer"]

# flatten the selected columns, one column after another, into a single list
merged_kmers = [
    kmer
    for column in columns_to_merge
    for kmer in mirna_df[column].tolist()
]

# final dict containing all possible kmers and their corresponding hit values (currently 0)
seed_kmers = dict.fromkeys(merged_kmers, 0)

# Results

The cell below computes and prints the 8mer matches.


In [None]:
# generate_kmers yields the 8-long kmers of the imported fasta
# find_seed_in_kmers counts the ones that are known seeds
seeds = find_seed_in_kmers(generate_kmers(sequence, 8), seed_kmers)


# report only the seeds with at least one hit
for kmer, count in seeds.items():
    if count == 0:
        continue
    # names of the miRNAs whose MRE equals this kmer
    has_this_mre = mirna_df["MRE"] == kmer
    matching_mirnas = mirna_df.loc[has_this_mre, "name"].values
    print(f"Seed: {kmer},    {count} matches,    matching miRNA(s): {matching_mirnas}")

The cell below computes and prints the 7mer matches.

In [None]:
# top matching 7mers

seeds = find_seed_in_kmers(generate_kmers(sequence, 7), seed_kmers)

for kmer, count in seeds.items():
    if count == 0:
        continue
    # a 7mer hit may come from either of the two 7mer columns
    in_either_7mer_column = (mirna_df['7mer-A1'] == kmer) | (mirna_df['7mer-m8'] == kmer)
    possible_sequences = mirna_df.loc[in_either_7mer_column, "name"].values
    print(f"{kmer}: {count} - {possible_sequences}")

# Experiments

These temporary cells are not a part of the main code. They're here to test stuff & will be deleted later.

In [None]:
# code that drops 7mers that don't start with an A (experiment for anchor A filtering)
# The filtered column is assigned back explicitly: an in-place replace() on a
# column selected with mirna_df[...] can act on a temporary object and leave
# the frame itself unchanged. Values that fail the test become missing (NaN).
starts_with_a = mirna_df["7mer-A1"].str.startswith("A", na=False)
mirna_df["7mer-A1"] = mirna_df["7mer-A1"].where(starts_with_a)

mirna_df.head()

In [None]:
# dumps the concatenated sequence string into a text file

with open("sequence.txt", mode="w") as sequence_file:
    sequence_file.write(sequence)