In [13]:
# importing dependencies
from Bio import SeqIO
import pandas as pd


In [55]:
def import_fasta(fasta):
    """Read every record of a FASTA file and return them as a single string.

    Args:
        fasta (str): path of the FASTA file to open.

    Returns:
        str: the sequences of all records in the file, concatenated in file
            order with nothing inserted between records.

    Side effects:
        Prints "sequence loaded" once all records have been read.
    """
    pieces = []
    with open(fasta) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            pieces.append(str(record.seq))
        print("sequence loaded")
    return "".join(pieces)


def complement(string):
    """Return the base-by-base RNA complement of ``string``.

    A and U are swapped, as are C and G. The order of the bases is kept (the
    sequence is not reversed).

    Any character that is not one of the upper-case letters A, U, C or G is
    dropped, so the result can be shorter than the input.

    Args:
        string (str): RNA sequence.

    Returns:
        str: complemented sequence.
    """
    pairing = {"A": "U", "U": "A", "C": "G", "G": "C"}
    return "".join(pairing[nuc] for nuc in string if nuc in pairing)


def uracil_to_thymine(string):
    """Return ``string`` with every upper-case "U" replaced by "T".

    All other characters (including lower-case "u") are copied unchanged.

    Args:
        string (str): RNA sequence.

    Returns:
        str: the same sequence written with T instead of U.
    """
    converted = []
    for nuc in string:
        if nuc == "U":
            converted.append("T")
        else:
            converted.append(nuc)
    return "".join(converted)



def generate_kmers(sequence, k):
    """Lazily yield every window of length ``k`` from ``sequence``.

    Windows are produced left to right, advancing one position at a time, so
    consecutive k-mers overlap by ``k - 1`` characters. Nothing is yielded
    when ``sequence`` is shorter than ``k``.

    Args:
        sequence (str): sequence to slide over.
        k (int): window length.

    Yields:
        str: the next k-mer.
    """
    window_count = len(sequence) - k + 1
    start = 0
    while start < window_count:
        yield sequence[start:start + k]
        start += 1


def generate_seed_kmer_dictionary(k, mirna_df):
    """Build a lookup from seed k-mer to the miRNA name(s) that carry it.

    The seed columns consulted depend on ``k``:

    * 6 -> "6mer" and "offset_6mer"
    * 7 -> "7mer-A1" and "7mer-m8"
    * 8 -> "8mer"

    Several miRNAs can share the same seed. Every name is kept: when more
    than one miRNA maps to a seed, the names are joined with ";" in the order
    they are first encountered, and each name appears only once per seed.
    A seed owned by a single miRNA maps to that name alone.

    Args:
        k (int): seed length.
        mirna_df (pandas.DataFrame): table with a "name" column and the seed
            columns listed above for the requested ``k``.

    Returns:
        dict[str, str]: seed -> miRNA name, or ";"-joined names when the seed
            is shared. Empty when ``k`` is not 6, 7 or 8.
    """
    seed_columns_by_k = {
        6: ("6mer", "offset_6mer"),
        7: ("7mer-A1", "7mer-m8"),
        8: ("8mer",),
    }

    mirna_names = mirna_df["name"].tolist()
    names_by_seed = {}

    for column in seed_columns_by_k.get(k, ()):
        for seed, name in zip(mirna_df[column].tolist(), mirna_names):
            owners = names_by_seed.setdefault(seed, [])
            # The same miRNA can contribute one seed through two columns;
            # list it only once.
            if name not in owners:
                owners.append(name)

    # A plain {seed: name} mapping would keep only the last miRNA for a
    # shared seed, so the owners are joined instead of overwritten.
    return {seed: ";".join(owners) for seed, owners in names_by_seed.items()}


def find_MRE_positions(kmer_generator, kmer_dictionary, k):
    """Report every k-mer of the target that is a key of ``kmer_dictionary``.

    Matching is an exact, case-sensitive dictionary lookup.

    Args:
        kmer_generator (iterable of str): k-mers of the target, in the order
            they occur along the sequence.
        kmer_dictionary (dict): seed k-mer -> value stored in the "name"
            column for each hit.
        k (int): k-mer length, used to compute the end coordinate.

    Returns:
        pandas.DataFrame: one row per hit with columns "kmer", "name",
            "start" and "end". Coordinates are 1-based and inclusive.
    """
    hits = []

    for offset, kmer in enumerate(kmer_generator):
        if kmer not in kmer_dictionary:
            continue
        # offset is 0-based; the reported coordinates are 1-based, inclusive
        hits.append((kmer, kmer_dictionary[kmer], offset + 1, offset + k))

    return pd.DataFrame(hits, columns=["kmer", "name", "start", "end"])


def run_script(k, fasta, mirna_fasta):
    """Scan a target FASTA for k-mers that match miRNA seed sequences.

    Args:
        k (int): k-mer / seed length.
        fasta (str): path of the target FASTA file.
        mirna_fasta (str): path of the miRNA FASTA file.

    Returns:
        tuple: ``(hits, mirna_df)`` where ``hits`` is the DataFrame returned
            by ``find_MRE_positions`` and ``mirna_df`` is the miRNA table
            returned by ``create_mirna_database``.
    """
    # target branch: load the sequence and slide a window of length k over it
    target_kmers = generate_kmers(import_fasta(fasta), k)

    # miRNA branch: build the miRNA table and the seed lookup for this k
    mirna_df = create_mirna_database(mirna_fasta)
    seed_lookup = generate_seed_kmer_dictionary(k, mirna_df)

    hits = find_MRE_positions(target_kmers, seed_lookup, k)
    return hits, mirna_df


In [56]:
def create_mirna_database(fasta):
    """Build a table of human miRNAs and their seed k-mers from a FASTA file.

    Only records whose id starts with "hsa" are kept. For each kept record
    the sequence is reversed, and three representations are stored:

    * "3to5_miRNA_seq": the reversed sequence as-is
    * "mRNA_seq": ``complement()`` of the reversed sequence
    * "cDNA_seq": the reversed sequence with U replaced by T

    The seed columns are fixed slices taken from the end of "cDNA_seq".

    Args:
        fasta (str): path of the miRNA FASTA file.

    Returns:
        pandas.DataFrame: columns "name", "3to5_miRNA_seq", "mRNA_seq",
            "cDNA_seq", "8mer", "7mer-A1", "7mer-m8", "6mer", "offset_6mer".
    """
    rows = []
    with open(fasta) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            name = str(record.id)
            if not name.startswith("hsa"):  # drops all non-human entries
                continue

            reversed_seq = str(record.seq)[::-1]
            rows.append((
                name,
                reversed_seq,
                complement(reversed_seq),
                uracil_to_thymine(reversed_seq),
            ))

    df = pd.DataFrame(
        rows, columns=["name", "3to5_miRNA_seq", "mRNA_seq", "cDNA_seq"])

    # seed column -> slice of the cDNA string it is cut from
    seed_slices = {
        "8mer": slice(-8, None),
        "7mer-A1": slice(-7, None),
        "7mer-m8": slice(-8, -1),
        "6mer": slice(-7, -1),
        "offset_6mer": slice(-8, -2),
    }
    for column, window in seed_slices.items():
        df[column] = df["cDNA_seq"].str[window]

    # NOTE: filtering of "8mer" / "7mer-A1" values by their terminal base is
    # currently disabled; every derived seed is kept.
    return df


In [None]:
def alt_create_mirna_database(fasta):
    """Collect the names and reversed sequences of human miRNAs from a FASTA.

    Only records whose id starts with "hsa" are kept.

    Args:
        fasta (str): path of the miRNA FASTA file.

    Returns:
        tuple[list[str], list[str]]: ``(names, raw_3to5)``, two parallel
            lists holding each kept record's id and its reversed sequence.
    """
    names = []
    raw_3to5 = []

    with open(fasta) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            record_name = str(record.id)
            if not record_name.startswith("hsa"):  # drops all non-human entries
                continue

            names.append(record_name)
            raw_3to5.append(str(record.seq)[::-1])

    return names, raw_3to5

def alt_run_script(k, fasta, mirna_fasta):
    """Scan a target FASTA for k-mers that match miRNA seed sequences.

    NOTE(review): this performs the same steps as ``run_script`` and builds
    the miRNA table with ``create_mirna_database``, not
    ``alt_create_mirna_database`` -- confirm whether that is intended.

    Args:
        k (int): k-mer / seed length.
        fasta (str): path of the target FASTA file.
        mirna_fasta (str): path of the miRNA FASTA file.

    Returns:
        tuple: ``(hits, mirna_df)`` where ``hits`` is the DataFrame returned
            by ``find_MRE_positions`` and ``mirna_df`` is the miRNA table.
    """
    # target branch
    target_kmers = generate_kmers(import_fasta(fasta), k)

    # miRNA branch
    mirna_df = create_mirna_database(mirna_fasta)
    seed_lookup = generate_seed_kmer_dictionary(k, mirna_df)

    hits = find_MRE_positions(target_kmers, seed_lookup, k)
    return hits, mirna_df



## Files help:

FASTA of GRCh38 chromosome 1 (soft-masked), downloaded from:

https://ftp.ensembl.org/pub/current_fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.1.fa.gz

FASTA of mature miRNA sequences, downloaded from:

https://www.mirbase.org/ftp.shtml

## How to run the script:

1. Set the k-mer size `k` (6, 7 or 8).
2. Set the path of the target FASTA.
3. Set the path of the miRNA database FASTA.

In [57]:
# k-mer / seed length; the seed dictionary is only populated for k = 6, 7 or 8
k = 7
# whole-chromosome target (defined here, but the run cell below passes `hmga2` instead)
fasta = "../sequences/grch38/chromosomes/Homo_sapiens.GRCh38.dna_sm.chromosome.1.fa"
# single-sequence target used by the run cell below
hmga2 = "../sequences/HMGA2_ENST00000403681_7_sequence.fa"
# miRNA FASTA from which the seed table is built
mirna_fasta = "../sequences/mirna/mature_miRNAs.fa"

In [58]:
# Scan the HMGA2 target (not the chromosome-1 FASTA) for seed matches.
results, mirna_df = run_script(k, hmga2, mirna_fasta)

# The string literal below is disabled export code; it is never executed.
""" results.to_csv("file_name", sep='\t')
mirna_df.to_csv("mirna_df.csv", sep='\t')
 """

# Preview the first rows of the miRNA table.
mirna_df.head()


sequence loaded


Unnamed: 0,name,3to5_miRNA_seq,mRNA_seq,cDNA_seq,8mer,7mer-A1,7mer-m8,6mer,offset_6mer
0,hsa-let-7a-5p,UUGAUAUGUUGGAUGAUGGAGU,AACUAUACAACCUACUACCUCA,TTGATATGTTGGATGATGGAGT,GATGGAGT,ATGGAGT,GATGGAG,ATGGAG,GATGGA
1,hsa-let-7a-3p,CUUUCUGUCAUCUAACAUAUC,GAAAGACAGUAGAUUGUAUAG,CTTTCTGTCATCTAACATATC,AACATATC,ACATATC,AACATAT,ACATAT,AACATA
2,hsa-let-7a-2-3p,CCUUUCGAUCCUCCGACAUGUC,GGAAAGCUAGGAGGCUGUACAG,CCTTTCGATCCTCCGACATGTC,GACATGTC,ACATGTC,GACATGT,ACATGT,GACATG
3,hsa-let-7b-5p,UUGGUGUGUUGGAUGAUGGAGU,AACCACACAACCUACUACCUCA,TTGGTGTGTTGGATGATGGAGT,GATGGAGT,ATGGAGT,GATGGAG,ATGGAG,GATGGA
4,hsa-let-7b-3p,CCCUUCCGUCAUCCAACAUAUC,GGGAAGGCAGUAGGUUGUAUAG,CCCTTCCGTCATCCAACATATC,AACATATC,ACATATC,AACATAT,ACATAT,AACATA


In [59]:
# Pull the full table rows for two miRNAs so their derived seeds can be
# compared side by side.
mir_4458 = mirna_df[mirna_df["name"] == "hsa-miR-4458"].values.tolist()
mir_4500 = mirna_df[mirna_df["name"] == "hsa-miR-4500"].values.tolist()





# Each value is a list of rows; every row lists the columns of mirna_df in order.
print(mir_4458)
print(mir_4500)

[['hsa-miR-4458', 'AAGAAGGUGUGGAUGGAGA', 'UUCUUCCACACCUACCUCU', 'AAGAAGGTGTGGATGGAGA', 'GATGGAGA', 'ATGGAGA', 'GATGGAG', 'ATGGAG', 'GATGGA']]
[['hsa-miR-4500', 'UUCUUUGAUGAUGGAGU', 'AAGAAACUACUACCUCA', 'TTCTTTGATGATGGAGT', 'GATGGAGT', 'ATGGAGT', 'GATGGAG', 'ATGGAG', 'GATGGA']]
