In [1]:
import pandas as pd
from Bio import SeqIO

In [2]:
def mirna_to_mrna(string):
    """
    Return the reverse complement of an RNA sequence.

    Every A, C, G and U is replaced by its pairing base (A<->U, C<->G) and
    the resulting sequence is reversed. Any other character (lowercase
    letters, 'N', 'T', gaps, ...) is copied through unchanged, only its
    position is mirrored.

    Args:
    - string (str): The miRNA sequence to convert.

    Returns:
    - str: The reverse-complemented sequence, same length as the input.

    Example usage:
    >>> mirna_to_mrna('UGAGGUAGUAGGUUGUAUAGUU')
    'AACUAUACAACCUACUACCUCA'
    >>> mirna_to_mrna('')
    ''
    """
    pairing = {'A': 'U', 'C': 'G', 'G': 'C', 'U': 'A'}
    # Complement base by base first; unknown symbols fall back to themselves.
    complemented = ''.join(pairing.get(base, base) for base in string)
    # Reverse so the result reads in the opposite strand direction.
    return complemented[::-1]

In [3]:
def create_mirbase_db(fasta_path="mature.fa", species_prefix="hsa"):
    """
    Build a table of mature miRNA records from a FASTA file.

    Only records whose id starts with ``species_prefix`` are kept. Each kept
    sequence is passed through ``mirna_to_mrna`` before being stored.

    Args:
    - fasta_path (str): Path of the FASTA file to read. Defaults to
      "mature.fa" in the current working directory.
    - species_prefix (str): Prefix a record id must start with to be kept.
      Defaults to "hsa".

    Returns:
    - pandas.DataFrame: One row per kept record, with the columns
      "name", "accession" and "sequence".

    Raises:
    - FileNotFoundError: If ``fasta_path`` does not exist.
    - IndexError: If a kept record's header has no second
      space-separated field.
    """
    rows = []

    with open(fasta_path) as f:
        for record in SeqIO.parse(f, "fasta"):
            name = str(record.id)
            if not name.startswith(species_prefix):
                continue  # drops all entries outside the requested prefix

            # The accession is taken as the second space-separated token of
            # the header line (assumes headers look like
            # "<name> <accession> ..." -- TODO confirm for other inputs).
            accession = record.description.split(" ")[1]

            rows.append((name, accession, mirna_to_mrna(str(record.seq))))

    return pd.DataFrame(rows, columns=["name", "accession", "sequence"])


In [5]:
# Build the table with create_mirbase_db() and write it out as CSV,
# omitting the DataFrame index column.
df = create_mirbase_db()
df.to_csv("../../../processed/mirbase/mirbase22.csv", index=False)