In [1]:
import editdistance

import pandas as pd

from Bio import SeqIO

In [2]:
def mirna_to_mrna(string):
    """
    Return the reverse complement of a miRNA sequence.

    Every base is replaced by its pairing partner (A<->U, C<->G) and the
    complemented sequence is then reversed. Characters that are not one of
    the uppercase letters A, C, G, U (for example 'N' or lowercase bases)
    are copied through unchanged.

    Args:
    - string (str): The miRNA sequence to convert.

    Returns:
    - str: The reverse-complemented sequence, same length as the input.

    Example usage:
    >>> mirna_to_mrna('UGAGGUAGUAGGUUGUAUAGUU')
    'AACUAUACAACCUACUACCUCA'
    """
    complement = {'A': 'U', 'C': 'G', 'G': 'C', 'U': 'A'}
    # Complement base by base; unknown symbols fall back to themselves.
    complemented = ''.join(complement.get(base, base) for base in string)
    # Reverse so the result reads in the opposite direction to the input.
    return complemented[::-1]


def generate_mirna_conservation_column(df, targetscan_path="../../../processed/targetscan/targetscan.csv"):
    """
    Attach a ``conservation`` column to ``df`` by joining on ``accession``.

    The TargetScan table is read from ``targetscan_path`` and reduced to its
    ``accession`` and ``conservation`` columns before a left merge, so every
    row of ``df`` is kept and accessions absent from TargetScan receive NaN.

    Args:
    - df (pd.DataFrame): Frame with an ``accession`` column.
    - targetscan_path (str): CSV file containing at least ``accession`` and
      ``conservation`` columns. Defaults to the processed TargetScan table.

    Returns:
    - pd.DataFrame: A new frame; the input frame is not modified.
    """
    targetscan = pd.read_csv(targetscan_path)
    targetscan = targetscan[["accession", "conservation"]]
    # NOTE(review): a left merge emits one output row per matching TargetScan
    # row, so repeated accessions in the CSV would duplicate rows of df.
    return df.merge(targetscan, on="accession", how="left")


def find_least_different_string(original_string, column, cache=None):
    """
    Return the entry of ``column`` with the smallest edit distance to
    ``original_string``.

    Distances are computed with ``editdistance.eval``. When several entries
    share the minimum distance, the first one encountered wins. An empty
    ``column`` yields the empty string.

    Args:
    - original_string (str): The query sequence.
    - column (iterable of str): Candidate sequences to compare against.
    - cache (dict, optional): Memo of already computed distances. It is keyed
      by ``(original_string, sequence)`` so one dict can safely be shared
      between calls with different query strings. A fresh dict is used when
      omitted.

    Returns:
    - str: The closest candidate, or '' if ``column`` is empty.
    """
    if cache is None:
        cache = {}
    min_distance = float('inf')
    least_different_string = ''
    for sequence in column:
        # The distance depends on BOTH strings, so both belong in the key;
        # keying on the candidate alone would reuse distances measured
        # against a different query.
        key = (original_string, sequence)
        distance = cache.get(key)
        if distance is None:
            distance = editdistance.eval(original_string, sequence)
            cache[key] = distance
        # Strict comparison keeps the first of several equally close entries.
        if distance < min_distance:
            min_distance = distance
            least_different_string = sequence
    return least_different_string



def create_mirbase_db(fasta_path="mature.fa", species_prefix="hsa"):
    """
    Build a table of miRNAs from a FASTA file.

    Only records whose id starts with ``species_prefix`` are kept. For each
    kept record the stored sequence is the output of ``mirna_to_mrna`` applied
    to the record's sequence, not the raw sequence from the file.

    Args:
    - fasta_path (str): FASTA file to parse. Defaults to "mature.fa" in the
      current working directory.
    - species_prefix (str): Prefix a record id must start with to be kept.
      Defaults to "hsa".

    Returns:
    - pd.DataFrame: Columns ``name``, ``accession`` and ``sequence``; empty
      (but with those columns) when no record matches.
    """
    records = []
    with open(fasta_path) as f:
        for record in SeqIO.parse(f, "fasta"):
            name = str(record.id)
            if not name.startswith(species_prefix):
                continue  # drops entries of other species
            # The accession is taken as the second space-separated token of
            # the header line; presumably "<name> <accession> ..." in miRBase
            # files -- verify if the input comes from another source.
            accession = record.description.split(" ")[1]
            records.append((name, accession, mirna_to_mrna(str(record.seq))))

    return pd.DataFrame(records, columns=["name", "accession", "sequence"])


In [3]:
# preparing dfs: miRBase table with conservation merged in, plus the full
# TargetScan table (its sequence column is needed below)
df = create_mirbase_db()
df = generate_mirna_conservation_column(df)
targetscan = pd.read_csv("../../../processed/targetscan/targetscan.csv")

# get mirnas without conservation, i.e. accessions TargetScan does not list
big_mirnas = df.accession.unique()
small_mirnas = targetscan.accession.unique()
mask = ~df.accession.isin(small_mirnas)

# assigning to a df; conservation starts empty and is filled in below
conservations_to_calculate = (
    df.loc[mask, ["accession", "sequence"]]
    .assign(conservation=None)
)

# candidate sequences are the same for every row, so fetch them once
target_sequences = targetscan.sequence

# append least difference string column: for each uncovered miRNA, the
# TargetScan sequence with the smallest edit distance. Assigning the whole
# column at once also creates it when there are no rows to process, so the
# lookup below cannot fail on a missing column.
conservations_to_calculate["least_different_string"] = [
    find_least_different_string(original_string, target_sequences)
    for original_string in conservations_to_calculate["sequence"]
]

# getting seq:conservation pairs from targetscan
# (if a sequence occurs more than once, the last conservation value wins)
pair_dict = dict(zip(targetscan['sequence'], targetscan['conservation']))

# appending conservation values of least_different_strings
conservations_to_calculate['conservation'] = (
    conservations_to_calculate['least_different_string'].map(pair_dict)
)



In [4]:
# rows of the merged table that received no conservation value
df.loc[df["conservation"].isna()]

Unnamed: 0,name,accession,sequence,conservation
80,hsa-miR-103a-1-5p,MIMAT0037306,CAAGGCAGCACUGUAAAGAAGCC,
90,hsa-miR-196a-1-3p,MIMAT0037307,UCGGGUGGUUUAAUGUUGUUG,
134,hsa-miR-203a-3p,MIMAT0000264,CUAGUGGUCCUAAACAUUUCAC,
153,hsa-miR-217-3p,MIMAT0037308,GGCAAUGCAUUAGGAACUGAUG,
198,hsa-miR-135a-2-3p,MIMAT0037309,UUCAUGGCUUCCAUCCCUACAU,
...,...,...,...,...
2651,hsa-miR-12131,MIMAT0049025,UGGGAGUACACCUCUCCAAA,
2652,hsa-miR-12132,MIMAT0049026,CAUCAUAAUUCUCACAGUAAUA,
2653,hsa-miR-12133,MIMAT0049027,UGUACUUUUUAAUGGUGCCAAG,
2654,hsa-miR-12135,MIMAT0049031,UUUACAAACAAACCUUUA,


In [6]:
# Keep only complete rows (any NaN drops the row), restricted to the three
# output columns.
df = df.dropna()[["accession", "sequence", "conservation"]]

# The helper column was only needed to look up the borrowed conservation.
conservations_to_calculate = conservations_to_calculate.drop(
    columns=["least_different_string"]
)

# Stack the rows with original values on top of the rows with borrowed values.
df = pd.concat([df, conservations_to_calculate], axis=0)

In [8]:
# number of missing values per column in the combined table
df.isnull().sum()

accession       0
sequence        0
conservation    0
dtype: int64

In [12]:
# write the combined table to disk without the index column
df.to_csv(path_or_buf="../../../processed/mirbase/mirbase22.csv", index=False)