In [1]:
import pandas as pd
import editdistance

In [2]:
def parse_fasta(file_path):
    """Read a FASTA file into a list of ``(header, sequence)`` tuples.

    A line starting with ``>`` opens a new record; its header is stored
    without the leading ``>``. Every other line is stripped of surrounding
    whitespace and appended to the sequence of the current record. Records
    whose sequence is empty are not emitted.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        List of ``(header, sequence)`` tuples in file order.
    """
    records = []
    current_header = ''
    chunks = []

    with open(file_path, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()

            if not text.startswith('>'):
                chunks.append(text)
                continue

            # A new header closes the record collected so far.
            sequence = ''.join(chunks)
            if sequence:
                records.append((current_header, sequence))
            chunks = []
            current_header = text[1:]

    # The last record has no following header to close it.
    sequence = ''.join(chunks)
    if sequence:
        records.append((current_header, sequence))

    return records

def reverse_complement_rna_to_dna(rna_sequence):
    """Return the DNA reverse complement of an RNA sequence.

    The sequence is read from its last base to its first and each base is
    replaced by its DNA partner (A->T, U->A, C->G, G->C).

    Args:
        rna_sequence: String made of the uppercase bases A, U, C and G.

    Returns:
        The reverse-complemented sequence as a DNA string.

    Raises:
        KeyError: If the sequence contains any other character.
    """
    pairing = {'A': 'T', 'U': 'A', 'C': 'G', 'G': 'C'}
    dna_bases = []
    for base in reversed(rna_sequence):
        dna_bases.append(pairing[base])
    return ''.join(dna_bases)

def find_least_different_string(original_string, column, cache=None):
    """Return the entry of ``column`` closest to ``original_string`` by edit distance.

    Args:
        original_string: Query string.
        column: Iterable of candidate strings (e.g. a pandas Series).
        cache: Optional dict used to memoize distances. Entries are keyed by
            ``(original_string, candidate)``, so one dict can safely be
            shared across calls with different query strings. When omitted,
            a fresh dict is used for this call only.

    Returns:
        The first candidate with the smallest edit distance to
        ``original_string``, or ``''`` when ``column`` is empty.
    """
    if cache is None:
        cache = {}
    min_distance = float('inf')
    least_different_string = ''
    for sequence in column:
        # The distance depends on both strings; keying on the candidate alone
        # would hand back distances computed for a different query.
        key = (original_string, sequence)
        distance = cache.get(key)
        if distance is None:
            distance = editdistance.eval(original_string, sequence)
            cache[key] = distance
        if distance < min_distance:
            min_distance = distance
            least_different_string = sequence
            if distance == 0:
                # An exact match cannot be beaten; later ties never replace it.
                break
    return least_different_string

In [3]:
mirbase = parse_fasta('mirbase_22.fa')

# Split each header on spaces and keep its first two fields (name, accession)
# together with the record's sequence.
parsed_mirbase = [
    (fields[0], fields[1], seq)
    for fields, seq in ((header.split(' '), seq) for header, seq in mirbase)
]

# Keep entries whose name starts with "hsa", convert each sequence to its DNA
# reverse complement, and order the rows by accession.
mb_df = (
    pd.DataFrame(parsed_mirbase, columns=['mirna_name', 'mirna_accession', 'sequence'])
    .query('mirna_name.str.startswith("hsa")')
    .assign(sequence=lambda frame: frame["sequence"].map(reverse_complement_rna_to_dna))
    .sort_values("mirna_accession")
    .reset_index(drop=True)
)

In [4]:
# Map the TargetScan column headers onto the names used for the miRBase frame.
renaming_dict = {
    "Mature sequence": "sequence",
    "Family Conservation?": "conservation",
    "MiRBase Accession": "mirna_accession",
    'MiRBase ID': 'mirna_name',
    "miR family": "mirna_family"
}

# Columns are selected by name rather than by position, so the result no
# longer depends on the column order of the input file. Columns not listed
# (e.g. "Species ID", "Seed+m8") are left out by the selection itself.
ts_df = (pd.read_csv('targetscan_mirnas.txt', sep='\t')
         .rename(columns=renaming_dict)
         .query('mirna_name.str.startswith("hsa")')
         .loc[:, ["mirna_name", "mirna_accession", "mirna_family", "conservation", "sequence"]]
         # Same orientation as the miRBase frame: DNA reverse complement
         .assign(sequence= lambda x: x.sequence.apply(reverse_complement_rna_to_dna))
         .sort_values("mirna_accession")
         .reset_index(drop=True)
         )

ts_df.head()

Unnamed: 0,mirna_name,mirna_accession,mirna_family,conservation,sequence
0,hsa-let-7a-5p,MIMAT0000062,let-7-5p/98-5p,2,AACTATACAACCTACTACCTCA
1,hsa-let-7b-5p,MIMAT0000063,let-7-5p/98-5p,2,AACCACACAACCTACTACCTCA
2,hsa-let-7c-5p,MIMAT0000064,let-7-5p/98-5p,2,AACCATACAACCTACTACCTCA
3,hsa-let-7d-5p,MIMAT0000065,let-7-5p/98-5p,2,AACTATGCAACCTACTACCTCT
4,hsa-let-7e-5p,MIMAT0000066,let-7-5p/98-5p,2,AACTATACAACCTCCTACCTCA


In [5]:
# Attach the TargetScan columns to every miRBase entry; entries without an
# exact (name, accession, sequence) match keep NaN in those columns.
df = mb_df.merge(ts_df, how='left', on=["mirna_name", "mirna_accession", "sequence"])

# Rows that still lack a conservation value after the merge
mask = df['conservation'].isna()

# Lookup table: TargetScan sequence -> conservation value
pair_dict = dict(zip(ts_df['sequence'], ts_df['conservation']))

# The candidate sequences are the same for every row, so fetch them once
target_sequences = ts_df['sequence']

# Give each unmatched row the conservation value of the TargetScan sequence
# with the smallest edit distance to its own sequence.
for i, original_string in df.loc[mask, 'sequence'].items():
    least_different_string = find_least_different_string(original_string, target_sequences)
    df.at[i, 'conservation'] = pair_dict.get(least_different_string, None)




# df["mirna_family"] = df["mirna_family"].fillna('not_classified')

In [6]:
# Count the missing values remaining in each column of df
df.isna().sum()

mirna_name           0
mirna_accession      0
sequence             0
mirna_family       142
conservation         0
dtype: int64

# miRCarta: fill in missing miRNA families



In [7]:
mircarta = pd.read_csv("miRCarta - miRBase family.csv")

# Family name -> list of precursor names
# (the 'Precursor' field is a '; '-separated string)
family_to_precursors = (
    mircarta.set_index('Family')['Precursor']
    .apply(lambda joined: joined.split('; '))
    .to_dict()
)

# Invert to precursor -> family, rewriting "mir" as "miR" in each precursor name
precursor_families = {
    precursor.replace("mir", "miR"): family
    for family, precursors in family_to_precursors.items()
    for precursor in precursors
}
precursor_families

{'hsa-miR-106a': 'mir-17',
 'hsa-miR-106b': 'mir-17',
 'hsa-miR-17': 'mir-17',
 'hsa-miR-18a': 'mir-17',
 'hsa-miR-18b': 'mir-17',
 'hsa-miR-20a': 'mir-17',
 'hsa-miR-20b': 'mir-17',
 'hsa-miR-93': 'mir-17',
 'hsa-let-7a-1': 'let-7',
 'hsa-let-7a-2': 'let-7',
 'hsa-let-7a-3': 'let-7',
 'hsa-let-7b': 'let-7',
 'hsa-let-7c': 'let-7',
 'hsa-let-7d': 'let-7',
 'hsa-let-7e': 'let-7',
 'hsa-let-7f-1': 'let-7',
 'hsa-let-7f-2': 'let-7',
 'hsa-let-7g': 'let-7',
 'hsa-let-7i': 'let-7',
 'hsa-miR-98': 'let-7',
 'hsa-miR-30a': 'mir-30',
 'hsa-miR-30b': 'mir-30',
 'hsa-miR-30c-1': 'mir-30',
 'hsa-miR-30c-2': 'mir-30',
 'hsa-miR-30d': 'mir-30',
 'hsa-miR-30e': 'mir-30',
 'hsa-miR-15a': 'mir-15',
 'hsa-miR-15b': 'mir-15',
 'hsa-miR-16-1': 'mir-15',
 'hsa-miR-16-2': 'mir-15',
 'hsa-miR-195': 'mir-15',
 'hsa-miR-181a-1': 'mir-181',
 'hsa-miR-181a-2': 'mir-181',
 'hsa-miR-181b-1': 'mir-181',
 'hsa-miR-181b-2': 'mir-181',
 'hsa-miR-181c': 'mir-181',
 'hsa-miR-181d': 'mir-181',
 'hsa-miR-29a': 'mir-29',


In [8]:
# Rows that still have no family assigned
df.loc[df["mirna_family"].isna()]

Unnamed: 0,mirna_name,mirna_accession,sequence,mirna_family,conservation
37,hsa-miR-101-3p,MIMAT0000099,TTCAGTTATCACAGTACTGTA,,2.0
56,hsa-miR-7-5p,MIMAT0000252,AACAACAAAATCACTAGTCTTCCA,,2.0
65,hsa-miR-183-5p,MIMAT0000261,AGTGAATTCTACCAGTGCCATA,,2.0
68,hsa-miR-203a-3p,MIMAT0000264,CTAGTGGTCCTAAACATTTCAC,,2.0
78,hsa-miR-217-5p,MIMAT0000274,TCCAATCAGTTCCTGATGCAGTA,,2.0
...,...,...,...,...,...
2651,hsa-miR-12131,MIMAT0049025,TGGGAGTACACCTCTCCAAA,,-1.0
2652,hsa-miR-12132,MIMAT0049026,CATCATAATTCTCACAGTAATA,,2.0
2653,hsa-miR-12133,MIMAT0049027,TGTACTTTTTAATGGTGCCAAG,,2.0
2654,hsa-miR-12135,MIMAT0049031,TTTACAAACAAACCTTTA,,-1.0


In [None]:
# Look up each miRNA name in the miRCarta precursor -> family table.
# NOTE(review): precursor_families is keyed by precursor-style names, while
# mirna_name may carry an arm suffix such as -3p/-5p; names with a suffix will
# not match here -- confirm this is intended.
mapped_families = df["mirna_name"].map(precursor_families)

# Tag every family found this way with a "_mircarta" suffix; misses stay NaN
df["new_families"] = mapped_families.map(
    lambda fam: fam if pd.isna(fam) else f"{fam}_mircarta"
)

# Keep existing families and use the miRCarta ones only where none is set
df["mirna_family"] = df["mirna_family"].where(df["mirna_family"].notna(), df["new_families"])

In [10]:
# predict mirna_family name from mirna_name and add _pred suffix to all
df['mirna_name_pred'] = df['mirna_name'].str.split('-', expand=True)[1] + '-' + df['mirna_name'].str.split('-', expand=True)[2] + '_pred'

# append to family col
df['mirna_family'] = df['mirna_family'].combine_first(df['mirna_name_pred'])

df.drop(columns=["new_families",'mirna_name_pred'], inplace=True)


In [11]:
# Re-count the missing values per column after the family columns were filled
df.isna().sum()

mirna_name         0
mirna_accession    0
sequence           0
mirna_family       0
conservation       0
dtype: int64

In [12]:
# Write df to mirna.csv without the index column
df.to_csv('mirna.csv', index=False)