In [1]:
import pandas as pd
import editdistance

In [2]:
def parse_fasta(file_path):
    """Read a FASTA file into a list of ``(header, sequence)`` tuples.

    Lines are stripped of surrounding whitespace. A line starting with ``>``
    begins a new record and its text (without the ``>``) becomes the header;
    every other line is appended to the current record's sequence.

    Records whose sequence is empty are not emitted. Sequence text that
    appears before the first header is stored under the empty header ``''``.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        List of ``(header, sequence)`` tuples in file order.
    """
    records = []
    current_header = ''
    pieces = []

    with open(file_path, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()

            if not text.startswith('>'):
                pieces.append(text)
                continue

            # A new header closes the record collected so far, but only if
            # that record actually has sequence content.
            joined = ''.join(pieces)
            if joined:
                records.append((current_header, joined))
                pieces = []

            current_header = text[1:]

    # The last record has no following header to close it.
    joined = ''.join(pieces)
    if joined:
        records.append((current_header, joined))

    return records

def reverse_complement_rna_to_dna(rna_sequence):
    """Return the reverse complement of an RNA sequence, written as DNA.

    The input is read back to front and each base is replaced by its DNA
    complement (A->T, U->A, C->G, G->C).

    Args:
        rna_sequence: Sequence made only of the upper-case bases A, U, C, G.

    Returns:
        The reverse-complemented sequence as a string.

    Raises:
        KeyError: If the sequence contains any other character
            (including lower-case letters or ``T``).
    """
    pairing = {'A': 'T', 'U': 'A', 'C': 'G', 'G': 'C'}
    dna_bases = []
    for base in reversed(rna_sequence):
        dna_bases.append(pairing[base])
    return ''.join(dna_bases)

def find_least_different_string(original_string, column, cache=None):
    """Return the entry of ``column`` with the smallest edit distance to ``original_string``.

    Distances are computed with ``editdistance.eval``. When several entries
    share the smallest distance, the first one encountered is returned.

    Args:
        original_string: The query string.
        column: Iterable of candidate strings (entries must be hashable).
        cache: Optional dict used to memoise distances. Entries are keyed by
            ``(original_string, candidate)`` so the same dict can safely be
            shared across calls with different query strings. A fresh dict
            is used when omitted.

    Returns:
        The closest candidate, or ``''`` when ``column`` is empty.
    """
    if cache is None:
        cache = {}

    min_distance = float('inf')
    least_different_string = ''

    for sequence in column:
        # The distance depends on both strings, so both belong in the key;
        # keying on the candidate alone would return stale values when the
        # cache is reused for another query.
        key = (original_string, sequence)
        if key not in cache:
            cache[key] = editdistance.eval(original_string, sequence)
        distance = cache[key]

        if distance < min_distance:
            min_distance = distance
            least_different_string = sequence
            if distance == 0:
                # An exact match cannot be beaten, and later ties never
                # replace the first minimum, so the scan can stop here.
                break

    return least_different_string

In [3]:
mirbase = parse_fasta('mirbase_22.fa')

# Each header is split on single spaces; the first two fields become the
# miRNA name and accession, and are paired with the record's sequence.
parsed_mirbase = [
    (fields[0], fields[1], residues)
    for fields, residues in (
        (header.split(' '), residues) for header, residues in mirbase
    )
]

# Keep entries whose name starts with "hsa", convert every sequence with
# reverse_complement_rna_to_dna, and order the table by accession.
mb_df = (
    pd.DataFrame(parsed_mirbase, columns=['mirna_name', 'mirna_accession', 'sequence'])
    .query('mirna_name.str.startswith("hsa")')
    .assign(sequence=lambda frame: frame['sequence'].map(reverse_complement_rna_to_dna))
    .sort_values(by="mirna_accession")
    .reset_index(drop=True)
)


In [4]:
renaming_dict = {
    "Mature sequence": "sequence",
    "Family Conservation?": "conservation",
    "MiRBase Accession": "mirna_accession"
}

# Load the TargetScan table, keep rows whose name starts with "hsa", convert
# every sequence with reverse_complement_rna_to_dna, and order by accession.
#
# The four output columns are selected by name (after renaming) rather than by
# dropping the others and reordering by position, so a change in the column
# layout of the input file raises a KeyError instead of silently selecting the
# wrong columns.
ts_df = (
    pd.read_csv('targetscan_mirnas.txt', sep='\t')
    .rename(columns={'MiRBase ID': 'mirna_name', **renaming_dict})
    .query('mirna_name.str.startswith("hsa")')
    .loc[:, ["mirna_name", "mirna_accession", "sequence", "conservation"]]
    .assign(sequence=lambda frame: frame["sequence"].map(reverse_complement_rna_to_dna))
    .sort_values("mirna_accession")
    .reset_index(drop=True)
)

ts_df.head()

Unnamed: 0,mirna_name,mirna_accession,sequence,conservation
0,hsa-let-7a-5p,MIMAT0000062,AACTATACAACCTACTACCTCA,2
1,hsa-let-7b-5p,MIMAT0000063,AACCACACAACCTACTACCTCA,2
2,hsa-let-7c-5p,MIMAT0000064,AACCATACAACCTACTACCTCA,2
3,hsa-let-7d-5p,MIMAT0000065,AACTATGCAACCTACTACCTCT,2
4,hsa-let-7e-5p,MIMAT0000066,AACTATACAACCTCCTACCTCA,2


In [5]:
# Accession -> conservation lookup taken from the TargetScan table.
targetscan = ts_df.loc[:, ["mirna_accession", "conservation"]]

# Left join: every miRBase row is kept; rows whose accession is absent from
# TargetScan end up with a missing conservation value.
df = mb_df.merge(
    targetscan,
    how='left',
    on='mirna_accession',
)

In [6]:
# Rows whose accession has no TargetScan entry need an imputed conservation.
# The split is driven by this mask (not by which rows currently hold NaN), so
# re-running the cell rebuilds the same table instead of appending the imputed
# rows a second time.
mask = ~df.mirna_accession.isin(targetscan.mirna_accession)

# TargetScan sequences to compare against, and their conservation values.
# If a sequence occurs more than once, the last conservation value wins.
target_sequences = ts_df.sequence
pair_dict = dict(zip(ts_df['sequence'], ts_df['conservation']))

# For every row to impute, find the TargetScan sequence with the smallest
# edit distance. Using map (rather than a row loop that creates the column on
# its first iteration) also works when there is nothing to impute.
nearest_sequences = df.loc[mask, "sequence"].map(
    lambda seq: find_least_different_string(seq, target_sequences)
)

# Imputed rows: conservation is copied from the closest TargetScan sequence.
conservations_to_calculate = (
    df.loc[mask, ["mirna_accession", "sequence"]]
    .assign(conservation=nearest_sequences.map(pair_dict))
)

# Rows that already have a TargetScan conservation value; keep only the three
# output columns and drop any row that still has a missing value.
known_conservations = (
    df.loc[~mask, ["mirna_accession", "sequence", "conservation"]]
    .dropna()
)

# Final table: known rows first, imputed rows after (original index kept).
df = pd.concat([known_conservations, conservations_to_calculate])


In [7]:
# Sanity check: count the missing values in each column of df.
df.isna().sum()

mirna_accession    0
sequence           0
conservation       0
dtype: int64

In [8]:
# Preview the first rows of df.
df.head()

Unnamed: 0,mirna_accession,sequence,conservation
0,MIMAT0000062,AACTATACAACCTACTACCTCA,2.0
1,MIMAT0000063,AACCACACAACCTACTACCTCA,2.0
2,MIMAT0000064,AACCATACAACCTACTACCTCA,2.0
3,MIMAT0000065,AACTATGCAACCTACTACCTCT,2.0
4,MIMAT0000066,AACTATACAACCTCCTACCTCA,2.0


In [9]:
# Write df to mirna.csv, leaving out the DataFrame index.
df.to_csv('mirna.csv', index=False)