In [1]:
cd ../..

/home/nazif/thesis/mirscribe-vcf


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [9]:
import pandas as pd
from thefuzz import fuzz
import numpy as np

In [10]:
# Both paths are relative to the repository root, which the `cd ../..` cell at the
# top of the notebook switches into, so the notebook does not depend on one
# particular home directory.
link = pd.read_csv("data/ensg_to_mirna_name/mart_export.txt")
mirna_df = pd.read_csv("data/mirna/mirna.csv")



In [17]:
def find_matching_mirnas(ensembl_df, mirbase_df, threshold=85):
    """pairs each ensembl row with the first mirbase row sharing its base identifier

    names from both tables have 'hsa-' removed and are lower-cased; the base
    identifier is then the second '-'-separated token ('mir-1184-2' -> '1184'),
    or the whole name when it contains no '-'. matching is exact on that token,
    so a pair scores 100 or is not a candidate at all -- no fuzzy scoring is
    done here.

    args:
        ensembl_df: df with a 'miRBase ID' column
        mirbase_df: df with a 'mirna_name' column; 'mirna_accession' is also
            read whenever at least one match is found
        threshold: minimum score a pair needs to be kept. since the only score
            is 100, any value <= 100 keeps every exact match and any value
            above 100 rejects all of them

    returns:
        (matched, unmatched_e, unmatched_m)
        matched: one row per matched ensembl row, holding the ensembl columns,
            the mirbase columns (which win on a column-name clash) and
            'match_score'; an empty DataFrame when nothing matched
        unmatched_e: rows of ensembl_df with no match (includes missing/empty names)
        unmatched_m: rows of mirbase_df whose 'mirna_accession' is not in matched
    """
    exact_score = 100

    def _base_token(name):
        # NaN / non-string / empty names can never take part in a match
        if not isinstance(name, str) or not name:
            return None
        return name.split('-')[1] if '-' in name else name

    # normalize names and handle NaN values
    ensembl_names = ensembl_df['miRBase ID'].fillna('').str.replace('hsa-', '', regex=False).str.lower()
    mirbase_names = mirbase_df['mirna_name'].fillna('').str.replace('hsa-', '', regex=False).str.lower()

    # Index mirbase once: base token -> position of the FIRST row carrying it.
    # This is the row the previous per-row argmax scan selected, but the lookup
    # is O(1) per ensembl row and is well-defined for an empty mirbase_df
    # (the scan raised on an empty score list).
    first_row_by_base = {}
    for m_idx, m_name in enumerate(mirbase_names):
        m_base = _base_token(m_name)
        if m_base is not None:
            first_row_by_base.setdefault(m_base, m_idx)

    matches = []
    unmatched_ensembl = []

    for eid, e_name in enumerate(ensembl_names):
        e_base = _base_token(e_name)
        best_match = first_row_by_base.get(e_base) if e_base is not None else None

        # No candidate at all is always "unmatched", whatever the threshold:
        # a zero-score pairing with an arbitrary mirbase row is not a match.
        if best_match is None or exact_score < threshold:
            unmatched_ensembl.append(eid)
            continue

        matches.append({
            **ensembl_df.iloc[eid].to_dict(),
            **mirbase_df.iloc[best_match].to_dict(),
            'match_score': exact_score
        })

    matched = pd.DataFrame(matches) if matches else pd.DataFrame()
    unmatched_e = ensembl_df.iloc[unmatched_ensembl]

    if len(matched) > 0:
        matched_mirbase = set(matched['mirna_accession'])
        unmatched_m = mirbase_df[~mirbase_df['mirna_accession'].isin(matched_mirbase)]
    else:
        unmatched_m = mirbase_df

    return matched, unmatched_e, unmatched_m


In [None]:
def test_match_thresholds(ensembl_df, mirbase_df, thresholds=range(60, 100, 5)):
    """runs find_matching_mirnas once per threshold, prints a report, returns a summary df

    args:
        ensembl_df: ensembl mirna df
        mirbase_df: mirbase df
        thresholds: iterable of threshold values to try

    returns:
        df with one row per threshold: match / unmatched counts, plus
        avg_score, min_score and sample_matches for thresholds that produced
        at least one match
    """
    def _summarise(cutoff):
        # one matching run -> one flat record
        paired, lonely_ensembl, lonely_mirbase = find_matching_mirnas(ensembl_df, mirbase_df, cutoff)

        summary = {
            'threshold': cutoff,
            'matches': len(paired),
            'missing_ensembl': len(lonely_ensembl),
            'missing_mirbase': len(lonely_mirbase)
        }
        if len(paired) == 0:
            return summary

        scores = paired['match_score']
        summary['avg_score'] = scores.mean()
        summary['min_score'] = scores.min()
        # first few pairs found at this threshold
        summary['sample_matches'] = list(zip(paired['miRBase ID'].head(),
                                             paired['mirna_name'].head(),
                                             scores.head()))
        return summary

    summaries = [_summarise(cutoff) for cutoff in thresholds]

    # tabular form for easy viewing
    results_df = pd.DataFrame(summaries)

    # human-readable report
    print("Matching Results by Threshold:")
    print("-" * 50)
    for entry in summaries:
        print(f"\nThreshold: {entry['threshold']}%")
        print(f"Matches found: {entry['matches']}")
        print(f"Unmatched ensembl: {entry['missing_ensembl']}")
        print(f"Unmatched mirbase: {entry['missing_mirbase']}")

        # score details only exist when something matched
        if 'avg_score' not in entry:
            continue

        print(f"Average match score: {entry['avg_score']:.1f}")
        print("\nSample matches (mirbase_id, mirna_name, score):")
        for mirbase_id, mirna_name, score in entry['sample_matches'][:3]:
            print(f"  {mirbase_id} -> {mirna_name} ({score}%)")

    return results_df

# Sweep the default thresholds over the two loaded tables: this prints the
# per-threshold report and keeps the returned summary DataFrame in `results`.
results = test_match_thresholds(link, mirna_df)


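Test results show that 85 is a good threshold. (Note: the matcher only ever assigns a score of 0 or 100, so every threshold in the tested range yields the same set of matches.)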

In [22]:
# Use the threshold settled on above (85, which is also the function default)
# instead of an ad-hoc 80. Scores are only ever 0 or 100, so the resulting
# tables are the same; this just keeps the code and the narrative in agreement.
matches, missing_ensembl, missing_mirbase = find_matching_mirnas(link, mirna_df, threshold=85)


In [25]:
# Inspect the match table: each row holds the Ensembl columns, the paired
# miRBase columns, and the match_score.
matches

Unnamed: 0,Gene stable ID,Transcript stable ID,Gene type,Gene name,miRBase accession,miRBase ID,mirna_name,mirna_accession,sequence,mirna_family,conservation,match_score
0,ENSG00000252695,ENST00000516886,miRNA,MIR2276,MI0011282,hsa-mir-2276,hsa-miR-2276-3p,MIMAT0011775,CCTCGCCTCTGACACTTGCAGA,miR-2276-3p,-1.0,100
1,ENSG00000263399,ENST00000582699,miRNA,MIR3170,MI0014201,hsa-mir-3170,hsa-miR-3170,MIMAT0015045,ACTGTCTGTCTCAGAACCCCAG,miR-3170/6855-5p,-1.0,100
2,ENSG00000207719,ENST00000384986,miRNA,MIR623,MI0003637,hsa-mir-623,hsa-miR-623,MIMAT0003292,ACCCAACAGCCCCTGCAAGGGAT,miR-623,-1.0,100
3,ENSG00000263615,ENST00000583390,miRNA,MIR4306,MI0015836,hsa-mir-4306,hsa-miR-4306,MIMAT0016858,TACTGCCTTTCTCTCCA,miR-185-5p,1.0,100
4,ENSG00000265164,ENST00000581814,miRNA,MIR2681,MI0012062,hsa-mir-2681,hsa-miR-2681-5p,MIMAT0013515,AGTCTCCTGGAGGTGGTAAAAC,miR-2681-5p,0.0,100
...,...,...,...,...,...,...,...,...,...,...,...,...
1570,ENSG00000272671,ENST00000609547,miRNA,MIR1184-2,MI0015971,hsa-mir-1184-2,hsa-miR-1184,MIMAT0005829,GGAAGCCATCAAGTCGCTGCAGG,miR-1184,-1.0,100
1571,ENSG00000272671,ENST00000609547,miRNA,MIR1184-2,MI0006277,hsa-mir-1184-1,hsa-miR-1184,MIMAT0005829,GGAAGCCATCAAGTCGCTGCAGG,miR-1184,-1.0,100
1572,ENSG00000273438,ENST00000593571,miRNA,MIR1184-2,MI0015972,hsa-mir-1184-3,hsa-miR-1184,MIMAT0005829,GGAAGCCATCAAGTCGCTGCAGG,miR-1184,-1.0,100
1573,ENSG00000273438,ENST00000593571,miRNA,MIR1184-2,MI0015971,hsa-mir-1184-2,hsa-miR-1184,MIMAT0005829,GGAAGCCATCAAGTCGCTGCAGG,miR-1184,-1.0,100


In [26]:
# Side-by-side view of the paired names, highest score first.
matches[["mirna_name", "miRBase ID", "match_score"]].sort_values("match_score", ascending=False)

Unnamed: 0,mirna_name,miRBase ID,match_score
0,hsa-miR-2276-3p,hsa-mir-2276,100
1058,hsa-miR-770-5p,hsa-mir-770,100
1056,hsa-miR-3680-5p,hsa-mir-3680-2,100
1055,hsa-miR-5688,hsa-mir-5688,100
1054,hsa-miR-3652,hsa-mir-3652,100
...,...,...,...
523,hsa-miR-4776-5p,hsa-mir-4776-2,100
522,hsa-miR-383-5p,hsa-mir-383,100
521,hsa-miR-5700,hsa-mir-5700,100
520,hsa-miR-378e,hsa-mir-378e,100
