In [2]:
import pandas as pd
import numpy as np

from scripts.utils_v2_for_jupyter import *
from features import *

ImportError: attempted relative import with no known parent package

In [1]:
def slide_and_compare(mirna_seq, mrna_seq, start_pos, min_matches):
    """
    Slide a window of length len(mirna_seq) along mrna_seq and compare every
    window, position by position, with the reverse complement of mirna_seq.
    Windows with at least min_matches identical positions are reported.

    Args:
        mirna_seq (str): sequence of miRNA
        mrna_seq (str): sequence of mRNA
        start_pos (int): starting position of the mRNA sequence; the window at
            offset i is reported with start = start_pos + i + 1
            (NOTE(review): the "+ 1" presumably converts a 0-based start_pos
            to 1-based coordinates -- confirm against the data source)
        min_matches (int): minimum number of matches required

    Returns:
        Tuple of arrays, one entry per retained window, in mRNA order:
        - starts (int): starting positions of matching windows
        - ends (int): ending positions, i.e. starts + len(mirna_seq) - 1
        - alignments (str): one "1"/"0" flag per window position, "1" where
          the mRNA base equals the reverse-complemented miRNA base
        - matches (int): number of matches for each matching window
        - mrna_sequences (str): matching window sequences

        All five arrays are empty when mrna_seq is shorter than mirna_seq,
        because no full-length window exists in that case.
    """

    # Reverse complement the miRNA so that complementary bases show up as
    # plain character equality against the mRNA.
    mirna_arr = np.array(list(reverse_complement(mirna_seq)))
    mrna_arr = np.array(list(mrna_seq))
    window_len = len(mirna_arr)

    # sliding_window_view raises ValueError when the window is longer than the
    # array; there is simply nothing to report then, so return empty results
    # instead of forcing every caller to pre-filter its input.
    if len(mrna_arr) < window_len:
        no_ints = np.array([], dtype=int)
        no_strs = np.array([], dtype=str)
        return no_ints, no_ints.copy(), no_strs, no_ints.copy(), no_strs.copy()

    # Row i of `windows` is mrna_arr[i:i + window_len] (a view, no copy).
    windows = np.lib.stride_tricks.sliding_window_view(mrna_arr, window_len)

    # Position-wise equality and the per-window match count.
    is_match = windows == mirna_arr
    match_counts = is_match.sum(axis=1)

    # Single boolean mask of the windows that reach the threshold.
    keep = match_counts >= min_matches

    starts = (start_pos + np.arange(len(windows)) + 1)[keep]
    ends = starts + window_len - 1

    # Build the strings only for the retained windows; the explicit dtype keeps
    # a fixed-width unicode array even when nothing is retained.
    str_dtype = f"<U{window_len}"
    alignments = np.array(
        ["".join(flags) for flags in np.where(is_match[keep], "1", "0")],
        dtype=str_dtype,
    )
    mrna_seqs = np.array(
        ["".join(window) for window in windows[keep]],
        dtype=str_dtype,
    )

    return starts, ends, alignments, match_counts[keep], mrna_seqs




def find_clash_matches(df, min_matches=7):
    """
    Run slide_and_compare() on every row of df and gather all retained
    windows into a single long-format DataFrame.

    Args:
        df (pd.DataFrame): table providing, per row, the columns
            mirna_sequence, mrna_sequence, start, accession, ENST, seed_type,
            folding_class and num_basepairs.
        min_matches (int): minimum number of matching positions a window needs
            in order to be kept; forwarded to slide_and_compare(). Defaults to
            7, the threshold that used to be hard-coded here.

    Returns:
        pd.DataFrame: one row per retained window with the columns enst,
        start, end, accession, no_of_base_pairs, alignment_string,
        mirna_sequence, mrna_sequence, seed_type, folding_class and
        num_basepairs. start, end, no_of_base_pairs, alignment_string and
        mrna_sequence describe the window; the remaining columns repeat the
        values of the input row the window came from.
    """
    # Keys are declared in the output column order (dicts keep insertion order).
    collected = {
        "enst": [],
        "start": [],
        "end": [],
        "accession": [],
        "no_of_base_pairs": [],
        "alignment_string": [],
        "mirna_sequence": [],
        "mrna_sequence": [],
        "seed_type": [],
        "folding_class": [],
        "num_basepairs": [],
    }

    # Output column -> input column for the values copied from the source row.
    row_level_fields = {
        "enst": "ENST",
        "accession": "accession",
        "mirna_sequence": "mirna_sequence",
        "seed_type": "seed_type",
        "folding_class": "folding_class",
        "num_basepairs": "num_basepairs",
    }

    for _, row in df.iterrows():
        starts, ends, alignments, match_counts, window_seqs = slide_and_compare(
            row["mirna_sequence"], row["mrna_sequence"], row["start"], min_matches
        )

        # Per-window values returned by slide_and_compare()
        collected["start"].extend(starts.tolist())
        collected["end"].extend(ends.tolist())
        collected["alignment_string"].extend(alignments)
        collected["no_of_base_pairs"].extend(match_counts.tolist())
        collected["mrna_sequence"].extend(window_seqs)

        # Row-level values, repeated once per retained window so that all
        # columns stay the same length.
        n_windows = len(starts)
        for out_name, in_name in row_level_fields.items():
            collected[out_name].extend([row[in_name]] * n_windows)

    return pd.DataFrame(collected)

In [4]:
# Load the CLASH table, asking the loader not to drop any columns.
clash_df = import_clash_df(drop_irrelevant_columns=False)

# Rows whose mRNA is shorter than the miRNA cannot contain a full-length
# window, so drop them up front.
clash_df = clash_df.loc[
    lambda frame: frame["mrna_sequence"].str.len() >= frame["mirna_sequence"].str.len()
]

# Scan every remaining interaction for candidate windows.
results = find_clash_matches(clash_df)

# Persist the windows to disk.
results.to_csv("clash_results.csv")

In [12]:
# Pass the window table through generate_ohe_CLASH_type_column() and rebind
# `results` to what it returns.
# NOTE(review): presumably this adds the one-hot site-type columns (8mer,
# 7mer-a1, CLASH_II, ...) seen in later outputs -- confirm against the helper.
# Because `results` is overwritten, re-running this cell feeds the helper its
# own previous output.
results = generate_ohe_CLASH_type_column(results, find_non_CLASH_types=True)

In [6]:
# Windows whose computed match count equals the num_basepairs value carried
# over from the source row.
results.loc[results["no_of_base_pairs"] == results["num_basepairs"]]

Unnamed: 0,enst,start,end,accession,no_of_base_pairs,alignment_string,mirna_sequence,mrna_sequence,seed_types,folding_class,num_basepairs
46,ENST00000343455,3866,3887,MIMAT0000062,17,1010010111110111111111,TGAGGTAGTAGGTTGTATAGTT,ACCCGTGCAACCAACTACCTCA,9-mer,II,17
208,ENST00000294256,150,171,MIMAT0000062,13,0000010000111111111111,TGAGGTAGTAGGTTGTATAGTT,GTGGCTCACGCCTACTACCTCA,9-mer,I,13
337,ENST00000399878,1832,1853,MIMAT0000062,14,0100100110111111111100,TGAGGTAGTAGGTTGTATAGTT,CATAAAGCATCCTACTACCTTG,7-mer,I,14
413,ENST00000434452,5448,5469,MIMAT0000062,13,0000001011111011111110,TGAGGTAGTAGGTTGTATAGTT,CCGATCAAAACCTGCTACCTCC,7-mer,I,13
414,ENST00000434452,5451,5472,MIMAT0000062,13,1010101100011110011011,TGAGGTAGTAGGTTGTATAGTT,ATCAAAACCTGCTACCTCCCCA,7-mer,I,13
...,...,...,...,...,...,...,...,...,...,...,...
219929,ENST00000369026,224,245,MIMAT0018105,8,1001000010010100101001,CTTCCCCCCAGTAATCTTCATC,GCGGTGATTGGCGGAAGCGCCG,6-mer,V,8
219931,ENST00000369026,230,251,MIMAT0018105,8,0011001000010001101010,CTTCCCCCCAGTAATCTTCATC,ATTGGCGGAAGCGCCGGCGCAA,6-mer,V,8
219942,ENST00000299505,4181,4204,MIMAT0018115,11,100011101010101001000110,CCCGGACAGGCGTTCGTGCGACGT,AGTGCGCCCCATCCCAGGGAGGGT,none,V,11
220022,ENST00000357647,326,347,MIMAT0018183,12,1001110001110100101011,TGTCCTCTAGGGCCTGCAGTCT,AACCTGTGCGCCATCCACGCCA,7-mer,I,12


In [8]:
# Show the column labels available on clash_df (find_clash_matches() reads a
# subset of these).
clash_df.columns

Index(['seq_ID', 'miRNA_start', 'miRNA_end', 'mirna_sequence', 'start', 'end',
       'mrna_sequence', 'chimeras_decompressed', 'experiments',
       'experiments_list', 'microRNA_first', 'two_way_merged', 'seed_type',
       'num_basepairs', 'seed_basepairs', 'folding_energy', '5'UTR', 'CDS',
       '3'UTR', 'folding_class', 'conservation_score',
       'log2_target_enrichment', 'CLASH_single_reads_ovlp',
       'CLASH_cluster_ovlp', 'PAR_CLIP_cluster_ovlp', 'accession',
       'mirna_name', 'ENSG', 'ENST', 'gene_name'],
      dtype='object')

In [13]:
# Windows that come from interactions annotated with a 9-mer seed.
# find_clash_matches() writes this column as "seed_type" (singular); the
# former `results.seed_types` lookup fails on a freshly computed `results`.
results.loc[results["seed_type"] == "9-mer"]

Unnamed: 0,enst,start,end,accession,no_of_base_pairs,alignment_string,mirna_sequence,mrna_sequence,seed_types,folding_class,...,8mer,7mer-a1,7mer-m8,CLASH_II,CLASH_III,CLASH_IV,CLASH_V,compensatory,seed_with_1_mismatch,centered_site
42,ENST00000343455,3858,3879,MIMAT0000062,9,0100101010110000101001,TGAGGTAGTAGGTTGTATAGTT,CAGGAAATACCCGTGCAACCAA,9-mer,II,...,0,0,0,0,0,0,0,0,0,0
43,ENST00000343455,3859,3880,MIMAT0000062,8,1000100000101010111000,TGAGGTAGTAGGTTGTATAGTT,AGGAAATACCCGTGCAACCAAC,9-mer,II,...,0,0,0,0,0,0,0,0,0,0
44,ENST00000343455,3862,3883,MIMAT0000062,11,1101100100010110101100,TGAGGTAGTAGGTTGTATAGTT,AAATACCCGTGCAACCAACTAC,9-mer,II,...,0,0,0,0,0,0,1,0,0,0
45,ENST00000343455,3863,3884,MIMAT0000062,7,1100000000100010110010,TGAGGTAGTAGGTTGTATAGTT,AATACCCGTGCAACCAACTACC,9-mer,II,...,0,0,0,0,0,0,0,0,0,0
46,ENST00000343455,3866,3887,MIMAT0000062,17,1010010111110111111111,TGAGGTAGTAGGTTGTATAGTT,ACCCGTGCAACCAACTACCTCA,9-mer,II,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219202,ENST00000222005,297,318,MIMAT0015085,8,0000010111100100101000,CACCTTGCGCTACTCAGGTCTG,GGTGGCCGAGTGCCAGAGGAAA,9-mer,I,...,0,0,0,0,0,0,0,0,0,0
219203,ENST00000222005,300,321,MIMAT0015085,7,0000000000011010110011,CACCTTGCGCTACTCAGGTCTG,GGCCGAGTGCCAGAGGAAACTG,9-mer,I,...,0,0,0,0,0,0,0,0,0,0
219204,ENST00000222005,302,323,MIMAT0015085,7,1011000000011000100100,CACCTTGCGCTACTCAGGTCTG,CCGAGTGCCAGAGGAAACTGAA,9-mer,I,...,0,0,0,0,0,0,0,0,0,0
219205,ENST00000222005,305,326,MIMAT0015085,8,0000110111010000010100,CACCTTGCGCTACTCAGGTCTG,AGTGCCAGAGGAAACTGAAGGA,9-mer,I,...,0,0,0,0,0,0,0,0,0,0
