In [1]:
import pandas as pd
import numpy as np

from utils_v2_for_jupyter import *
from features import *

In [8]:
def slide_and_compare(mirna_seq, mrna_seq, start_pos, min_matches):
    """
    Slide a miRNA-length window along an mRNA sequence and report the windows
    that match the reverse-complemented miRNA in enough positions.

    The miRNA is first passed through ``reverse_complement``; the result is then
    compared character by character against every window of the same length in
    ``mrna_seq``. A window is retained when at least ``min_matches`` positions
    are identical.

    Args:
        mirna_seq (str): sequence of miRNA
        mrna_seq (str): sequence of mRNA
        start_pos (int): coordinate offset of the mRNA sequence; the window at
            offset ``i`` of ``mrna_seq`` is reported with start ``start_pos + i + 1``
        min_matches (int): minimum number of matching positions required

    Returns:
        Tuple of five NumPy arrays with one entry per retained window:
        - starts (int): starting positions of matching windows
        - ends (int): ending positions (``start + len(mirna_seq) - 1``)
        - alignments (str): one "1" (match) or "0" (mismatch) per position
        - matches (int): number of matching positions in the window
        - mrna_sequences (str): the mRNA text of the window

        All five arrays are empty when no window qualifies, including the case
        where ``mrna_seq`` is shorter than ``mirna_seq`` (no window fits).
    """

    # Reverse complement miRNA sequence
    mirna_seq = reverse_complement(mirna_seq)

    window_len = len(mirna_seq)
    n_windows = len(mrna_seq) - window_len + 1
    # Fixed-width unicode dtype so that empty results keep a string dtype.
    str_dtype = f"<U{max(window_len, 1)}"

    # No window fits: return empty results instead of letting
    # sliding_window_view raise ValueError.
    if n_windows <= 0:
        return (
            np.array([], dtype=int),
            np.array([], dtype=int),
            np.array([], dtype=str_dtype),
            np.array([], dtype=int),
            np.array([], dtype=str_dtype),
        )

    # Convert sequences to per-character arrays and view every window at once
    mirna_arr = np.array(list(mirna_seq))
    mrna_arr = np.array(list(mrna_seq))
    windows = np.lib.stride_tricks.sliding_window_view(mrna_arr, window_len)

    # Boolean (n_windows, window_len) matrix, computed once and reused below
    hits = windows == mirna_arr
    match_counts = np.sum(hits, axis=1)

    # Filter first, so strings are only built for the windows that are returned
    keep = match_counts >= min_matches
    offsets = np.arange(n_windows)[keep]

    # NOTE(review): the "+ 1" places the first window one past start_pos --
    # confirm this agrees with the coordinate convention of the input data.
    starts = start_pos + offsets + 1
    ends = starts + window_len - 1

    alignments = np.array(
        ["".join(flags) for flags in np.where(hits[keep], "1", "0")],
        dtype=str_dtype,
    )
    mrna_seqs = np.array(
        [mrna_seq[offset:offset + window_len] for offset in offsets],
        dtype=str_dtype,
    )

    return starts, ends, alignments, match_counts[keep], mrna_seqs



def find_clash_matches(df, min_matches=7):
    """
    Find all matches between miRNA and mRNA sequences in a given DataFrame.

    Every row of ``df`` is passed to ``slide_and_compare``; each window it
    returns becomes one row of the result, annotated with the identifiers of
    the input row it came from.

    Args:
        df (pandas.DataFrame): A DataFrame with the columns ``mirna_sequence``,
            ``mrna_sequence``, ``start``, ``accession``, ``ENST``, ``seed_type``
            and ``folding_class``.
        min_matches (int): Minimum number of matching positions a window needs
            in order to be reported. Defaults to 7.

    Returns:
        pandas.DataFrame: One row per reported window, with the columns
        ``enst``, ``start``, ``end``, ``accession``, ``no_of_base_pairs``,
        ``alignment_string``, ``mirna_sequence``, ``mrna_sequence``,
        ``seed_types`` and ``folding_class`` (in that order).
    """
    # Insertion order of this dict defines the column order of the result.
    collected = {
        "enst": [],
        "start": [],
        "end": [],
        "accession": [],
        "no_of_base_pairs": [],
        "alignment_string": [],
        "mirna_sequence": [],
        "mrna_sequence": [],
        "seed_types": [],
        "folding_class": [],
    }

    for _, row in df.iterrows():
        win_starts, win_ends, win_alignments, win_matches, win_seqs = slide_and_compare(
            row.mirna_sequence, row.mrna_sequence, row.start, min_matches
        )
        n_windows = len(win_starts)

        # Per-window values returned by slide_and_compare
        collected["start"].extend(win_starts.tolist())
        collected["end"].extend(win_ends.tolist())
        collected["alignment_string"].extend(win_alignments)
        collected["no_of_base_pairs"].extend(win_matches.tolist())
        collected["mrna_sequence"].extend(win_seqs)

        # Per-row annotations, repeated once for every window of this row
        collected["mirna_sequence"].extend([row.mirna_sequence] * n_windows)
        collected["accession"].extend([row.accession] * n_windows)
        collected["enst"].extend([row.ENST] * n_windows)
        collected["seed_types"].extend([row.seed_type] * n_windows)
        collected["folding_class"].extend([row.folding_class] * n_windows)

    return pd.DataFrame(collected)

In [9]:
# Load the CLASH table (drop_irrelevant_columns=False, so presumably no columns
# are discarded -- the loader itself is not shown in this notebook) and keep only
# the rows where len(mrna_sequence) >= len(mirna_sequence): a miRNA-length window
# cannot be placed on an mRNA fragment shorter than the miRNA.
clash_df = import_clash_df(drop_irrelevant_columns=False).loc[
    lambda frame: frame["mrna_sequence"].str.len() >= frame["mirna_sequence"].str.len()
]

# Scan every remaining miRNA/mRNA pair and persist the match table.
# NOTE: to_csv writes the row index as an unnamed leading column by default.
results = find_clash_matches(clash_df)
results.to_csv("clash_results.csv")

In [10]:
# Rebind `results` to the output of generate_ohe_CLASH_type_column, which is not
# defined in the visible cells (presumably it comes from one of the star-imports).
# Presumably this appends the one-hot site-type columns (e.g. CLASH_III) that the
# following cell filters on -- TODO confirm against the helper.
# NOTE(review): the cell feeds `results` back into itself, so running it twice
# applies the helper to already-processed data.
results = generate_ohe_CLASH_type_column(results, find_non_CLASH_types=True)

In [19]:
# Display the rows whose folding_class is "III" while the CLASH_III flag is 0.
results.loc[results["folding_class"].eq("III") & results["CLASH_III"].eq(0)]


Unnamed: 0,enst,start,end,accession,no_of_base_pairs,alignment_string,mirna_sequence,mrna_sequence,seed_types,folding_class,8mer,7mer-a1,7mer-m8,CLASH_II,CLASH_III,CLASH_IV,CLASH_V,compensatory,seed_with_1_mismatch,centered_site
0,ENST00000340828,1792,1813,MIMAT0000062,7,1001011000010100100000,TGAGGTAGTAGGTTGTATAGTT,ATTTGTATCTACGATAAAAATT,noncanonical_seed,III,0,0,0,0,0,0,0,0,0,0
1,ENST00000340828,1794,1815,MIMAT0000062,8,0001110010001100100100,TGAGGTAGTAGGTTGTATAGTT,TTGTATCTACGATAAAAATTTT,noncanonical_seed,III,0,0,0,0,0,0,0,0,0,0
2,ENST00000340828,1796,1817,MIMAT0000062,8,0001011101000100000101,TGAGGTAGTAGGTTGTATAGTT,GTATCTACGATAAAAATTTTTA,noncanonical_seed,III,0,0,0,0,0,0,0,0,0,0
3,ENST00000340828,1798,1819,MIMAT0000062,8,1011100001000101000001,TGAGGTAGTAGGTTGTATAGTT,ATCTACGATAAAAATTTTTATA,noncanonical_seed,III,0,0,0,0,0,0,0,0,0,0
4,ENST00000340828,1801,1822,MIMAT0000062,10,0110111011001001100000,TGAGGTAGTAGGTTGTATAGTT,TACGATAAAAATTTTTATACAG,noncanonical_seed,III,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220149,ENST00000257857,629,651,MIMAT0018359,7,10010100010010000011000,TAGCCCCCAGGCTTCACTTGGCG,CTGCTAACTACACAGATTGGGAG,6-mer,III,0,0,0,0,0,0,0,0,0,0
220150,ENST00000257857,630,652,MIMAT0018359,10,01101101001000000111001,TAGCCCCCAGGCTTCACTTGGCG,TGCTAACTACACAGATTGGGAGA,6-mer,III,0,0,0,0,0,0,0,0,0,0
220151,ENST00000257857,631,653,MIMAT0018359,7,00001000010000101110001,TAGCCCCCAGGCTTCACTTGGCG,GCTAACTACACAGATTGGGAGAA,6-mer,III,0,0,0,0,0,0,0,0,0,0
220152,ENST00000257857,632,654,MIMAT0018359,9,10000000001100111101001,TAGCCCCCAGGCTTCACTTGGCG,CTAACTACACAGATTGGGAGAAA,6-mer,III,0,0,0,0,0,0,0,0,0,0
