In [4]:
import pandas as pd

from icecream import ic
import numpy as np
from utils import import_fasta, sliding_window


In [5]:
sequence = import_fasta(
    "../sequences/tp53/Homo_sapiens_ENST00000420246_2_sequence.fa")

targetscan_df = pd.read_csv(
    "../data/supplementary_files/targetscan.tsv", sep="\t")

targetscan_df.head()

Unnamed: 0,name,sequence,seed,conservation,accession
0,hsa-let-7a-5p,AACUAUACAACCUACUACCUCA,UACCUC,2,MIMAT0000062
1,hsa-let-7b-5p,AACCACACAACCUACUACCUCA,UACCUC,2,MIMAT0000063
2,hsa-let-7c-5p,AACCAUACAACCUACUACCUCA,UACCUC,2,MIMAT0000064
3,hsa-let-7d-5p,AACUAUGCAACCUACUACCUCU,UACCUC,2,MIMAT0000065
4,hsa-let-7e-5p,AACUAUACAACCUCCUACCUCA,UACCUC,2,MIMAT0000066


In [20]:
def find_CLASH_type_V_matches(sequence, targetscan_df, minimum_matches=8, allow_wobbles=False):
    # sourcery skip: low-code-quality

    names = targetscan_df["name"].values.tolist()
    mirna_sequences = targetscan_df["sequence"].values.tolist()

    name_results = []
    position_results = []
    alignment_results = []
    bp_count_results = []
    
    if allow_wobbles:
        wobble_count_results = []

    # for each mirna
    for c, mirna_sequence in enumerate(mirna_sequences):

        # get the fluff ready
        name = names[c]
        mirna_length = len(mirna_sequence)
        generator = sliding_window(sequence, mirna_length)

        # for each window
        for i, window in enumerate(generator, start=1):

            # get the fluff ready
            match_string = ""
            bp_count = 0
            
            if allow_wobbles:
                wobble_count = 0

            # check nucleotides 1 by 1
            for j in range(mirna_length):

                if mirna_sequence[j] == window[j]:
                    match_string += str(1)
                    bp_count += 1
                    
                elif allow_wobbles and ((window[j] == "G" and mirna_sequence[j] == "U") or (window[j] == "U" and mirna_sequence[j] == "G")):
                    match_string += str(2)
                    wobble_count += 1
                    
                else:
                    match_string += str(0)

            name_results.append(name)
            position_results.append(i)
            alignment_results.append(match_string)
            bp_count_results.append(bp_count)
            
            if allow_wobbles:
                wobble_count_results.append(wobble_count)

    
    # returning final df
    
    if allow_wobbles:
        df = pd.DataFrame(
            {
                "name": name_results,
                "start": position_results,
                "alignment_string": alignment_results,
                "no_of_base_pairs": bp_count_results,
                "no_of_wobbles": wobble_count_results,
                "no_of_matches": [sum(x) for x in zip(bp_count_results, wobble_count_results)]
            }
        )
    
    else:
        df = pd.DataFrame(
            {
                "name": name_results,
                "start": position_results,
                "alignment_string": alignment_results,
                "no_of_base_pairs": bp_count_results
            }
        )

    df = df[df["no_of_matches"] >= minimum_matches]

    return df

In [27]:
asd = targetscan_df["sequence"]


lens = asd.apply(lambda x: len(x))

sum(lens)

56265

In [28]:
56265 / 2656

21.18411144578313

In [22]:
len(sequence)

1494

In [23]:
2656 * 1494


3968064

In [24]:
250000000 / 4000000

62.5

In [21]:
df = find_CLASH_type_V_matches(sequence, targetscan_df[:100], 8, allow_wobbles=True)

df


Unnamed: 0,name,start,alignment_string,no_of_base_pairs,no_of_wobbles,no_of_matches
3,hsa-let-7a-5p,4,0100110011002001000101,8,1,9
4,hsa-let-7a-5p,5,1010021011002000000200,5,3,8
5,hsa-let-7a-5p,6,0001001010001011100010,8,0,8
8,hsa-let-7a-5p,9,0000100000011102110100,7,1,8
11,hsa-let-7a-5p,12,1101020000002111000010,7,2,9
...,...,...,...,...,...,...
147335,hsa-miR-1243,1468,0002102001001102000012,5,4,9
147337,hsa-miR-1243,1470,0011010100021102010102,8,3,11
147338,hsa-miR-1243,1471,0002110000012201100022,5,5,10
147339,hsa-miR-1243,1472,1000100010021100001122,7,3,10


In [15]:
df.sort_values(by="no_of_matches", ascending=False)

Unnamed: 0,name,start,alignment_string,no_of_matches
121784,hsa-miR-1229-3p,965,11120111222112020120212,19
130121,hsa-miR-1234-3p,463,2222222221111201110011,19
121643,hsa-miR-1229-3p,824,12111110211011122021011,19
121278,hsa-miR-1229-3p,459,11122120221001111211100,18
121855,hsa-miR-1229-3p,1036,11212220222002122120202,18
...,...,...,...,...
41565,hsa-miR-103a-3p,310,01100101020001000010010,8
41567,hsa-miR-103a-3p,312,01021000102001100010000,8
91090,hsa-miR-3972,1224,0000012200010100211000,8
115083,hsa-miR-1227-3p,166,10011000000012011001,8


In [10]:
def find_occurence(df, kmer):

    kmer_column_name = f"{kmer}mer"
    string_to_find = "1" * kmer

    df[kmer_column_name] = df["alignment_string"].str.contains(
        string_to_find).astype(int)

    position_column_name = f"{kmer}mer_position_start"

    # Find the position of "111111" in each "alignment" string
    df[position_column_name] = df["alignment_string"].str[::-1].str.find(string_to_find) + 1

    return df

In [11]:
df = find_occurence(df, 8)
df = find_occurence(df, 9)
df = find_occurence(df, 10)


df


Unnamed: 0,name,start,alignment_string,no_of_matches,8mer,8mer_position_start,9mer,9mer_position_start,10mer,10mer_position_start
3,hsa-let-7a-5p,4,0100110011001001000101,9,0,0,0,0,0,0
4,hsa-let-7a-5p,5,1010011011001000000100,8,0,0,0,0,0,0
5,hsa-let-7a-5p,6,0001001010001011100010,8,0,0,0,0,0,0
8,hsa-let-7a-5p,9,0000100000011101110100,8,0,0,0,0,0,0
11,hsa-let-7a-5p,12,1101010000001111000010,9,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
147335,hsa-miR-1243,1468,0001101001001101000011,9,0,0,0,0,0,0
147337,hsa-miR-1243,1470,0011010100011101010101,11,0,0,0,0,0,0
147338,hsa-miR-1243,1471,0001110000011101100011,10,0,0,0,0,0,0
147339,hsa-miR-1243,1472,1000100010011100001111,10,0,0,0,0,0,0


In [160]:
df[df["10mer"] == 1]


Unnamed: 0,name,position,alignment,count,8mer,8mer_position,8mer_position_start,9mer,9mer_position_start,10mer,10mer_position_start
12471,hsa-miR-4458,688,0010100011111111110,12,1,2,2,1,2,1,2
51464,hsa-miR-7853-5p,1376,000000000111111111100,10,1,3,3,1,3,1,3
179211,hsa-miR-1252-3p,937,1111111111000010100100,13,1,13,13,1,13,1,13
237201,hsa-miR-1272,1460,00101010111111111101000010,15,1,9,9,1,9,1,9
237618,hsa-miR-1273a,408,1011000000011111111110100,14,1,5,5,1,5,1,5
...,...,...,...,...,...,...,...,...,...,...,...
3555529,hsa-miR-6885-3p,184,000111111111100011000,12,1,9,9,1,9,1,9
3573922,hsa-miR-6894-5p,886,011111111110000100001100,13,1,14,14,1,14,1,14
3663201,hsa-miR-7705,281,00001111111111100011001,14,1,9,9,1,9,1,9
3776522,hsa-miR-892b,166,0100010010111111111101,14,1,3,3,1,3,1,3


In [131]:
df.to_csv("../data/results/clash_class_5_results.csv", index=False)


In [132]:
df["alignment"][:10]


3     0100110011000001000101
5     0001001010001011100010
13    1000010010000001011101
17    0011110000000011000011
20    0100100000111001110000
23    0111010100001110000010
29    0011000001001011110101
32    0001101010011111000000
36    1000110101101000000001
41    0011100001101110100000
Name: alignment, dtype: object