In [1]:
%cd ../..
import pandas as pd
from scripts.features import *
pd.set_option('display.max_columns', None)


/run/media/nazif/2F946E411BA61D49/thesis


In [2]:
def generate_alignment_string_from_dot_bracket(df):

    full_strings = []

    for _, row in df.iterrows():

        start_string = (row.mirna_start - 1) * "0"
        mid_string = row["mirna_dot_bracket_5to3"].replace(
            ".", "0").replace(")", "1")
        end_string = (len(row.mirna_sequence) - row.mirna_end) * "0"

        full_string = start_string + mid_string + end_string

        full_strings.append(full_string)

    df["alignment_string"] = full_strings

    return df

def generate_match_count_columns(df):

    def count_ones(str, seed=False):
        return str[1:7].count("1") if seed else str.count("1")

    df["pred_num_basepairs"] = df["alignment_string"].apply(count_ones)

    df["pred_seed_basepairs"] = df["alignment_string"].apply(
        count_ones, seed=True)

    return df

In [3]:
positive_df = pd.read_csv("results/positive_df_results.csv")
positive_df["label"] = 1


negative_df = pd.read_csv("results/negative_df_results.csv")
negative_df["label"] = 0


df = pd.concat([positive_df, negative_df], ignore_index=True)
df = generate_alignment_string_from_dot_bracket(df)
df = generate_match_count_columns(df)

df.to_csv("results/merged_df_results.csv", index=False)

df.head()

Unnamed: 0,mrna_start,mrna_end,pred_energy,mirna_start,mirna_end,mirna_dot_bracket_5to3,mirna_sequence,mirna_accession,mre_region,enst,label,alignment_string,pred_num_basepairs,pred_seed_basepairs
0,23,46,-27.7,1,21,)))))))))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TTTATACAGAACCTACTGCCTC,ENST00000340828,1,1111111111111111111100,20,6
1,12,31,-26.2,1,19,))))))))).)))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CGTGCAACCAACTACCTCATAT,ENST00000343455,1,1111111110111111110000,17,6
2,4,25,-23.6,1,22,.)))))).))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CGCTATATGACCTGATGCCTTT,ENST00000436639,1,0111111011111111111110,19,6
3,4,25,-23.8,1,22,)))))))...)))))))))))),TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TGACTATGCAACCATACCTTAC,ENST00000282516,1,1111111000111111111111,19,6
4,3,25,-24.0,1,22,))))))))))))))...)))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TACTGGGAAACCTGCTACTTCG,ENST00000340139,1,1111111111111100011110,18,6


In [4]:
# Compare the lengths of strings in 'col1' and 'col2' columns of dataframe 'df'
df['length_diff'] = df.apply(lambda x: len(x['mirna_sequence']) - len(x['mre_region']), axis=1)


In [5]:
df.length_diff.value_counts()

length_diff
 0     18428
 1      5134
 2      2892
-1      2075
 3      1810
 4       637
-2       379
 5       322
 6       201
 7        87
-3        74
 8        73
 9        58
 10       24
 11       13
-5         5
 15        3
 14        3
 13        3
 12        2
-4         1
 16        1
Name: count, dtype: int64

In [6]:
df["alignment_string"] = df["alignment_string"].apply(lambda x: x[::-1])
df.head()


Unnamed: 0,mrna_start,mrna_end,pred_energy,mirna_start,mirna_end,mirna_dot_bracket_5to3,mirna_sequence,mirna_accession,mre_region,enst,label,alignment_string,pred_num_basepairs,pred_seed_basepairs,length_diff
0,23,46,-27.7,1,21,)))))))))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TTTATACAGAACCTACTGCCTC,ENST00000340828,1,0011111111111111111111,20,6,0
1,12,31,-26.2,1,19,))))))))).)))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CGTGCAACCAACTACCTCATAT,ENST00000343455,1,0000111111110111111111,17,6,0
2,4,25,-23.6,1,22,.)))))).))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CGCTATATGACCTGATGCCTTT,ENST00000436639,1,0111111111111101111110,19,6,0
3,4,25,-23.8,1,22,)))))))...)))))))))))),TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TGACTATGCAACCATACCTTAC,ENST00000282516,1,1111111111110001111111,19,6,0
4,3,25,-24.0,1,22,))))))))))))))...)))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TACTGGGAAACCTGCTACTTCG,ENST00000340139,1,0111100011111111111111,18,6,0


In [7]:

df = find_clash_types(df, find_noncanonical=True, drop_flag_column=False)
df.flag_column.isna().sum()

24006

In [11]:
len(df)

32225

In [14]:
df[df["pred_type_5"] == 1]

Unnamed: 0,mrna_start,mrna_end,pred_energy,mirna_start,mirna_end,mirna_dot_bracket_5to3,mirna_sequence,mirna_accession,mre_region,enst,label,alignment_string,pred_num_basepairs,pred_seed_basepairs,length_diff,flag_column,pred_8mer,pred_7mer-a1,pred_7mer-m8,pred_type_2,pred_type_3,pred_type_4,pred_type_5,pred_compensatory,pred_seed_with_1_mismatch,pred_centered_site
2,4,25,-23.6,1,22,.)))))).))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CGCTATATGACCTGATGCCTTT,ENST00000436639,1,0111111111111101111110,19,6,0,,0,0,0,0,0,0,1,0,0,0
23,21,39,-20.8,3,21,.))))))))))).))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TCTTTATAAAACCTACTACCAC,ENST00000297145,1,0011111011111111111000,16,4,0,1,0,0,0,0,0,0,1,0,0,1
29,11,28,-20.3,2,18,.))))))))))).))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CCTACTTACCTACTACCTGCGA,ENST00000267970,1,0000011101111111111100,14,5,0,1,0,0,0,0,0,0,1,0,1,0
31,12,30,-21.9,1,19,))))).))))))).)))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CGTGCCGCCTGCTCCTCACTGG,ENST00000311487,1,0000111101111111011111,16,5,0,,0,0,0,0,0,0,1,0,0,0
35,1,23,-22.4,1,22,.))))).))))))).))).))),TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,AAACCATGGGACCTGCCACCTC,ENST00000306773,1,1110111011111110111110,18,5,0,1,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32192,3,22,-9.4,7,24,.))).))...))).)))),AAAGACATAGGATAGAGTCACCTC,MIMAT0003311,ATGGGGGGAGGCAAAGGTGCTA,ENST00000300151,0,111101110001101110000000,12,0,2,,0,0,0,0,0,0,1,0,0,0
32208,4,22,-7.2,6,22,.))))..)))).))))),CAAAACTGGCAATTACTTTTGC,MIMAT0003251,AGCTGCAGGCCGTGGAGGTCGT,ENST00000396593,0,1111101111001111000000,13,1,0,,0,0,0,0,0,0,1,0,0,0
32210,1,15,-6.6,1,18,))))...)))..))))).,CAAAACTGGCAATTACTTTTGC,MIMAT0003251,GGTAGGGCTCTTTGAGGACACA,ENST00000244661,0,0000011111001110001111,12,3,0,,0,0,0,0,0,0,1,0,0,0
32211,1,15,-7.5,1,18,))))..))))..))))).,CAAAACTGGCAATTACTTTTGC,MIMAT0003251,GGTAGGGCTATTTGAGGACACT,ENST00000357647,0,0000011111001111001111,13,4,0,,0,0,0,0,0,0,1,0,0,0


In [12]:
df.head()

Unnamed: 0,mrna_start,mrna_end,pred_energy,mirna_start,mirna_end,mirna_dot_bracket_5to3,mirna_sequence,mirna_accession,mre_region,enst,label,alignment_string,pred_num_basepairs,pred_seed_basepairs,length_diff,flag_column,pred_8mer,pred_7mer-a1,pred_7mer-m8,pred_type_2,pred_type_3,pred_type_4,pred_type_5,pred_compensatory,pred_seed_with_1_mismatch,pred_centered_site
0,23,46,-27.7,1,21,)))))))))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TTTATACAGAACCTACTGCCTC,ENST00000340828,1,0011111111111111111111,20,6,0,1.0,1,0,0,0,0,0,0,0,0,0
1,12,31,-26.2,1,19,))))))))).)))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CGTGCAACCAACTACCTCATAT,ENST00000343455,1,0000111111110111111111,17,6,0,1.0,1,0,0,0,0,0,0,0,0,0
2,4,25,-23.6,1,22,.)))))).))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CGCTATATGACCTGATGCCTTT,ENST00000436639,1,0111111111111101111110,19,6,0,,0,0,0,0,0,0,1,0,0,0
3,4,25,-23.8,1,22,)))))))...)))))))))))),TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TGACTATGCAACCATACCTTAC,ENST00000282516,1,1111111111110001111111,19,6,0,1.0,0,1,0,0,0,0,0,0,0,0
4,3,25,-24.0,1,22,))))))))))))))...)))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TACTGGGAAACCTGCTACTTCG,ENST00000340139,1,0111100011111111111111,18,6,0,1.0,1,0,0,0,0,0,0,0,0,0


In [9]:
df[df["flag_column"] == None]

Unnamed: 0,mrna_start,mrna_end,pred_energy,mirna_start,mirna_end,mirna_dot_bracket_5to3,mirna_sequence,mirna_accession,mre_region,enst,label,alignment_string,pred_num_basepairs,pred_seed_basepairs,length_diff,flag_column,pred_8mer,pred_7mer-a1,pred_7mer-m8,pred_type_2,pred_type_3,pred_type_4,pred_type_5,pred_compensatory,pred_seed_with_1_mismatch,pred_centered_site


# previous seed match determining heuristics with regex is clearly not working. it couldn't designate seed types for 75% of the data.