In [1]:
# Move the working directory one level up so that the relative paths used in
# the cells below ("results/...", "data/...") resolve from there.
# NOTE(review): `%cd ..` is relative to the *current* directory, so running
# this cell a second time without restarting the kernel moves up again.
%cd ..

import pandas as pd
# NOTE(review): star import; `generate_close_proximity_column`, used further
# down, is not defined in this notebook and presumably comes from here — confirm.
from scripts.utils_latest import *
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

/run/media/nazif/2F946E411BA61D49/thesis


In [2]:
def add_ta_sps_columns(df, ta_sps_path="data/processed/ta_sps/ta_sps.csv"):
    """Attach the ``ta_log10`` and ``sps_mean`` columns to ``df`` via a seed lookup.

    The lookup key is the slice ``[1:8]`` of ``mirna_sequence`` with every
    ``T`` replaced by ``U``. It is matched against the ``seed_8mer`` column of
    the TA/SPS table with a left join, so rows whose key is not in the table
    are kept and receive NaN in both new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``mirna_sequence`` column. It is left untouched: the
        temporary join key is built on a copy, so no stray ``seed`` column
        leaks into the caller's frame.
    ta_sps_path : str or path-like, optional
        CSV file providing the ``seed_8mer``, ``ta_log10`` and ``sps_mean``
        columns. Defaults to the location used by the project.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``df`` followed by ``ta_log10`` and
        ``sps_mean``.
    """
    # Only the three needed columns are read; the key column is renamed so it
    # lines up with the temporary key created below.
    ta_sps_df = pd.read_csv(
        ta_sps_path, usecols=["seed_8mer", "ta_log10", "sps_mean"]
    ).rename(columns={"seed_8mer": "seed"})

    # `assign` returns a copy, keeping the temporary key out of the input frame.
    keyed = df.assign(seed=df["mirna_sequence"].str[1:8].str.replace("T", "U"))

    # NOTE(review): a left merge duplicates rows if the table lists a seed more
    # than once — presumably seeds are unique there; confirm.
    merged = keyed.merge(ta_sps_df, on="seed", how="left")

    # The key was only needed for the join.
    return merged.drop(columns=["seed"])

def add_mirna_conservation_column(df, targetscan_path="data/processed/targetscan/targetscan.csv"):
    """Attach a ``mirna_conservation`` column to ``df`` from the TargetScan table.

    The table's ``accession`` and ``conservation`` columns are renamed to
    ``mirna_accession`` and ``mirna_conservation``; every other column is
    discarded before a left join on ``mirna_accession``. Rows of ``df`` with
    no matching accession are kept and receive NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``mirna_accession`` column. It is not modified.
    targetscan_path : str or path-like, optional
        CSV file providing at least ``accession`` and ``conservation``.
        Defaults to the location used by the project.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``df`` followed by
        ``mirna_conservation``.
    """
    lookup = (
        pd.read_csv(targetscan_path)
        .rename(columns={"accession": "mirna_accession", "conservation": "mirna_conservation"})
        # Keep only the join key and the value so no unrelated TargetScan
        # column ends up in the result.
        [["mirna_accession", "mirna_conservation"]]
    )
    # NOTE(review): a left merge duplicates rows if an accession appears more
    # than once in the table — presumably accessions are unique; confirm.
    return df.merge(lookup, on="mirna_accession", how="left")

def find_seed_type(df):
    """Add 0/1 site-feature columns derived from ``alignment_string`` and ``mre_region``.

    ``alignment_string`` is inspected character by character: the code counts
    ``"1"`` and ``"0"`` characters inside fixed index windows (presumably
    ``1`` = paired and ``0`` = unpaired, with index ``i`` standing for miRNA
    nucleotide ``i + 1`` — confirm against the code that produces the string).

    Columns written (all ``int`` 0/1):

    * ``anchor_a`` – last character of ``mre_region`` is ``"A"``.
    * ``6mer_seed`` – window ``[1:7]`` contains no ``"0"``.
    * ``match_8`` – character at index 7 is ``"1"``.
    * ``6mer_seed_1_mismatch`` – window ``[1:7]`` contains exactly one ``"0"``.
    * ``compensatory_site`` – window ``[12:17]`` contains no ``"0"``.
    * ``supplementary_site`` – window ``[12:16]`` contains no ``"0"``.
    * ``supplementary_site_2`` – window ``[16:21]`` contains no ``"0"``.
    * ``empty_seed`` – window ``[1:8]`` contains no ``"1"``.
    * ``9_consecutive_match_anywhere`` – a run of at least nine ``"1"``
      characters occurs anywhere in the string.

    A "contains no 0" window only counts when the alignment string is long
    enough to cover the whole window. Without that check a short string gives
    an empty or truncated slice with zero ``"0"`` characters and would be
    reported as a fully matched site. Missing alignment strings yield 0 in
    every column instead of raising in the final ``astype(int)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns ``alignment_string`` and ``mre_region``.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the columns above added.
    """
    alignment = df["alignment_string"]

    def _window_fully_matched(start, stop):
        # True where the window is complete and free of "0" characters.
        window = alignment.str[start:stop]
        return (window.str.len() == stop - start) & (window.str.count("0") == 0)

    df["anchor_a"] = (df["mre_region"].str[-1] == "A").astype(int)
    df["6mer_seed"] = _window_fully_matched(1, 7).astype(int)
    df["match_8"] = (alignment.str[7] == "1").astype(int)
    df["6mer_seed_1_mismatch"] = (alignment.str[1:7].str.count("0") == 1).astype(int)

    df["compensatory_site"] = _window_fully_matched(12, 17).astype(int)

    df["supplementary_site"] = _window_fully_matched(12, 16).astype(int)
    df["supplementary_site_2"] = _window_fully_matched(16, 21).astype(int)
    df["empty_seed"] = (alignment.str[1:8].str.count("1") == 0).astype(int)

    # na=False keeps missing strings at 0, consistent with the comparisons
    # above, instead of producing NaN that cannot be cast to int.
    df["9_consecutive_match_anywhere"] = (
        alignment.str.contains("1{9,}", na=False).astype(int)
    )

    return df

In [3]:
# Load the merged data and run it through each feature-building step in turn;
# every step takes a DataFrame and returns the enriched DataFrame.
df = (
    pd.read_csv("results/4_merged_data.csv")
    .pipe(add_ta_sps_columns)
    .pipe(find_seed_type)
    .pipe(add_mirna_conservation_column)
    .pipe(generate_close_proximity_column)
)

df.head()

Unnamed: 0,id,mrna_start,mrna_end,pred_energy,mirna_start,mirna_end,mirna_dot_bracket_5to3,mirna_sequence,mirna_accession,mre_region,enst,extended_mrna_start,extended_mrna_end,extended_mrna_sequence,clash_mrna_start,clash_mrna_end,mre_start,mre_end,label,alignment_string,pred_num_basepairs,pred_seed_basepairs,ta_log10,sps_mean,anchor_a,6mer_seed,match_8,6mer_seed_1_mismatch,compensatory_site,supplementary_site,supplementary_site_2,empty_seed,9_consecutive_match_anywhere,mirna_conservation,midpoint,close_proximity
0,0727A-1038930_1,22,45,-27.7,0,20,)))))))))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TTTATACAGAACCTACTGCCTC,ENST00000340828,1760,1920,TTTAATATTTTTTTCTAGAAAACAGGTGACATTTGTATCTACGATA...,1790,1890,22,44,1,1111111111111111111100,20,6,3.393,-8.18,0,1,1,0,1,1,0,0,1,2.0,26,0
1,L1HS-1112536_1,11,30,-26.2,0,18,))))))))).)))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CGTGCAACCAACTACCTCATAT,ENST00000343455,3826,3958,TGCCAAGGAAATCAGCTAAATTACTACAAGCAGGAAATACCCGTGC...,3856,3928,11,33,1,1111111110111111110000,17,6,3.393,-8.18,0,1,1,0,1,1,0,0,1,2.0,15,0
2,L2HS-818542_2,3,24,-23.6,0,21,.)))))).))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CGCTATATGACCTGATGCCTTT,ENST00000436639,2354,2464,GCAGAACTCCTTTATGCTCTGAGAGCCATTACCCGCTATATGACCT...,2384,2434,3,25,1,0111111011111111111110,19,6,3.393,-8.18,0,1,0,0,1,1,1,0,1,2.0,7,0
3,L2HS-1161339_2,3,24,-23.8,0,21,)))))))...)))))))))))),TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TGACTATGCAACCATACCTTAC,ENST00000282516,6539,6653,AAATAAGACCCCAGCTCATGGTTAAACATGCAATGACTATGCAACC...,6569,6623,3,25,1,1111111000111111111111,19,6,3.393,-8.18,0,1,0,0,1,1,1,0,1,2.0,7,0
4,L2-407944_2,2,24,-24.0,0,21,))))))))))))))...)))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TACTGGGAAACCTGCTACTTCG,ENST00000340139,1133,1238,ACAGCAAAATTGAGTTCAACAACACAAAACAATACTGGGAAACCTG...,1163,1208,2,24,1,1111111111111100011110,18,6,3.393,-8.18,0,1,1,0,0,0,0,0,1,2.0,6,0


# Inferring seed types

In [4]:
# Boolean building blocks shared by several of the seed-type rules below.
has_anchor_a = df['anchor_a'] == 1
has_6mer = df['6mer_seed'] == 1
has_match_8 = df['match_8'] == 1
has_supplementary = df['supplementary_site'] == 1
has_supplementary_2 = df['supplementary_site_2'] == 1

# One entry per output column; insertion order is the column order.
seed_type_rules = {
    'seed_8mer': has_anchor_a & has_6mer & has_match_8,
    'seed_7mer_a1': has_anchor_a & has_6mer & (df['match_8'] == 0),
    'seed_7mer_m8': (
        (df['anchor_a'] == 0)
        & has_6mer
        & has_match_8
        & (df['supplementary_site'] == 0)
        & (df['supplementary_site_2'] == 0)
    ),
    'seed_compensatory': (
        (df['compensatory_site'] == 1)
        & (df['6mer_seed_1_mismatch'] == 1)
        & has_match_8
    ),
    'seed_clash_2': has_supplementary & has_6mer & has_match_8,
    'seed_clash_3': has_supplementary_2 & has_6mer & has_match_8,
    'seed_clash_4': (df['empty_seed'] == 1) & (df['9_consecutive_match_anywhere'] == 1),
    'seed_clash_5': (df['pred_num_basepairs'] > 10) & (df['6mer_seed'] == 0),
}

# Store every rule as a 0/1 integer column.
for seed_column, rule_mask in seed_type_rules.items():
    df[seed_column] = rule_mask.astype(int)

df.head()

Unnamed: 0,id,mrna_start,mrna_end,pred_energy,mirna_start,mirna_end,mirna_dot_bracket_5to3,mirna_sequence,mirna_accession,mre_region,enst,extended_mrna_start,extended_mrna_end,extended_mrna_sequence,clash_mrna_start,clash_mrna_end,mre_start,mre_end,label,alignment_string,pred_num_basepairs,pred_seed_basepairs,ta_log10,sps_mean,anchor_a,6mer_seed,match_8,6mer_seed_1_mismatch,compensatory_site,supplementary_site,supplementary_site_2,empty_seed,9_consecutive_match_anywhere,mirna_conservation,midpoint,close_proximity,seed_8mer,seed_7mer_a1,seed_7mer_m8,seed_compensatory,seed_clash_2,seed_clash_3,seed_clash_4,seed_clash_5
0,0727A-1038930_1,22,45,-27.7,0,20,)))))))))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TTTATACAGAACCTACTGCCTC,ENST00000340828,1760,1920,TTTAATATTTTTTTCTAGAAAACAGGTGACATTTGTATCTACGATA...,1790,1890,22,44,1,1111111111111111111100,20,6,3.393,-8.18,0,1,1,0,1,1,0,0,1,2.0,26,0,0,0,0,0,1,0,0,0
1,L1HS-1112536_1,11,30,-26.2,0,18,))))))))).)))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CGTGCAACCAACTACCTCATAT,ENST00000343455,3826,3958,TGCCAAGGAAATCAGCTAAATTACTACAAGCAGGAAATACCCGTGC...,3856,3928,11,33,1,1111111110111111110000,17,6,3.393,-8.18,0,1,1,0,1,1,0,0,1,2.0,15,0,0,0,0,0,1,0,0,0
2,L2HS-818542_2,3,24,-23.6,0,21,.)))))).))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CGCTATATGACCTGATGCCTTT,ENST00000436639,2354,2464,GCAGAACTCCTTTATGCTCTGAGAGCCATTACCCGCTATATGACCT...,2384,2434,3,25,1,0111111011111111111110,19,6,3.393,-8.18,0,1,0,0,1,1,1,0,1,2.0,7,0,0,0,0,0,0,0,0,0
3,L2HS-1161339_2,3,24,-23.8,0,21,)))))))...)))))))))))),TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TGACTATGCAACCATACCTTAC,ENST00000282516,6539,6653,AAATAAGACCCCAGCTCATGGTTAAACATGCAATGACTATGCAACC...,6569,6623,3,25,1,1111111000111111111111,19,6,3.393,-8.18,0,1,0,0,1,1,1,0,1,2.0,7,0,0,0,0,0,0,0,0,0
4,L2-407944_2,2,24,-24.0,0,21,))))))))))))))...)))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TACTGGGAAACCTGCTACTTCG,ENST00000340139,1133,1238,ACAGCAAAATTGAGTTCAACAACACAAAACAATACTGGGAAACCTG...,1163,1208,2,24,1,1111111111111100011110,18,6,3.393,-8.18,0,1,1,0,0,0,0,0,1,2.0,6,0,0,0,1,0,0,0,0,0


In [5]:
# Write the features computed so far, without the DataFrame index.
df.to_csv(path_or_buf="results/5_features.csv", index=False)


In [6]:
def calculate_au_content(sequence):
    """Return the fraction of A/T/U characters in ``sequence``.

    Matching is case-insensitive, so lowercase nucleotides are counted as
    well. Every character of the sequence contributes to the denominator,
    whatever it is.

    Parameters
    ----------
    sequence : str
        Nucleotide sequence. Values without a length (``None``, float NaN as
        produced by pandas for missing cells) are accepted and treated as
        missing.

    Returns
    -------
    float or None
        ``count(A, T, U) / len(sequence)``, or ``None`` when the sequence is
        empty or missing.
    """
    try:
        length = len(sequence)
    except TypeError:
        # Missing value (None / NaN): report "no content" instead of raising
        # from inside a pandas ``apply``.
        return None

    if length == 0:
        return None

    au_letters = ("A", "T", "U", "a", "t", "u")
    au_count = sum(1 for nucleotide in sequence if nucleotide in au_letters)
    return au_count / length

# AU fraction of each MRE region, computed element-wise.
mre_regions = df['mre_region']
df["mre_au_content"] = mre_regions.map(calculate_au_content)

df.head()

Unnamed: 0,id,mrna_start,mrna_end,pred_energy,mirna_start,mirna_end,mirna_dot_bracket_5to3,mirna_sequence,mirna_accession,mre_region,enst,extended_mrna_start,extended_mrna_end,extended_mrna_sequence,clash_mrna_start,clash_mrna_end,mre_start,mre_end,label,alignment_string,pred_num_basepairs,pred_seed_basepairs,ta_log10,sps_mean,anchor_a,6mer_seed,match_8,6mer_seed_1_mismatch,compensatory_site,supplementary_site,supplementary_site_2,empty_seed,9_consecutive_match_anywhere,mirna_conservation,midpoint,close_proximity,seed_8mer,seed_7mer_a1,seed_7mer_m8,seed_compensatory,seed_clash_2,seed_clash_3,seed_clash_4,seed_clash_5,mre_au_content
0,0727A-1038930_1,22,45,-27.7,0,20,)))))))))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TTTATACAGAACCTACTGCCTC,ENST00000340828,1760,1920,TTTAATATTTTTTTCTAGAAAACAGGTGACATTTGTATCTACGATA...,1790,1890,22,44,1,1111111111111111111100,20,6,3.393,-8.18,0,1,1,0,1,1,0,0,1,2.0,26,0,0,0,0,0,1,0,0,0,0.590909
1,L1HS-1112536_1,11,30,-26.2,0,18,))))))))).)))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CGTGCAACCAACTACCTCATAT,ENST00000343455,3826,3958,TGCCAAGGAAATCAGCTAAATTACTACAAGCAGGAAATACCCGTGC...,3856,3928,11,33,1,1111111110111111110000,17,6,3.393,-8.18,0,1,1,0,1,1,0,0,1,2.0,15,0,0,0,0,0,1,0,0,0,0.545455
2,L2HS-818542_2,3,24,-23.6,0,21,.)))))).))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CGCTATATGACCTGATGCCTTT,ENST00000436639,2354,2464,GCAGAACTCCTTTATGCTCTGAGAGCCATTACCCGCTATATGACCT...,2384,2434,3,25,1,0111111011111111111110,19,6,3.393,-8.18,0,1,0,0,1,1,1,0,1,2.0,7,0,0,0,0,0,0,0,0,0,0.545455
3,L2HS-1161339_2,3,24,-23.8,0,21,)))))))...)))))))))))),TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TGACTATGCAACCATACCTTAC,ENST00000282516,6539,6653,AAATAAGACCCCAGCTCATGGTTAAACATGCAATGACTATGCAACC...,6569,6623,3,25,1,1111111000111111111111,19,6,3.393,-8.18,0,1,0,0,1,1,1,0,1,2.0,7,0,0,0,0,0,0,0,0,0,0.590909
4,L2-407944_2,2,24,-24.0,0,21,))))))))))))))...)))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TACTGGGAAACCTGCTACTTCG,ENST00000340139,1133,1238,ACAGCAAAATTGAGTTCAACAACACAAAACAATACTGGGAAACCTG...,1163,1208,2,24,1,1111111111111100011110,18,6,3.393,-8.18,0,1,1,0,0,0,0,0,1,2.0,6,0,0,0,1,0,0,0,0,0,0.5


In [7]:
# Number of nucleotides taken on each side of the MRE for the local AU content.
LOCAL_AU_FLANK = 30

# Distance between the boundaries of the CLASH fragment and the boundaries of
# the extended mRNA sequence.
df["temp_start_offset"] = df["clash_mrna_start"] - df["extended_mrna_start"]
df["temp_end_offset"] = df["extended_mrna_end"] - df["clash_mrna_end"]

# Indices of the MRE region inside the extended sequence.
df["temp_mre_start_in_extended"] = df["mre_start"] + df["temp_start_offset"]
df["temp_mre_end_in_extended"] = df["temp_mre_start_in_extended"] + df["mre_region"].str.len()


def _local_au_window(row, flank=LOCAL_AU_FLANK):
    """Return the MRE plus up to ``flank`` characters on each side, cut from the extended sequence."""
    # Clamp at 0: a negative slice start would be counted from the END of the
    # string and give a wrong (usually empty) window whenever the MRE starts
    # fewer than `flank` characters into the extended sequence.
    window_start = max(row["temp_mre_start_in_extended"] - flank, 0)
    # An end past the sequence length needs no clamp; slicing stops at the end.
    window_end = row["temp_mre_end_in_extended"] + flank
    return row["extended_mrna_sequence"][window_start:window_end]


# Local AU content: AU fraction of the MRE together with its flanks.
df["temp_sequence_for_au_content_calculation"] = df.apply(_local_au_window, axis=1)
df["local_au_content"] = df["temp_sequence_for_au_content_calculation"].apply(calculate_au_content)


In [8]:
# Length of each miRNA; used to turn every start coordinate into an end coordinate.
mirna_length = df["mirna_sequence"].str.len()
df["temp_mirna_length"] = mirna_length

# Shift from the start of the paired mRNA stretch back to where miRNA index 0 falls.
mirna_shift = df["mrna_start"] - df["mirna_start"]

# Same position expressed relative to clash_mrna_start ...
df["test_mirna_start_aligned_on_clash"] = df["clash_mrna_start"] + mirna_shift
df["test_mirna_end_aligned_on_clash"] = df["test_mirna_start_aligned_on_clash"] + mirna_length

# ... with no origin added ...
df["test_mirna_start_aligned_on_clash_sequence"] = mirna_shift
df["test_mirna_end_aligned_on_clash_sequence"] = df["test_mirna_start_aligned_on_clash_sequence"] + mirna_length

# ... and relative to temp_start_offset (the offset into the extended sequence).
df["test_mirna_start_aligned_on_extended_sequence"] = df["temp_start_offset"] + mirna_shift
df["test_mirna_end_aligned_on_extended_sequence"] = df["test_mirna_start_aligned_on_extended_sequence"] + mirna_length



df.head()

Unnamed: 0,id,mrna_start,mrna_end,pred_energy,mirna_start,mirna_end,mirna_dot_bracket_5to3,mirna_sequence,mirna_accession,mre_region,enst,extended_mrna_start,extended_mrna_end,extended_mrna_sequence,clash_mrna_start,clash_mrna_end,mre_start,mre_end,label,alignment_string,pred_num_basepairs,pred_seed_basepairs,ta_log10,sps_mean,anchor_a,6mer_seed,match_8,6mer_seed_1_mismatch,compensatory_site,supplementary_site,supplementary_site_2,empty_seed,9_consecutive_match_anywhere,mirna_conservation,midpoint,close_proximity,seed_8mer,seed_7mer_a1,seed_7mer_m8,seed_compensatory,seed_clash_2,seed_clash_3,seed_clash_4,seed_clash_5,mre_au_content,temp_start_offset,temp_end_offset,temp_mre_start_in_extended,temp_mre_end_in_extended,temp_sequence_for_au_content_calculation,local_au_content,temp_mirna_length,test_mirna_start_aligned_on_clash,test_mirna_end_aligned_on_clash,test_mirna_start_aligned_on_clash_sequence,test_mirna_end_aligned_on_clash_sequence,test_mirna_start_aligned_on_extended_sequence,test_mirna_end_aligned_on_extended_sequence
0,0727A-1038930_1,22,45,-27.7,0,20,)))))))))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TTTATACAGAACCTACTGCCTC,ENST00000340828,1760,1920,TTTAATATTTTTTTCTAGAAAACAGGTGACATTTGTATCTACGATA...,1790,1890,22,44,1,1111111111111111111100,20,6,3.393,-8.18,0,1,1,0,1,1,0,0,1,2.0,26,0,0,0,0,0,1,0,0,0,0.590909,30,30,52,74,CAGGTGACATTTGTATCTACGATAAAAATTTTTATACAGAACCTAC...,0.658537,22,1812,1834,22,44,52,74
1,L1HS-1112536_1,11,30,-26.2,0,18,))))))))).)))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CGTGCAACCAACTACCTCATAT,ENST00000343455,3826,3958,TGCCAAGGAAATCAGCTAAATTACTACAAGCAGGAAATACCCGTGC...,3856,3928,11,33,1,1111111110111111110000,17,6,3.393,-8.18,0,1,1,0,1,1,0,0,1,2.0,15,0,0,0,0,0,1,0,0,0,0.545455,30,30,41,63,TCAGCTAAATTACTACAAGCAGGAAATACCCGTGCAACCAACTACC...,0.621951,22,3867,3889,11,33,41,63
2,L2HS-818542_2,3,24,-23.6,0,21,.)))))).))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CGCTATATGACCTGATGCCTTT,ENST00000436639,2354,2464,GCAGAACTCCTTTATGCTCTGAGAGCCATTACCCGCTATATGACCT...,2384,2434,3,25,1,0111111011111111111110,19,6,3.393,-8.18,0,1,0,0,1,1,1,0,1,2.0,7,0,0,0,0,0,0,0,0,0,0.545455,30,30,33,55,GAACTCCTTTATGCTCTGAGAGCCATTACCCGCTATATGACCTGAT...,0.585366,22,2387,2409,3,25,33,55
3,L2HS-1161339_2,3,24,-23.8,0,21,)))))))...)))))))))))),TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TGACTATGCAACCATACCTTAC,ENST00000282516,6539,6653,AAATAAGACCCCAGCTCATGGTTAAACATGCAATGACTATGCAACC...,6569,6623,3,25,1,1111111000111111111111,19,6,3.393,-8.18,0,1,0,0,1,1,1,0,1,2.0,7,0,0,0,0,0,0,0,0,0,0.590909,30,30,33,55,TAAGACCCCAGCTCATGGTTAAACATGCAATGACTATGCAACCATA...,0.621951,22,6572,6594,3,25,33,55
4,L2-407944_2,2,24,-24.0,0,21,))))))))))))))...)))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TACTGGGAAACCTGCTACTTCG,ENST00000340139,1133,1238,ACAGCAAAATTGAGTTCAACAACACAAAACAATACTGGGAAACCTG...,1163,1208,2,24,1,1111111111111100011110,18,6,3.393,-8.18,0,1,1,0,0,0,0,0,1,2.0,6,0,0,0,1,0,0,0,0,0,0.5,30,30,32,54,AGCAAAATTGAGTTCAACAACACAAAACAATACTGGGAAACCTGCT...,0.585366,22,1165,1187,2,24,32,54


In [9]:
# Overwrite the features file with the final frame, without the DataFrame index.
# NOTE(review): the temp_* and test_* helper columns created above are written
# out as well — confirm that downstream steps expect them.
df.to_csv(path_or_buf="results/5_features.csv", index=False)
