In [1]:
# Step up one directory so the relative paths used below (scripts/, data/, results/) resolve.
# NOTE(review): this is not idempotent — re-running the cell climbs one more level each time.
%cd ..


/run/media/nazif/2F946E411BA61D49/thesis


In [2]:

import pandas as pd
from scripts.utils_latest import *
# Show every column when a wide frame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

In [3]:
def add_ta_sps_columns(df):
    """Attach ``ta_log10`` and ``sps_mean`` to every row via the miRNA seed.

    The join key is ``mirna_sequence[1:8]`` with every ``T`` rewritten as
    ``U``; it is matched against the ``seed_8mer`` column of
    ``data/processed/ta_sps/ta_sps.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``mirna_sequence`` column. It is left untouched: the
        join key lives only on an intermediate copy.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``ta_log10`` and ``sps_mean`` appended. Rows whose
        seed has no entry in the lookup table get NaN (left join).
    """
    # Read ta sps data, keeping only the key and the two value columns
    ta_sps_df = pd.read_csv(
        "data/processed/ta_sps/ta_sps.csv",
        usecols=["seed_8mer", "ta_log10", "sps_mean"],
    ).rename(columns={"seed_8mer": "seed"})

    # Build the join key without writing it into the caller's frame
    seed = df["mirna_sequence"].str[1:8].str.replace("T", "U", regex=False)

    return (
        df.assign(seed=seed)
        .merge(ta_sps_df, on="seed", how="left")
        .drop(columns=["seed"])
    )


def add_mirna_conservation_column(df):
    """Left-join a ``mirna_conservation`` column onto ``df``.

    The lookup table is read from ``data/processed/targetscan/targetscan.csv``;
    its ``accession`` and ``conservation`` columns are renamed to
    ``mirna_accession`` and ``mirna_conservation`` and every other column is
    discarded before merging on ``mirna_accession``.

    Returns the merged frame; rows without a matching accession get NaN.
    """
    conservation_lookup = (
        pd.read_csv("data/processed/targetscan/targetscan.csv")
        .rename(
            columns={
                "accession": "mirna_accession",
                "conservation": "mirna_conservation",
            }
        )
        .loc[:, ["mirna_accession", "mirna_conservation"]]
    )
    return df.merge(conservation_lookup, on="mirna_accession", how="left")


def find_seed_type(df):
    """Add 0/1 pairing-pattern flags derived from ``alignment_string``.

    ``alignment_string`` is read as a per-position string of ``"1"`` / ``"0"``
    characters; all indices below are 0-based positions in that string.
    Columns are added to ``df`` in place and the same frame is returned.

    Added columns:
        anchor_a                      last character of ``mre_region`` is "A"
        6mer_seed                     no "0" in ``[1:7]``
        match_8                       character at index 7 is "1"
        6mer_seed_1_mismatch          exactly one "0" in ``[1:7]``
        compensatory_site             no "0" in ``[12:17]``
        supplementary_site            no "0" in ``[12:16]``
        supplementary_site_2          no "0" in ``[16:21]``
        empty_seed                    no "1" in ``[1:8]``
        9_consecutive_match_anywhere  a run of at least nine "1" anywhere

    Rows with a missing ``alignment_string`` get 0 for every alignment-based
    flag instead of raising when the last flag is cast to int.

    NOTE(review): a window that is shorter than the slice (or empty, for a
    short string) still counts as "no 0" — confirm that every alignment
    string is long enough for the windows above.
    """
    alignment = df["alignment_string"]

    def window_count_is(start, stop, char, expected):
        # 1 where `char` occurs exactly `expected` times in alignment[start:stop]
        return (alignment.str[start:stop].str.count(char) == expected).astype(int)

    df["anchor_a"] = (df["mre_region"].str[-1] == "A").astype(int)
    df["6mer_seed"] = window_count_is(1, 7, "0", 0)
    df["match_8"] = (alignment.str[7] == "1").astype(int)
    df["6mer_seed_1_mismatch"] = window_count_is(1, 7, "0", 1)

    df["compensatory_site"] = window_count_is(12, 17, "0", 0)

    df["supplementary_site"] = window_count_is(12, 16, "0", 0)
    df["supplementary_site_2"] = window_count_is(16, 21, "0", 0)
    df["empty_seed"] = window_count_is(1, 8, "1", 0)

    # na=False: a missing alignment string would otherwise yield NaN here and
    # make astype(int) raise, unlike the comparisons above which give 0.
    df["9_consecutive_match_anywhere"] = (
        alignment.str.contains(r"1{9,}", regex=True, na=False).astype(int)
    )

    return df

In [4]:
# Load the merged data and add the TA/SPS, pairing-pattern and conservation features.
# `alignment_string` consists only of digits, so its dtype is pinned to str:
# left to inference, pandas parses it as a number whenever the values fit an
# integer type, which drops leading zeros and breaks the `.str` slicing in
# find_seed_type.
df = pd.read_csv("results/4_merged_data.csv", dtype={"alignment_string": str})
df = add_ta_sps_columns(df)
df = find_seed_type(df)
df = add_mirna_conservation_column(df)

df.head()


Unnamed: 0,mrna_start,mrna_end,pred_energy,mirna_start,mirna_end,mirna_dot_bracket_5to3,mirna_sequence,mirna_accession,mre_region,enst,label,alignment_string,pred_num_basepairs,pred_seed_basepairs,ta_log10,sps_mean,anchor_a,6mer_seed,match_8,6mer_seed_1_mismatch,compensatory_site,supplementary_site,supplementary_site_2,empty_seed,9_consecutive_match_anywhere,mirna_conservation
0,23,46,-27.7,1,21,)))))))))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TTTATACAGAACCTACTGCCTC,ENST00000340828,1,1111111111111111111100,20,6,3.393,-8.18,0,1,1,0,1,1,0,0,1,2.0
1,12,31,-26.2,1,19,))))))))).)))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CGTGCAACCAACTACCTCATAT,ENST00000343455,1,1111111110111111110000,17,6,3.393,-8.18,0,1,1,0,1,1,0,0,1,2.0
2,4,25,-23.6,1,22,.)))))).))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CGCTATATGACCTGATGCCTTT,ENST00000436639,1,0111111011111111111110,19,6,3.393,-8.18,0,1,0,0,1,1,1,0,1,2.0
3,4,25,-23.8,1,22,)))))))...)))))))))))),TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TGACTATGCAACCATACCTTAC,ENST00000282516,1,1111111000111111111111,19,6,3.393,-8.18,0,1,0,0,1,1,1,0,1,2.0
4,3,25,-24.0,1,22,))))))))))))))...)))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TACTGGGAAACCTGCTACTTCG,ENST00000340139,1,1111111111111100011110,18,6,3.393,-8.18,0,1,1,0,0,0,0,0,1,2.0


In [5]:
# The next preview shows two new columns after this call: `midpoint` and `close_proximity`.
# The helper is not defined in this notebook — presumably it comes from the
# star import of scripts.utils_latest; verify there.
df = generate_close_proximity_column(df)

In [6]:
# Preview the first rows to check the columns added by the previous cell.
df.head()

Unnamed: 0,mrna_start,mrna_end,pred_energy,mirna_start,mirna_end,mirna_dot_bracket_5to3,mirna_sequence,mirna_accession,mre_region,enst,label,alignment_string,pred_num_basepairs,pred_seed_basepairs,ta_log10,sps_mean,anchor_a,6mer_seed,match_8,6mer_seed_1_mismatch,compensatory_site,supplementary_site,supplementary_site_2,empty_seed,9_consecutive_match_anywhere,mirna_conservation,midpoint,close_proximity
0,23,46,-27.7,1,21,)))))))))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TTTATACAGAACCTACTGCCTC,ENST00000340828,1,1111111111111111111100,20,6,3.393,-8.18,0,1,1,0,1,1,0,0,1,2.0,27,0
1,12,31,-26.2,1,19,))))))))).)))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CGTGCAACCAACTACCTCATAT,ENST00000343455,1,1111111110111111110000,17,6,3.393,-8.18,0,1,1,0,1,1,0,0,1,2.0,16,0
2,4,25,-23.6,1,22,.)))))).))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CGCTATATGACCTGATGCCTTT,ENST00000436639,1,0111111011111111111110,19,6,3.393,-8.18,0,1,0,0,1,1,1,0,1,2.0,8,0
3,4,25,-23.8,1,22,)))))))...)))))))))))),TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TGACTATGCAACCATACCTTAC,ENST00000282516,1,1111111000111111111111,19,6,3.393,-8.18,0,1,0,0,1,1,1,0,1,2.0,8,0
4,3,25,-24.0,1,22,))))))))))))))...)))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TACTGGGAAACCTGCTACTTCG,ENST00000340139,1,1111111111111100011110,18,6,3.393,-8.18,0,1,1,0,0,0,0,0,1,2.0,7,0


In [7]:
# Combine the per-position flags from find_seed_type into 0/1 seed-class columns.

# anchor A + 6mer seed + match at index 7
df["seed_8mer"] = (
    df["anchor_a"].eq(1) & df["6mer_seed"].eq(1) & df["match_8"].eq(1)
).astype(int)

# anchor A + 6mer seed, but no match at index 7
df["seed_7mer_a1"] = (
    df["anchor_a"].eq(1) & df["6mer_seed"].eq(1) & df["match_8"].eq(0)
).astype(int)

# 6mer seed + match at index 7, no anchor A and neither supplementary flag set
df["seed_7mer_m8"] = (
    df["anchor_a"].eq(0)
    & df["6mer_seed"].eq(1)
    & df["match_8"].eq(1)
    & df["supplementary_site"].eq(0)
    & df["supplementary_site_2"].eq(0)
).astype(int)

# one mismatch in the 6mer seed, rescued by the compensatory window, with match at index 7
df["seed_compensatory"] = (
    df["compensatory_site"].eq(1)
    & df["6mer_seed_1_mismatch"].eq(1)
    & df["match_8"].eq(1)
).astype(int)

# 6mer seed + match at index 7 together with the first / second supplementary window
df["seed_clash_2"] = (
    df["supplementary_site"].eq(1) & df["6mer_seed"].eq(1) & df["match_8"].eq(1)
).astype(int)
df["seed_clash_3"] = (
    df["supplementary_site_2"].eq(1) & df["6mer_seed"].eq(1) & df["match_8"].eq(1)
).astype(int)

# no seed pairing at all, yet a run of nine or more matches elsewhere
df["seed_clash_4"] = (
    df["empty_seed"].eq(1) & df["9_consecutive_match_anywhere"].eq(1)
).astype(int)

# more than 10 predicted base pairs without a perfect 6mer seed
df["seed_clash_5"] = (
    df["pred_num_basepairs"].gt(10) & df["6mer_seed"].eq(0)
).astype(int)





In [11]:
# Preview the first rows to check the seed_* class columns.
df.head()

Unnamed: 0,mrna_start,mrna_end,pred_energy,mirna_start,mirna_end,mirna_dot_bracket_5to3,mirna_sequence,mirna_accession,mre_region,enst,label,alignment_string,pred_num_basepairs,pred_seed_basepairs,ta_log10,sps_mean,anchor_a,6mer_seed,match_8,6mer_seed_1_mismatch,compensatory_site,supplementary_site,supplementary_site_2,empty_seed,9_consecutive_match_anywhere,mirna_conservation,midpoint,close_proximity,seed_8mer,seed_7mer_a1,seed_7mer_m8,seed_compensatory,seed_clash_2,seed_clash_3,seed_clash_4,seed_clash_5
0,23,46,-27.7,1,21,)))))))))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TTTATACAGAACCTACTGCCTC,ENST00000340828,1,1111111111111111111100,20,6,3.393,-8.18,0,1,1,0,1,1,0,0,1,2.0,27,0,0,0,0,0,1,0,0,0
1,12,31,-26.2,1,19,))))))))).)))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CGTGCAACCAACTACCTCATAT,ENST00000343455,1,1111111110111111110000,17,6,3.393,-8.18,0,1,1,0,1,1,0,0,1,2.0,16,0,0,0,0,0,1,0,0,0
2,4,25,-23.6,1,22,.)))))).))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CGCTATATGACCTGATGCCTTT,ENST00000436639,1,0111111011111111111110,19,6,3.393,-8.18,0,1,0,0,1,1,1,0,1,2.0,8,0,0,0,0,0,0,0,0,0
3,4,25,-23.8,1,22,)))))))...)))))))))))),TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TGACTATGCAACCATACCTTAC,ENST00000282516,1,1111111000111111111111,19,6,3.393,-8.18,0,1,0,0,1,1,1,0,1,2.0,8,0,0,0,0,0,0,0,0,0
4,3,25,-24.0,1,22,))))))))))))))...)))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TACTGGGAAACCTGCTACTTCG,ENST00000340139,1,1111111111111100011110,18,6,3.393,-8.18,0,1,1,0,0,0,0,0,1,2.0,7,0,0,0,1,0,0,0,0,0


## TODO: For the negative data, should the seed type be set to 0 no matter what? There is no seed formation there, after all.

In [12]:
# Persist the feature table; index=False keeps the row index out of the file.
# NOTE(review): the `Unnamed: 0` column carried over from the input CSV is still written out.
df.to_csv("results/5_features.csv", index=False)