In [1]:
%cd ..

import pandas as pd
import numpy as np

from scripts.utils_latest import *
pd.set_option("display.max_columns", None)

/run/media/nazif/2F946E411BA61D49/thesis


In [2]:
def generate_alignment_string_from_dot_bracket(df):
    """Add an ``alignment_string`` column encoding the miRNA pairing as "0"/"1" text.

    For every row the string is assembled from three pieces:

    * ``mirna_start`` leading "0" characters,
    * ``mirna_dot_bracket_5to3`` with every "." turned into "0" and every ")"
      turned into "1" (any other character is kept unchanged),
    * ``len(mirna_sequence) - mirna_end - 1`` trailing "0" characters.

    Args:
        df: DataFrame with the columns ``mirna_start``, ``mirna_end``,
            ``mirna_dot_bracket_5to3`` and ``mirna_sequence``.

    Returns:
        The same DataFrame object, modified in place with the new
        ``alignment_string`` column.
    """

    def _encode(record):
        # Pieces are built in 5' padding -> paired region -> 3' padding order.
        leading_pad = record.mirna_start * "0"
        paired_flags = (
            record["mirna_dot_bracket_5to3"].replace(".", "0").replace(")", "1")
        )
        trailing_pad = (len(record.mirna_sequence) - record.mirna_end - 1) * "0"
        return leading_pad + paired_flags + trailing_pad

    df["alignment_string"] = [_encode(record) for _, record in df.iterrows()]

    return df


def generate_match_count_columns(df):
    """Add base-pair counts derived from the ``alignment_string`` column.

    Two columns are written:

    * ``pred_num_basepairs``: number of "1" characters in the whole string.
    * ``pred_seed_basepairs``: number of "1" characters in the slice ``[1:7]``
      (0-based indices 1 through 6).

    Args:
        df: DataFrame containing an ``alignment_string`` column of strings.

    Returns:
        The same DataFrame object, modified in place.
    """
    alignment = df["alignment_string"]

    df["pred_num_basepairs"] = alignment.apply(lambda flags: flags.count("1"))
    df["pred_seed_basepairs"] = alignment.apply(
        lambda flags: flags[1:7].count("1"))

    return df


def generate_ta_sps_columns(df, ta_sps_path="data/processed/ta_sps/ta_sps.csv"):
    """Attach the ``ta_log10`` and ``sps_mean`` columns by joining on the miRNA seed.

    The join key is ``mirna_sequence[1:8]`` (0-based indices 1 through 7) with
    every "T" replaced by "U". It is matched against the ``seed_8mer`` column
    of the TA/SPS table using a left join, so rows without a match keep NaN in
    the two new columns.

    The key is kept under a private column name on a copy of ``df``; the
    caller's frame is not modified and an existing ``seed`` column (if any) is
    neither overwritten nor dropped.

    Args:
        df: DataFrame with a ``mirna_sequence`` string column.
        ta_sps_path: CSV file providing the ``seed_8mer``, ``ta_log10`` and
            ``sps_mean`` columns. Defaults to the path used by the pipeline.

    Returns:
        A new DataFrame: the columns of ``df`` followed by the columns read
        from the TA/SPS table (except the join key).
    """
    # Private key name so the temporary join column cannot collide with, or
    # leak into, columns owned by the caller.
    seed_key = "_ta_sps_seed_key"

    ta_sps_df = pd.read_csv(
        ta_sps_path, usecols=["seed_8mer", "ta_log10", "sps_mean"]
    ).rename(columns={"seed_8mer": seed_key})

    # assign() returns a copy, so the input frame never sees the key column.
    keyed = df.assign(
        **{seed_key: df["mirna_sequence"].str[1:8].str.replace("T", "U")}
    )

    return keyed.merge(ta_sps_df, on=seed_key, how="left").drop(columns=[seed_key])


def generate_mirna_conservation_column(df):
    """Attach a ``mirna_conservation`` column looked up by ``mirna_accession``.

    The lookup table is read from ``data/processed/targetscan/targetscan.csv``;
    its ``accession`` and ``conservation`` columns are renamed to
    ``mirna_accession`` and ``mirna_conservation`` and all other columns are
    discarded before a left join onto ``df``.

    Args:
        df: DataFrame with a ``mirna_accession`` column.

    Returns:
        A new DataFrame with ``mirna_conservation`` appended (NaN where the
        accession has no entry in the lookup table).
    """
    conservation_lookup = (
        pd.read_csv("data/processed/targetscan/targetscan.csv")
        .rename(columns={"accession": "mirna_accession",
                         "conservation": "mirna_conservation"})
        .loc[:, ["mirna_accession", "mirna_conservation"]]
    )
    return df.merge(conservation_lookup, on="mirna_accession", how="left")


def generate_important_sites(df):
    """Add 0/1 indicator columns for positional features of each alignment.

    All slices below are 0-based Python slices of ``alignment_string``:

    * ``anchor_a``: last character of ``mre_region`` is "A".
    * ``6mer_seed``: no "0" in ``[1:7]``.
    * ``match_8``: character at index 7 is "1".
    * ``6mer_seed_1_mismatch``: exactly one "0" in ``[1:7]``.
    * ``compensatory_site``: no "0" in ``[12:17]``.
    * ``supplementary_site``: no "0" in ``[12:16]``.
    * ``supplementary_site_2``: no "0" in ``[16:21]``.
    * ``empty_seed``: no "1" in ``[1:8]``.
    * ``9_consecutive_match_anywhere``: a run of nine or more "1" anywhere.

    Args:
        df: DataFrame with ``alignment_string`` and ``mre_region`` columns.

    Returns:
        The same DataFrame object, modified in place.
    """
    df["anchor_a"] = (df["mre_region"].str[-1] == "A").astype(int)

    flags = df["alignment_string"]

    def unpaired_in(lo, hi):
        # Number of "0" characters inside the window [lo:hi).
        return flags.str[lo:hi].str.count("0")

    seed_core_unpaired = unpaired_in(1, 7)
    df["6mer_seed"] = (seed_core_unpaired == 0).astype(int)
    df["match_8"] = (flags.str[7] == "1").astype(int)
    df["6mer_seed_1_mismatch"] = (seed_core_unpaired == 1).astype(int)

    # Windows outside the seed that must be fully paired to set the flag.
    fully_paired_windows = {
        "compensatory_site": (12, 17),
        "supplementary_site": (12, 16),
        "supplementary_site_2": (16, 21),
    }
    for column, (lo, hi) in fully_paired_windows.items():
        df[column] = (unpaired_in(lo, hi) == 0).astype(int)

    paired_in_seed = flags.str[1:8].str.count("1")
    df["empty_seed"] = (paired_in_seed == 0).astype(int)

    has_long_run = flags.str.contains("1{9,}")
    df["9_consecutive_match_anywhere"] = has_long_run.astype(int)

    return df


def generate_seed_type_columns(df):
    """Combine the site indicator columns into 0/1 seed-type columns.

    Each output column is the logical AND of the listed conditions:

    * ``seed_8mer``: anchor_a == 1, 6mer_seed == 1, match_8 == 1
    * ``seed_7mer_a1``: anchor_a == 1, 6mer_seed == 1, match_8 == 0
    * ``seed_7mer_m8``: anchor_a == 0, 6mer_seed == 1, match_8 == 1,
      supplementary_site == 0, supplementary_site_2 == 0
    * ``seed_compensatory``: compensatory_site == 1,
      6mer_seed_1_mismatch == 1, match_8 == 1
    * ``seed_clash_2``: supplementary_site == 1, 6mer_seed == 1, match_8 == 1
    * ``seed_clash_3``: supplementary_site_2 == 1, 6mer_seed == 1, match_8 == 1
    * ``seed_clash_4``: empty_seed == 1, 9_consecutive_match_anywhere == 1
    * ``seed_clash_5``: pred_num_basepairs > 10, 6mer_seed == 0

    The categories are not mutually exclusive; a row can set several flags.

    Args:
        df: DataFrame already holding the indicator columns named above plus
            ``pred_num_basepairs``.

    Returns:
        The same DataFrame object, modified in place.
    """
    anchor_present = df['anchor_a'] == 1
    anchor_absent = df['anchor_a'] == 0
    seed_paired = df['6mer_seed'] == 1
    seed_broken = df['6mer_seed'] == 0
    pos8_paired = df['match_8'] == 1
    pos8_unpaired = df['match_8'] == 0
    supp_present = df['supplementary_site'] == 1
    supp_absent = df['supplementary_site'] == 0
    supp2_present = df['supplementary_site_2'] == 1
    supp2_absent = df['supplementary_site_2'] == 0
    compensatory = df['compensatory_site'] == 1
    one_seed_mismatch = df['6mer_seed_1_mismatch'] == 1
    seed_empty = df['empty_seed'] == 1
    long_run = df['9_consecutive_match_anywhere'] == 1
    many_pairs = df['pred_num_basepairs'] > 10

    # Insertion order of this mapping is the order the columns are added in.
    rules = {
        'seed_8mer': anchor_present & seed_paired & pos8_paired,
        'seed_7mer_a1': anchor_present & seed_paired & pos8_unpaired,
        'seed_7mer_m8': (anchor_absent & seed_paired & pos8_paired
                         & supp_absent & supp2_absent),
        'seed_compensatory': compensatory & one_seed_mismatch & pos8_paired,
        'seed_clash_2': supp_present & seed_paired & pos8_paired,
        'seed_clash_3': supp2_present & seed_paired & pos8_paired,
        'seed_clash_4': seed_empty & long_run,
        'seed_clash_5': many_pairs & seed_broken,
    }
    for column, mask in rules.items():
        df[column] = mask.astype(int)

    return df


def _calculate_au_content(sequence):
    """Return the fraction of "A", "T" and "U" characters in ``sequence``.

    Counting is case-sensitive (only upper-case letters are counted).
    Returns ``None`` for an empty sequence so no division by zero occurs.
    """
    if len(sequence) == 0:
        return None
    au_count = sequence.count('A') + sequence.count('T') + sequence.count('U')
    return au_count / len(sequence)


def generate_mre_au_content_column(df):
    """Add ``mre_au_content``: the A/T/U fraction of each ``mre_region`` string.

    Args:
        df: DataFrame with an ``mre_region`` column of strings.

    Returns:
        The same DataFrame object, modified in place. Empty regions yield a
        missing value.
    """
    df["mre_au_content"] = df['mre_region'].apply(_calculate_au_content)

    return df


def generate_au_content_column(df, flank=30):
    """Add ``au_content``: the A/T/U fraction of the site plus flanking sequence.

    For each row the window
    ``full_sequence_of_transcript[mre_start - flank : mre_end + flank]`` is
    taken, with the start clipped to 0 and the end clipped to the transcript
    length, and the A/T/U fraction of that window is stored.

    No temporary columns are written to ``df``; only ``au_content`` is added.
    An empty frame is handled and simply receives an empty ``au_content``
    column.

    Args:
        df: DataFrame with ``full_sequence_of_transcript`` (strings),
            ``mre_start`` and ``mre_end`` (integers) columns.
        flank: Number of characters added on each side of the
            ``[mre_start, mre_end)`` interval. Defaults to 30.

    Returns:
        The same DataFrame object, modified in place. Rows whose window is
        empty receive a missing value.
    """
    # clip keeps windows that would start before the transcript at index 0
    window_starts = (df["mre_start"] - flank).clip(lower=0)
    window_ends = df["mre_end"] + flank

    windows = pd.Series(
        [
            # min() handles windows extending past the end of the transcript
            transcript[start:min(end, len(transcript))]
            for transcript, start, end in zip(
                df["full_sequence_of_transcript"].tolist(),
                window_starts.tolist(),
                window_ends.tolist(),
            )
        ],
        index=df.index,
        dtype=object,
    )

    df["au_content"] = windows.apply(_calculate_au_content)

    return df

In [3]:
# Load the merged interaction table (presumably the output of the preceding
# pipeline step, judging by the "4_" prefix of the file name).
# NOTE(review): the displayed frame contains an "Unnamed: 0" column, which
# suggests the CSV was written together with its index. Consider reading with
# index_col=0 (or writing with index=False upstream) -- confirm that nothing
# downstream relies on that column first.
df = pd.read_csv("results/4_merged_data.csv")
df.head()

Unnamed: 0,id,mrna_start,mrna_end,pred_energy,mirna_start,mirna_end,mirna_dot_bracket_5to3,mirna_sequence,mirna_accession,mre_region,enst,full_sequence_of_transcript,extended_mrna_start,extended_mrna_end,extended_mrna_sequence,clash_mrna_start,clash_mrna_end,mre_start,mre_end,label
0,0727A-1038930_1,22,45,-27.7,0,20,)))))))))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TTATACAGAACCTACTGCCTCA,ENST00000340828,AGGGCAGGCGCGGCCCCTTCGGCTCCGAGCTGACCCTGATCAGGGC...,1760,1920,TTTAATATTTTTTTCTAGAAAACAGGTGACATTTGTATCTACGATA...,1790,1890,1813,1835,1
1,L1HS-1112536_1,11,30,-26.2,0,18,))))))))).)))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,ACCCGTGCAACCAACTACCTCA,ENST00000343455,CGGAGGCGCGGCGCAGGCTGCTGCAGGCCCAGGTGAATGGAGTAAC...,3826,3958,TGCCAAGGAAATCAGCTAAATTACTACAAGCAGGAAATACCCGTGC...,3856,3928,3864,3886,1
2,L2HS-818542_2,3,24,-23.6,0,21,.)))))).))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CCGCTATATGACCTGATGCCTT,ENST00000436639,GATTGCCAGGGCCGCCCTGTGCCCTCTGGCTCGGCGGTGGTGGGCG...,2354,2464,GCAGAACTCCTTTATGCTCTGAGAGCCATTACCCGCTATATGACCT...,2384,2434,2386,2408,1
3,L2HS-1161339_2,3,24,-23.8,0,21,)))))))...)))))))))))),TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,ATGACTATGCAACCATACCTTA,ENST00000282516,TCCGGTCGGCATTTTGTTCTGAGAGGGAGAGACGGAACGAGAGAGA...,6539,6653,AAATAAGACCCCAGCTCATGGTTAAACATGCAATGACTATGCAACC...,6569,6623,6571,6593,1
4,L2-407944_2,2,24,-24.0,0,21,))))))))))))))...)))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TACTGGGAAACCTGCTACTTCG,ENST00000340139,AGGGACCACCGGGAACAGACGGATCGGCAGGGCGGGGCGGAACGGT...,1133,1238,ACAGCAAAATTGAGTTCAACAACACAAAACAATACTGGGAAACCTG...,1163,1208,1165,1187,1


In [4]:
# Build the feature columns. The call order matters:
#   * alignment_string must exist before the match counts and the site
#     indicators, which both slice it;
#   * generate_seed_type_columns reads the site indicators and
#     pred_num_basepairs, so it has to run after both of those steps.
df = generate_alignment_string_from_dot_bracket(df)
df = generate_match_count_columns(df)
df = generate_ta_sps_columns(df)
df = generate_important_sites(df)
df = generate_mirna_conservation_column(df)
# NOTE(review): generate_close_proximity_column is not defined in the cells
# shown here (presumably it comes from scripts.utils_latest); this step is
# currently disabled.
# df = generate_close_proximity_column(df)
df = generate_seed_type_columns(df)

df.head()

Unnamed: 0,id,mrna_start,mrna_end,pred_energy,mirna_start,mirna_end,mirna_dot_bracket_5to3,mirna_sequence,mirna_accession,mre_region,enst,full_sequence_of_transcript,extended_mrna_start,extended_mrna_end,extended_mrna_sequence,clash_mrna_start,clash_mrna_end,mre_start,mre_end,label,alignment_string,pred_num_basepairs,pred_seed_basepairs,ta_log10,sps_mean,anchor_a,6mer_seed,match_8,6mer_seed_1_mismatch,compensatory_site,supplementary_site,supplementary_site_2,empty_seed,9_consecutive_match_anywhere,mirna_conservation,seed_8mer,seed_7mer_a1,seed_7mer_m8,seed_compensatory,seed_clash_2,seed_clash_3,seed_clash_4,seed_clash_5
0,0727A-1038930_1,22,45,-27.7,0,20,)))))))))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TTATACAGAACCTACTGCCTCA,ENST00000340828,AGGGCAGGCGCGGCCCCTTCGGCTCCGAGCTGACCCTGATCAGGGC...,1760,1920,TTTAATATTTTTTTCTAGAAAACAGGTGACATTTGTATCTACGATA...,1790,1890,1813,1835,1,1111111111111111111100,20,6,3.393,-8.18,1,1,1,0,1,1,0,0,1,2.0,1,0,0,0,1,0,0,0
1,L1HS-1112536_1,11,30,-26.2,0,18,))))))))).)))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,ACCCGTGCAACCAACTACCTCA,ENST00000343455,CGGAGGCGCGGCGCAGGCTGCTGCAGGCCCAGGTGAATGGAGTAAC...,3826,3958,TGCCAAGGAAATCAGCTAAATTACTACAAGCAGGAAATACCCGTGC...,3856,3928,3864,3886,1,1111111110111111110000,17,6,3.393,-8.18,1,1,1,0,1,1,0,0,1,2.0,1,0,0,0,1,0,0,0
2,L2HS-818542_2,3,24,-23.6,0,21,.)))))).))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CCGCTATATGACCTGATGCCTT,ENST00000436639,GATTGCCAGGGCCGCCCTGTGCCCTCTGGCTCGGCGGTGGTGGGCG...,2354,2464,GCAGAACTCCTTTATGCTCTGAGAGCCATTACCCGCTATATGACCT...,2384,2434,2386,2408,1,0111111011111111111110,19,6,3.393,-8.18,0,1,0,0,1,1,1,0,1,2.0,0,0,0,0,0,0,0,0
3,L2HS-1161339_2,3,24,-23.8,0,21,)))))))...)))))))))))),TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,ATGACTATGCAACCATACCTTA,ENST00000282516,TCCGGTCGGCATTTTGTTCTGAGAGGGAGAGACGGAACGAGAGAGA...,6539,6653,AAATAAGACCCCAGCTCATGGTTAAACATGCAATGACTATGCAACC...,6569,6623,6571,6593,1,1111111000111111111111,19,6,3.393,-8.18,1,1,0,0,1,1,1,0,1,2.0,0,1,0,0,0,0,0,0
4,L2-407944_2,2,24,-24.0,0,21,))))))))))))))...)))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TACTGGGAAACCTGCTACTTCG,ENST00000340139,AGGGACCACCGGGAACAGACGGATCGGCAGGGCGGGGCGGAACGGT...,1133,1238,ACAGCAAAATTGAGTTCAACAACACAAAACAATACTGGGAAACCTG...,1163,1208,1165,1187,1,1111111111111100011110,18,6,3.393,-8.18,0,1,1,0,0,0,0,0,1,2.0,0,0,1,0,0,0,0,0


In [5]:
# Add mre_au_content: the A/T/U fraction of the mre_region string itself.
df = generate_mre_au_content_column(df)

In [10]:
# Add au_content: the A/T/U fraction of the transcript window reaching 30
# characters beyond mre_start and mre_end (clipped to the transcript bounds).
df = generate_au_content_column(df)

df.head()

Unnamed: 0,id,mrna_start,mrna_end,pred_energy,mirna_start,mirna_end,mirna_dot_bracket_5to3,mirna_sequence,mirna_accession,mre_region,enst,full_sequence_of_transcript,extended_mrna_start,extended_mrna_end,extended_mrna_sequence,clash_mrna_start,clash_mrna_end,mre_start,mre_end,label,alignment_string,pred_num_basepairs,pred_seed_basepairs,ta_log10,sps_mean,anchor_a,6mer_seed,match_8,6mer_seed_1_mismatch,compensatory_site,supplementary_site,supplementary_site_2,empty_seed,9_consecutive_match_anywhere,mirna_conservation,seed_8mer,seed_7mer_a1,seed_7mer_m8,seed_compensatory,seed_clash_2,seed_clash_3,seed_clash_4,seed_clash_5,mre_au_content,au_content,midpoint
0,0727A-1038930_1,22,45,-27.7,0,20,)))))))))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TTATACAGAACCTACTGCCTCA,ENST00000340828,AGGGCAGGCGCGGCCCCTTCGGCTCCGAGCTGACCCTGATCAGGGC...,1760,1920,TTTAATATTTTTTTCTAGAAAACAGGTGACATTTGTATCTACGATA...,1790,1890,1813,1835,1,1111111111111111111100,20,6,3.393,-8.18,1,1,1,0,1,1,0,0,1,2.0,1,0,0,0,1,0,0,0,0.590909,0.670732,1824.0
1,L1HS-1112536_1,11,30,-26.2,0,18,))))))))).)))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,ACCCGTGCAACCAACTACCTCA,ENST00000343455,CGGAGGCGCGGCGCAGGCTGCTGCAGGCCCAGGTGAATGGAGTAAC...,3826,3958,TGCCAAGGAAATCAGCTAAATTACTACAAGCAGGAAATACCCGTGC...,3856,3928,3864,3886,1,1111111110111111110000,17,6,3.393,-8.18,1,1,1,0,1,1,0,0,1,2.0,1,0,0,0,1,0,0,0,0.454545,0.634146,3875.0
2,L2HS-818542_2,3,24,-23.6,0,21,.)))))).))))))))))))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,CCGCTATATGACCTGATGCCTT,ENST00000436639,GATTGCCAGGGCCGCCCTGTGCCCTCTGGCTCGGCGGTGGTGGGCG...,2354,2464,GCAGAACTCCTTTATGCTCTGAGAGCCATTACCCGCTATATGACCT...,2384,2434,2386,2408,1,0111111011111111111110,19,6,3.393,-8.18,0,1,0,0,1,1,1,0,1,2.0,0,0,0,0,0,0,0,0,0.5,0.585366,2397.0
3,L2HS-1161339_2,3,24,-23.8,0,21,)))))))...)))))))))))),TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,ATGACTATGCAACCATACCTTA,ENST00000282516,TCCGGTCGGCATTTTGTTCTGAGAGGGAGAGACGGAACGAGAGAGA...,6539,6653,AAATAAGACCCCAGCTCATGGTTAAACATGCAATGACTATGCAACC...,6569,6623,6571,6593,1,1111111000111111111111,19,6,3.393,-8.18,1,1,0,0,1,1,1,0,1,2.0,0,1,0,0,0,0,0,0,0.636364,0.621951,6582.0
4,L2-407944_2,2,24,-24.0,0,21,))))))))))))))...)))).,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062,TACTGGGAAACCTGCTACTTCG,ENST00000340139,AGGGACCACCGGGAACAGACGGATCGGCAGGGCGGGGCGGAACGGT...,1133,1238,ACAGCAAAATTGAGTTCAACAACACAAAACAATACTGGGAAACCTG...,1163,1208,1165,1187,1,1111111111111100011110,18,6,3.393,-8.18,0,1,1,0,0,0,0,0,1,2.0,0,0,1,0,0,0,0,0,0.5,0.585366,1176.0


In [8]:
# Midpoint of the [mre_start, mre_end] interval; true division, so the
# resulting column is float.
# NOTE(review): this cell's execution count (8) is lower than that of the
# cell above it (10), and the saved df.head() output above already shows a
# "midpoint" column. The cells were run out of order; on a fresh
# Restart & Run All that output would not contain midpoint. Consider moving
# this cell above the final df.head().
df["midpoint"] = (df["mre_start"] + df["mre_end"]) / 2

In [7]:
# Export of the feature table is currently disabled, so running the notebook
# does not write results/5_features.csv. Un-comment the line below to save it.
# df.to_csv("results/5_features.csv", index=False)