In [1]:
%cd ..

import pandas as pd
pd.set_option('display.max_columns', None)
import xgboost as xgb
import numpy as np



from scripts.utils_latest import *

/run/media/nazif/2F946E411BA61D49/thesis


In [3]:
def generate_positions_from_id(vcf_df):
    vcf_df['chr'] = vcf_df['id'].str.split('_').str[0]

    vcf_df['start_coordinate'] = vcf_df['id'].str.split('_').str[1].astype(int) - 30 + vcf_df["mrna_start"]
    vcf_df['end_coordinate'] = vcf_df['id'].str.split('_').str[1].astype(int) - 30 + vcf_df["mrna_end"]
    
    return vcf_df

def generate_alignment_string_from_dot_bracket(df):
    full_strings = []
    for _, row in df.iterrows():
        start_string = (row.mirna_start) * "0"
        mid_string = row["mirna_dot_bracket_5to3"].replace(".", "0").replace(")", "1")
        end_string = (len(row.mirna_sequence) - row.mirna_end -1) * "0"
        
        full_string = start_string + mid_string + end_string
        full_strings.append(full_string)

    df["alignment_string"] = full_strings

    return df

def generate_match_count_columns(df):

    def count_ones(str, seed=False):
        return str[1:7].count("1") if seed else str.count("1")

    df["pred_num_basepairs"] = df["alignment_string"].apply(count_ones)

    df["pred_seed_basepairs"] = df["alignment_string"].apply(
        count_ones, seed=True)

    return df

def generate_ta_sps_columns(df):
    # Generate temporary seed column
    df["seed"] = df["mirna_sequence"].str[1:8].str.replace("T", "U")
    # Read ta sps data
    ta_sps_df = pd.read_csv("data/processed/ta_sps/ta_sps.csv", usecols=["seed_8mer", "ta_log10", "sps_mean"])
    ta_sps_df = ta_sps_df.rename(columns={"seed_8mer": "seed"})
    # Merge dataframes on seed column
    df = df.merge(ta_sps_df, on="seed", how="left")
    # Drop temporary column
    df.drop(columns=["seed"], inplace=True)

    return df

def generate_mre_sequence_for_vcf(vcf_df):

    def slice_column(row):
        return row["mrna_sequence"][row["mre_start"]:row["mre_end"]]
    
    # getting mirna length
    vcf_df["mirna_length"] = vcf_df["mirna_sequence"].str.len()

    # using mirna length to figure out mre coordinates
    vcf_df["mre_end"] = vcf_df["mrna_end"] + vcf_df["mirna_start"]
    vcf_df["mre_start"] = vcf_df["mre_end"] - vcf_df["mirna_length"]

    # some start values might be lower than zero, so we need to adjust
    vcf_df["mre_start"] = vcf_df["mre_start"].apply(lambda x: max(x, 0))

    # creating mre sequence column
    vcf_df["mre_region"] = vcf_df.apply(slice_column, axis=1)

    # dropping temp column
    vcf_df.drop(columns=["mirna_length"], inplace=True)
    
    return vcf_df

def generate_important_sites(df):
    df["anchor_a"] = (df["mre_region"].str[-1] == "A").astype(int)
    df["6mer_seed"] = (df["alignment_string"].str[1:7].str.count("0") == 0).astype(int)
    df["match_8"] = (df["alignment_string"].str[7] == "1").astype(int)
    df["6mer_seed_1_mismatch"] = (df["alignment_string"].str[1:7].str.count("0") == 1).astype(int)
    
    df["compensatory_site"] = (df["alignment_string"].str[12:17].str.count("0") == 0).astype(int)
    
    df["supplementary_site"] = (df["alignment_string"].str[12:16].str.count("0") == 0).astype(int)
    df["supplementary_site_2"] = (df["alignment_string"].str[16:21].str.count("0") == 0).astype(int)
    df["empty_seed"] = (df["alignment_string"].str[1:8].str.count("1") == 0).astype(int)
    
    
    df["9_consecutive_match_anywhere"] = (df["alignment_string"]
                                          .str
                                          .contains("1{" + str(9) + ",}")
                                          .astype(int))
    
    return df

def generate_mirna_conservation_column(df):
    targetscan = pd.read_csv("data/processed/mirbase/mirbase22.csv")
    targetscan = targetscan.rename(columns={"accession": "mirna_accession", "conservation": "mirna_conservation"})
    targetscan = targetscan[["mirna_accession", "mirna_conservation"]]
    df = df.merge(targetscan, on="mirna_accession", how="left")
    return df

def generate_seed_type_columns(df):
    df['seed_8mer'] = ((df['anchor_a'] == 1) & (df['6mer_seed'] == 1) & (df['match_8'] == 1)).astype(int)
    df['seed_7mer_a1'] = ((df['anchor_a'] == 1) & (df['6mer_seed'] == 1) & (df['match_8'] == 0)).astype(int)
    df['seed_7mer_m8'] = ((df['anchor_a'] == 0) & (df['6mer_seed'] == 1) & (df['match_8'] == 1) & (df['supplementary_site'] == 0) & (df['supplementary_site_2'] == 0)).astype(int)
    df['seed_compensatory'] = ((df['compensatory_site'] == 1) & (df['6mer_seed_1_mismatch'] == 1) & (df['match_8'] == 1)).astype(int)

    df['seed_clash_2'] = ((df['supplementary_site'] == 1) & (df['6mer_seed'] == 1) & (df['match_8'] == 1)).astype(int)
    df['seed_clash_3'] = ((df['supplementary_site_2'] == 1) & (df['6mer_seed'] == 1) & (df['match_8'] == 1)).astype(int)
    df['seed_clash_4'] = ((df['empty_seed'] == 1) & (df['9_consecutive_match_anywhere'] == 1)).astype(int)
    df['seed_clash_5'] = ((df['pred_num_basepairs'] > 10) & (df['6mer_seed'] == 0)).astype(int)
    
    return df

def generate_mre_au_content_column(df):
    
    def calculate_au_content(sequence):
        au_count = sequence.count('A') + sequence.count('T') + sequence.count('U')
        return None if len(sequence) == 0 else au_count / len(sequence)


    df["mre_au_content"] = df['mre_region'].apply(calculate_au_content)
        
    return df

def generate_au_content_column_for_vcf(vcf_df):
    
    def calculate_au_content(sequence):
        au_count = sequence.count('A') + sequence.count('T') + sequence.count('U')
        return None if len(sequence) == 0 else au_count / len(sequence)

    vcf_df["au_content_sequence"] = vcf_df.apply(lambda x: get_nucleotides_in_interval(x['chr'], x['start_coordinate']-30, x["end_coordinate"]+30), axis=1)

    vcf_df["au_content"] = vcf_df['au_content_sequence'].apply(calculate_au_content)
    
    return vcf_df

In [4]:
df = pd.read_csv("sana/initial_results.csv")
df.head()

Unnamed: 0,id,mrna_start,mrna_end,mrna_sequence,mirna_accession,mirna_start,mirna_end,mirna_sequence,mrna_dot_bracket_5to3,mirna_dot_bracket_5to3,pred_energy,is_mutated
0,16_339439_C_T_MIMAT0000062,14,31,GGGGAGGGGGCACCCCAGCCCTCACACTCACCTGTAGCTGCCCTTT...,MIMAT0000062,0,16,TGAGGTAGTAGGTTGTATAGTT,.(((((..(((.((((.,)))))).)..))))).,-12.2,0
1,16_339439_C_T_MIMAT0004481,0,11,GGGGAGGGGGCACCCCAGCCCTCACACTCACCTGTAGCTGCCCTTT...,MIMAT0004481,11,21,CTATACAATCTACTGTCTTTC,.((((((.((.,.)).)))))),-6.7,0
2,16_339439_C_T_MIMAT0010195,0,19,GGGGAGGGGGCACCCCAGCCCTCACACTCACCTGTAGCTGCCCTTT...,MIMAT0010195,0,19,CTGTACAGCCTCCTAGCTTTCC,((..(((((((....(((.,)))....)))))))..)).,-14.8,0
3,16_339439_C_T_MIMAT0000063,20,43,GGGGAGGGGGCACCCCAGCCCTCACACTCACCTGTAGCTGCCCTTT...,MIMAT0000063,2,21,TGAGGTAGTAGGTTGTGTGGTT,.((((((..((((...((((((.,.)))))))))).)))))).,-19.9,0
4,16_339439_C_T_MIMAT0004482,0,11,GGGGAGGGGGCACCCCAGCCCTCACACTCACCTGTAGCTGCCCTTT...,MIMAT0004482,11,22,CTATACAACCTACTGCCTTCCC,(((((((.((.,.)).))))))),-15.3,0


In [5]:
df = generate_positions_from_id(df)
df = generate_alignment_string_from_dot_bracket(df)
df = generate_match_count_columns(df)
df = generate_ta_sps_columns(df)
df = generate_mre_sequence_for_vcf(df)
df = generate_important_sites(df)
df = generate_mirna_conservation_column(df)
df = generate_seed_type_columns(df)
df = generate_mre_au_content_column(df)
df = generate_au_content_column_for_vcf(df)

df.head()

Unnamed: 0,id,mrna_start,mrna_end,mrna_sequence,mirna_accession,mirna_start,mirna_end,mirna_sequence,mrna_dot_bracket_5to3,mirna_dot_bracket_5to3,pred_energy,is_mutated,chr,start_coordinate,end_coordinate,alignment_string,pred_num_basepairs,pred_seed_basepairs,ta_log10,sps_mean,mre_end,mre_start,mre_region,anchor_a,6mer_seed,match_8,6mer_seed_1_mismatch,compensatory_site,supplementary_site,supplementary_site_2,empty_seed,9_consecutive_match_anywhere,mirna_conservation,seed_8mer,seed_7mer_a1,seed_7mer_m8,seed_compensatory,seed_clash_2,seed_clash_3,seed_clash_4,seed_clash_5,mre_au_content,au_content_sequence,au_content
0,16_339439_C_T_MIMAT0000062,14,31,GGGGAGGGGGCACCCCAGCCCTCACACTCACCTGTAGCTGCCCTTT...,MIMAT0000062,0,16,TGAGGTAGTAGGTTGTATAGTT,.(((((..(((.((((.,)))))).)..))))).,-12.2,0,16,339423,339440,111111010011111000000,12,5,3.393,-8.18,31,9,GCACCCCAGCCCTCACACTCAC,0,0,1,1,0,0,0,0,0,2.0,0,0,0,0,0,0,0,1,0.318182,ACCCACATACTCGTGCGGGGAGGGGGCACCCCAGCCCTCACACTCA...,0.371795
1,16_339439_C_T_MIMAT0004481,0,11,GGGGAGGGGGCACCCCAGCCCTCACACTCACCTGTAGCTGCCCTTT...,MIMAT0004481,11,21,CTATACAATCTACTGTCTTTC,.((((((.((.,.)).)))))),-6.7,0,16,339409,339420,000000000000110111111,8,0,3.716,-3.71,22,1,GGGAGGGGGCACCCCAGCCCT,0,0,0,0,0,0,1,1,0,-1.0,0,0,0,0,0,0,0,0,0.190476,TCATACCGTTGGGCACCCACATACTCGTGCGGGGAGGGGGCACCCC...,0.347222
2,16_339439_C_T_MIMAT0010195,0,19,GGGGAGGGGGCACCCCAGCCCTCACACTCACCTGTAGCTGCCCTTT...,MIMAT0010195,0,19,CTGTACAGCCTCCTAGCTTTCC,((..(((((((....(((.,)))....)))))))..)).,-14.8,0,16,339409,339428,111000011111110011000,12,2,3.549,-6.275,19,0,GGGGAGGGGGCACCCCAGC,0,0,1,0,0,0,0,0,0,-1.0,0,0,0,0,0,0,0,1,0.157895,TCATACCGTTGGGCACCCACATACTCGTGCGGGGAGGGGGCACCCC...,0.375
3,16_339439_C_T_MIMAT0000063,20,43,GGGGAGGGGGCACCCCAGCCCTCACACTCACCTGTAGCTGCCCTTT...,MIMAT0000063,2,21,TGAGGTAGTAGGTTGTGTGGTT,.((((((..((((...((((((.,.)))))))))).)))))).,-19.9,0,16,339429,339452,000111111111101111110,16,4,3.393,-8.18,45,23,ACACTCACCTGTAGCTGCCCTT,0,0,1,0,0,0,0,0,1,2.0,0,0,0,0,0,0,0,1,0.454545,ATACTCGTGCGGGGAGGGGGCACCCCAGCCCTCACACTCACCTGTA...,0.380952
4,16_339439_C_T_MIMAT0004482,0,11,GGGGAGGGGGCACCCCAGCCCTCACACTCACCTGTAGCTGCCCTTT...,MIMAT0004482,11,22,CTATACAACCTACTGCCTTCCC,(((((((.((.,.)).))))))),-15.3,0,16,339409,339420,0000000000001101111111,9,0,3.716,-3.71,22,0,GGGGAGGGGGCACCCCAGCCCT,0,0,0,0,0,0,1,1,0,-1.0,0,0,0,0,0,0,0,0,0.181818,TCATACCGTTGGGCACCCACATACTCGTGCGGGGAGGGGGCACCCC...,0.347222


# prediction

In [16]:
cols_to_keep = [
    "pred_energy",
    "pred_num_basepairs",
    "pred_seed_basepairs",
    "ta_log10",
    "sps_mean",
    "anchor_a",
    "6mer_seed",
    "match_8",
    "6mer_seed_1_mismatch",
    "compensatory_site",
    "supplementary_site",
    "supplementary_site_2",
    "empty_seed",
    "9_consecutive_match_anywhere",
    "mirna_conservation",

    "seed_8mer",
    "seed_7mer_a1",
    "seed_7mer_m8",
    "seed_compensatory",
    "seed_clash_2",
    "seed_clash_3",
    "seed_clash_4",
    "seed_clash_5",
]

dropped_df = df[cols_to_keep]

# Convert the DataFrame into a DMatrix object
data = xgb.DMatrix(dropped_df)

# importing model and predicting
model = xgb.Booster()
model.load_model('results/model.xgb')
predictions = model.predict(data)

# adding predictions to the original df
df["prediction"] = predictions.astype(float)
df["binary_prediction"] = [1 if pred >= 0.5 else 0 for pred in predictions]


In [18]:
df.head()

Unnamed: 0,id,mrna_start,mrna_end,mrna_sequence,mirna_accession,mirna_start,mirna_end,mirna_sequence,mrna_dot_bracket_5to3,mirna_dot_bracket_5to3,pred_energy,is_mutated,chr,start_coordinate,end_coordinate,alignment_string,pred_num_basepairs,pred_seed_basepairs,ta_log10,sps_mean,mre_end,mre_start,mre_region,anchor_a,6mer_seed,match_8,6mer_seed_1_mismatch,compensatory_site,supplementary_site,supplementary_site_2,empty_seed,9_consecutive_match_anywhere,mirna_conservation,seed_8mer,seed_7mer_a1,seed_7mer_m8,seed_compensatory,seed_clash_2,seed_clash_3,seed_clash_4,seed_clash_5,mre_au_content,au_content_sequence,au_content,prediction,binary_prediction
0,16_339439_C_T_MIMAT0000062,14,31,GGGGAGGGGGCACCCCAGCCCTCACACTCACCTGTAGCTGCCCTTT...,MIMAT0000062,0,16,TGAGGTAGTAGGTTGTATAGTT,.(((((..(((.((((.,)))))).)..))))).,-12.2,0,16,339423,339440,111111010011111000000,12,5,3.393,-8.18,31,9,GCACCCCAGCCCTCACACTCAC,0,0,1,1,0,0,0,0,0,2.0,0,0,0,0,0,0,0,1,0.318182,ACCCACATACTCGTGCGGGGAGGGGGCACCCCAGCCCTCACACTCA...,0.371795,0.722705,1
1,16_339439_C_T_MIMAT0004481,0,11,GGGGAGGGGGCACCCCAGCCCTCACACTCACCTGTAGCTGCCCTTT...,MIMAT0004481,11,21,CTATACAATCTACTGTCTTTC,.((((((.((.,.)).)))))),-6.7,0,16,339409,339420,000000000000110111111,8,0,3.716,-3.71,22,1,GGGAGGGGGCACCCCAGCCCT,0,0,0,0,0,0,1,1,0,-1.0,0,0,0,0,0,0,0,0,0.190476,TCATACCGTTGGGCACCCACATACTCGTGCGGGGAGGGGGCACCCC...,0.347222,0.005295,0
2,16_339439_C_T_MIMAT0010195,0,19,GGGGAGGGGGCACCCCAGCCCTCACACTCACCTGTAGCTGCCCTTT...,MIMAT0010195,0,19,CTGTACAGCCTCCTAGCTTTCC,((..(((((((....(((.,)))....)))))))..)).,-14.8,0,16,339409,339428,111000011111110011000,12,2,3.549,-6.275,19,0,GGGGAGGGGGCACCCCAGC,0,0,1,0,0,0,0,0,0,-1.0,0,0,0,0,0,0,0,1,0.157895,TCATACCGTTGGGCACCCACATACTCGTGCGGGGAGGGGGCACCCC...,0.375,0.069397,0
3,16_339439_C_T_MIMAT0000063,20,43,GGGGAGGGGGCACCCCAGCCCTCACACTCACCTGTAGCTGCCCTTT...,MIMAT0000063,2,21,TGAGGTAGTAGGTTGTGTGGTT,.((((((..((((...((((((.,.)))))))))).)))))).,-19.9,0,16,339429,339452,000111111111101111110,16,4,3.393,-8.18,45,23,ACACTCACCTGTAGCTGCCCTT,0,0,1,0,0,0,0,0,1,2.0,0,0,0,0,0,0,0,1,0.454545,ATACTCGTGCGGGGAGGGGGCACCCCAGCCCTCACACTCACCTGTA...,0.380952,0.905918,1
4,16_339439_C_T_MIMAT0004482,0,11,GGGGAGGGGGCACCCCAGCCCTCACACTCACCTGTAGCTGCCCTTT...,MIMAT0004482,11,22,CTATACAACCTACTGCCTTCCC,(((((((.((.,.)).))))))),-15.3,0,16,339409,339420,0000000000001101111111,9,0,3.716,-3.71,22,0,GGGGAGGGGGCACCCCAGCCCT,0,0,0,0,0,0,1,1,0,-1.0,0,0,0,0,0,0,0,0,0.181818,TCATACCGTTGGGCACCCACATACTCGTGCGGGGAGGGGGCACCCC...,0.347222,0.082955,0


In [11]:
dropped_df.dtypes

pred_energy                     float64
pred_num_basepairs                int64
pred_seed_basepairs               int64
ta_log10                        float64
sps_mean                        float64
anchor_a                          int64
6mer_seed                         int64
match_8                           int64
6mer_seed_1_mismatch              int64
compensatory_site                 int64
supplementary_site                int64
supplementary_site_2              int64
empty_seed                        int64
9_consecutive_match_anywhere      int64
mirna_conservation              float64
seed_8mer                         int64
seed_7mer_a1                      int64
seed_7mer_m8                      int64
seed_compensatory                 int64
seed_clash_2                      int64
seed_clash_3                      int64
seed_clash_4                      int64
seed_clash_5                      int64
dtype: object

In [14]:
firstrow = df.iloc[[0]]



In [17]:
model.predict(xgb.DMatrix(dropped_df.iloc[[0]]))

array([0.7227053], dtype=float32)

In [11]:
# split df by half
middle_row_index = len(df) // 2

# Split the DataFrame into two halves
wt = df.iloc[:middle_row_index].reset_index(drop=True)
mut = df.iloc[middle_row_index:].reset_index(drop=True)

results = pd.DataFrame()
results = (results
           .assign(id=wt.id)
           .assign(wt=wt.prediction)
           .assign(mut=mut.prediction)
           .assign(wt_binary=wt.binary_prediction)
           .assign(mut_binary=mut.binary_prediction)
           .assign(is_changed = results.wt_binary != results.mut_binary)
           .assign(difference=wt.prediction - mut.prediction)
           )


AttributeError: 'DataFrame' object has no attribute 'wt_binary'

In [20]:
results[results.difference != 0]

Unnamed: 0,id,wt,mut,wt_binary,mut_binary,is_changed,difference
3,16_339439_C_T_MIMAT0000063,0.905918,0.975883,1,1,False,-0.069965
5,16_339439_C_T_MIMAT0000064,0.849435,0.915929,1,1,False,-0.066494
7,16_339439_C_T_MIMAT0000065,0.541017,0.249389,1,0,True,0.291628
9,16_339439_C_T_MIMAT0000066,0.695119,0.697113,1,1,False,-0.001993
11,16_339439_C_T_MIMAT0000067,0.139947,0.158331,0,0,False,-0.018384
...,...,...,...,...,...,...,...
5300,16_339608_C_T_MIMAT0049018,0.129047,0.169671,0,0,False,-0.040624
5301,16_339608_C_T_MIMAT0049019,0.095555,0.066715,0,0,False,0.028840
5302,16_339608_C_T_MIMAT0049020,0.772211,0.388436,1,0,True,0.383775
5303,16_339608_C_T_MIMAT0049021,0.906580,0.978821,1,1,False,-0.072241


In [11]:
wt.prediction - mut.prediction

0       0.000000
1       0.000000
2       0.000000
3      -0.069965
4       0.000000
          ...   
5307    0.000000
5308    0.000000
5309    0.000000
5310    0.000000
5311    0.000000
Name: prediction, Length: 5312, dtype: float64

In [None]:
# Step 1: Group by 'id' and check for different 'prediction' values
grouped = df.groupby('id')['binary_prediction'].nunique()

# Step 2: Filter groups where 'prediction' values are different
ids_with_different_predictions = grouped[grouped > 1].index.tolist()

df['is_affected'] = 0  # Initialize the new column with zeros
df.loc[df['id'].isin(ids_with_different_predictions), 'is_affected'] = 1


df[df.is_affected == 1]

In [None]:
df.head()

In [None]:

half1, half2 = np.array_split(df.binary_prediction, 2)
binary_half1, binary_half2 =  np.array_split(df.binary_prediction, 2)

half1 = half1.r

result_df = pd.DataFrame({'col1': half1, 'col2': half2, 'col3': binary_half1, 'col4': binary_half2})

result_df


In [None]:
prediction_series = df.prediction
binary_prediction_series = df.binary_prediction

halved_predictions = np.array_split(prediction_series, 2)
halved_binary_prediction_series = np.array_split(binary_prediction_series, 2)

result_df = pd.DataFrame({'col1': halved_predictions[0], 'col2': halved_predictions[1], 'col3': halved_binary_prediction_series[0], 'col4': halved_binary_prediction_series[1]})

result_df

import pandas as pd

# Assuming 'series' is your long pandas series
series = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

# Split the series into two halves
half1, half2 = series[:len(series)//2], series[len(series)//2:]

# Print the two halves
print("First half:", half1)
print("Second half:", half2)



In [None]:
df[df.is_affected == 1].prediction

In [None]:
df = df.sort_values(by=["id", "is_mutated"])

# Create a new column to store the changes in prediction
df['change'] = 0

# Initialize variables to keep track of the previous 'id' and 'prediction' values
prev_id = None
prev_prediction = None

# Iterate through the DataFrame row by row
for i, row in df.iterrows():

    if i % 2 == 1:
        # Get the current 'id' and 'prediction' values
        current_id = row['id']
        current_prediction = row['prediction']

        # Check if it's not the first row (i.e., prev_id and prev_prediction are not None)
        if prev_id is not None and prev_prediction is not None:
            # Check if the 'prediction' value decreased from the previous row
            if current_prediction < prev_prediction:
                df.at[i, 'change'] = -1
            # Check if the 'prediction' value increased from the previous row
            elif current_prediction > prev_prediction:
                df.at[i, 'change'] = 1

        # Update the previous row values
        prev_id = current_id
        prev_prediction = current_prediction

# Display the updated DataFrame with the 'change' column


df.head()

In [None]:
result_dict = {}

# Group the DataFrame by 'id' column
grouped = df.groupby('id')

# Iterate over each group and convert it into a dictionary
for group_name, group_df in grouped:
    result_dict[group_name] = group_df['prediction'].tolist()


for key, value in result_dict.items():
    if value == [0,0]:
        result_dict[key] = "unchanged_0"
        
    elif value == [1,1]:
        result_dict[key] = "unchanged_1"
    
    elif value == [0,1]:
        result_dict[key] = "up"
    
    elif value == [1,0]:
        result_dict[key] = "down"
        
dff = pd.DataFrame.from_dict(result_dict, orient='index', columns=['prediction'])

dff[dff.prediction == "up"]

In [None]:
dff.prediction.value_counts()
