In [1]:
# Switch the kernel's working directory to the parent folder (the resulting path is echoed below).
# NOTE(review): `%cd ..` is relative, so re-running this cell moves up one more level each time.
%cd ..
import xgboost as xgb
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Let pandas render up to 500 columns before truncating a displayed frame.
pd.options.display.max_columns = 500

/run/media/nazif/2F946E411BA61D49/thesis


In [2]:
def scale_columns(df, cols, scaler=None):
    """Return a copy of ``df`` in which ``cols`` have been rescaled.

    The scaler is fit on ``df[cols]`` itself and the transformed values are
    written into a copy, so the caller's frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; left untouched.
    cols : list of column labels
        Columns to rescale. An empty selection returns an unmodified copy.
    scaler : object with a ``fit_transform(X)`` method, optional
        Scaler to apply. Defaults to a fresh ``StandardScaler()``, which is
        what this function always used before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the selected columns replaced by their scaled
        values; every other column is carried over unchanged.
    """
    df_scaled = df.copy()

    # Nothing selected: skip the scaler entirely instead of handing it a
    # zero-column frame (scikit-learn scalers are expected to reject that).
    if len(cols) == 0:
        return df_scaled

    if scaler is None:
        scaler = StandardScaler()

    # Fit on the original values and overwrite only the requested columns.
    df_scaled[cols] = scaler.fit_transform(df[cols])

    return df_scaled

In [3]:
# Load the feature table; the path is relative to the current working directory.
# NOTE(review): the frame displayed further down carries an "Unnamed: 0" column, which
# usually means the CSV was written with its index — consider index_col=0 after confirming.
df = pd.read_csv("results/8_adding_feature_cols.csv")

# Columns selected from `df` when the DMatrix is built (df[cols_to_keep]).
# The order below is kept exactly as originally listed.
cols_to_keep = (
    # pred_* columns plus ta_log10 / sps_mean
    [
        "pred_energy",
        "pred_num_basepairs",
        "pred_seed_basepairs",
        "ta_log10",
        "sps_mean",
    ]
    # site / match indicator columns
    + [
        "anchor_a",
        "6mer_seed",
        "match_8",
        "6mer_seed_1_mismatch",
        "compensatory_site",
        "supplementary_site",
        "supplementary_site_2",
        "empty_seed",
        "9_consecutive_match_anywhere",
    ]
    # conservation and position columns
    + ["mirna_conservation", "midpoint", "close_proximity"]
    # seed_* columns
    + [
        "seed_8mer",
        "seed_7mer_a1",
        "seed_7mer_m8",
        "seed_compensatory",
        "seed_clash_2",
        "seed_clash_3",
        "seed_clash_4",
        "seed_clash_5",
    ]
)



In [4]:
# Feature scaling.
# Standard scaling of the three columns below via scale_columns() is currently disabled:
# cols_to_scale = ["pred_energy", "ta_log10", "sps_mean"]
# df = scale_columns(df, cols_to_scale)

# Min-max rescale "midpoint" into [0, 1], overwriting the column in place.
# NOTE(review): the scaler is fit on the very rows being scored; if the loaded model
# was trained with a scaler fit on different data, the two ranges may disagree — confirm.
minmax = MinMaxScaler(feature_range=(0, 1))
df["midpoint"] = minmax.fit_transform(df[["midpoint"]].to_numpy())

In [5]:
# Wrap the selected feature columns in the DMatrix container used for prediction.
data = xgb.DMatrix(df.loc[:, cols_to_keep])


In [6]:
# Restore the saved booster from disk, then score every row held in `data`.
model = xgb.Booster(model_file='results/model_no_au_content.xgb')
predictions = model.predict(data)



In [7]:
# Display the raw model outputs; the array repr below is abbreviated with "...".
predictions

array([0.98502743, 0.9797901 , 0.99118453, ..., 0.99796486, 0.9978313 ,
       0.99319285], dtype=float32)

In [8]:
# Store the raw model outputs next to the features they were computed from.
df["predictions"] = predictions

# Threshold at 0.5 (inclusive) to get a 0/1 label. The comparison is vectorised
# instead of looping element by element in Python; the explicit int64 matches the
# dtype pandas infers from a list of Python ints, so the column is unchanged.
df["binary_predictions"] = (predictions >= 0.5).astype("int64")

# Preview the first ten rows, including the two new columns.
df.head(10)

Unnamed: 0,id,mrna_start,mrna_end,mrna_sequence,mirna_accession,mirna_start,mirna_end,mirna_sequence,mrna_dot_bracket_5to3,mirna_dot_bracket_5to3,pred_energy,is_mutated,alignment_string,pred_num_basepairs,pred_seed_basepairs,ta_log10,sps_mean,anchor_a,6mer_seed,match_8,6mer_seed_1_mismatch,compensatory_site,supplementary_site,supplementary_site_2,empty_seed,9_consecutive_match_anywhere,mirna_conservation,midpoint,close_proximity,seed_8mer,seed_7mer_a1,seed_7mer_m8,seed_compensatory,seed_clash_2,seed_clash_3,seed_clash_4,seed_clash_5,mre_au_content,predictions,binary_predictions
0,1_809687_G_C_MIMAT0000062,0,10,GCCACGGCTGAGGAGGAGGAGTT,MIMAT0000062,2,14,TGAGGTAGTAGGTTGTATAGTT,(((((.(((.,.))).)).))).,-5.2,0,111011011100000000,8,3,3.393,-8.18,0,0,1,0,0,0,0,0,0,2.0,0.0,0,0,0,0,0,0,0,0,0,0.347826,0.985027,1
1,1_809687_G_C_MIMAT0004481,10,23,GCCACGGCTGAGGAGGAGGAGTT,MIMAT0004481,10,21,CTATACAATCTACTGTCTTTC,.((((((..(((.,.))).)))))),-5.7,0,1110111111,9,0,3.716,-3.71,0,0,0,0,0,0,1,1,0,-1.0,0.555556,0,0,0,0,0,0,0,0,0,0.347826,0.97979,1
2,1_809687_G_C_MIMAT0010195,4,20,GCCACGGCTGAGGAGGAGGAGTT,MIMAT0010195,4,19,CTGTACAGCCTCCTAGCTTTCC,.((((.((((((..(.,.)..)))))))))).,-16.1,0,1001111111111000,11,1,3.549,-6.275,1,0,0,0,1,1,0,0,1,-1.0,0.222222,0,0,0,0,0,0,0,0,1,0.347826,0.991185,1
3,1_809687_G_C_MIMAT0000063,0,10,GCCACGGCTGAGGAGGAGGAGTT,MIMAT0000063,9,22,TGAGGTAGTAGGTTGTGTGGTT,(((((((((.,.)))..)))))).,-7.8,0,111001111110,9,0,3.393,-8.18,0,0,0,0,0,0,1,1,0,2.0,0.0,0,0,0,0,0,0,0,0,0,0.347826,0.817762,1
4,1_809687_G_C_MIMAT0004482,10,23,GCCACGGCTGAGGAGGAGGAGTT,MIMAT0004482,10,22,CTATACAACCTACTGCCTTCCC,.(((((.(.(((.,.))).)))))).,-10.5,0,11101111110,9,0,3.716,-3.71,0,0,0,0,0,0,1,1,0,-1.0,0.555556,0,0,0,0,0,0,0,0,0,0.347826,0.97979,1
5,1_809687_G_C_MIMAT0000064,0,10,GCCACGGCTGAGGAGGAGGAGTT,MIMAT0000064,9,22,TGAGGTAGTAGGTTGTATGGTT,(((((((((.,.))))))..))).,-6.7,0,111111001110,9,0,3.393,-8.18,0,0,0,0,0,1,0,1,0,2.0,0.0,0,0,0,0,0,0,0,0,0,0.347826,0.931993,1
6,1_809687_G_C_MIMAT0026472,4,17,GCCACGGCTGAGGAGGAGGAGTT,MIMAT0026472,7,19,CTGTACAACCTTCTAGCTTTCC,.((((.((((((.,.)))))))))).,-12.5,0,1111111111000,10,0,3.612,-5.529,1,0,0,0,1,1,0,1,1,-1.0,0.222222,0,0,0,0,0,0,0,1,0,0.347826,0.993181,1
7,1_809687_G_C_MIMAT0000065,0,11,GCCACGGCTGAGGAGGAGGAGTT,MIMAT0000065,4,17,AGAGGTAGTAGGTTGCATAGTT,((.((.((((.,.))))..)).)).,-6.0,0,1111001101100000,8,2,3.393,-8.18,1,0,1,0,0,0,0,0,0,2.0,0.0,0,0,0,0,0,0,0,0,0,0.347826,0.9638,1
8,1_809687_G_C_MIMAT0004484,8,23,GCCACGGCTGAGGAGGAGGAGTT,MIMAT0004484,3,20,CTATACGACCTGCTGCCTTTCT,.((((((.(((.((.,.))..))).)).)))).,-11.1,0,11001110110111100,11,2,2.48,-4.871,0,0,0,0,0,0,0,0,0,-1.0,0.444444,0,0,0,0,0,0,0,0,1,0.347826,0.994785,1
9,1_809687_G_C_MIMAT0000066,2,10,GCCACGGCTGAGGAGGAGGAGTT,MIMAT0000066,9,17,TGAGGTAGGAGGTTGTATAGTT,.((((((.,.)))))).,-4.9,0,11111100000,6,0,3.393,-8.18,0,0,0,0,0,1,0,1,0,2.0,0.111111,0,0,0,0,0,0,0,0,0,0.347826,0.965578,1


In [9]:
# Show every row whose id begins with this prefix.
df.loc[df["id"].str.startswith("1_809687_G_C_MIMAT0000066")]

Unnamed: 0,id,mrna_start,mrna_end,mrna_sequence,mirna_accession,mirna_start,mirna_end,mirna_sequence,mrna_dot_bracket_5to3,mirna_dot_bracket_5to3,pred_energy,is_mutated,alignment_string,pred_num_basepairs,pred_seed_basepairs,ta_log10,sps_mean,anchor_a,6mer_seed,match_8,6mer_seed_1_mismatch,compensatory_site,supplementary_site,supplementary_site_2,empty_seed,9_consecutive_match_anywhere,mirna_conservation,midpoint,close_proximity,seed_8mer,seed_7mer_a1,seed_7mer_m8,seed_compensatory,seed_clash_2,seed_clash_3,seed_clash_4,seed_clash_5,mre_au_content,predictions,binary_predictions
9,1_809687_G_C_MIMAT0000066,2,10,GCCACGGCTGAGGAGGAGGAGTT,MIMAT0000066,9,17,TGAGGTAGGAGGTTGTATAGTT,.((((((.,.)))))).,-4.9,0,11111100000,6,0,3.393,-8.18,0,0,0,0,0,1,0,1,0,2.0,0.111111,0,0,0,0,0,0,0,0,0,0.347826,0.965578,1
5321,1_809687_G_C_MIMAT0000066_mutated,2,13,GCCACGGCTGACGAGGAGGAGTT,MIMAT0000066,6,17,TGAGGTAGGAGGTTGTATAGTT,.((((((..(.,.)..)))))).,-7.5,1,10011111100000,7,0,3.393,-8.18,0,0,1,0,0,1,0,0,0,2.0,0.111111,0,0,0,0,0,0,0,0,0,0.347826,0.989621,1


In [10]:
# NOTE(review): bare attribute access — it only displays the indexer object and selects
# nothing; looks like a leftover from exploration and is a candidate for removal.
df.loc

<pandas.core.indexing._LocIndexer at 0x7fd6973b4690>