In [1]:
%cd ../..

import pandas as pd
import xgboost as xgb

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler


/run/media/nazif/2F946E411BA61D49/thesis


In [2]:
def scale_columns(df, cols):
    """Return a copy of ``df`` in which ``cols`` have been standardized.

    A fresh ``StandardScaler`` is fitted on ``df[cols]`` and the transformed
    values replace those columns in the copy. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to scale.
    cols : list
        Labels of the columns to standardize.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the selected columns replaced by scaled values.
    """
    standardized = StandardScaler().fit_transform(df[cols])

    result = df.copy()
    result[cols] = standardized
    return result


def report_feature_importances(model, X_train):
    """Tabulate the importances of a fitted model, most important first.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``feature_importances_``.
    X_train : pandas.DataFrame
        Frame whose column labels name the features, in the same order as
        ``model.feature_importances_``.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``importance``, sorted by ``importance`` in
        descending order. The original positional index is kept.
    """
    table = pd.DataFrame({
        "feature": X_train.columns,
        "importance": model.feature_importances_,
    })
    return table.sort_values(by="importance", ascending=False)

In [3]:
# Load the input table; the path is relative to the working directory set by
# the `%cd` in the first cell.
raw_df = pd.read_csv("results/final.csv")

In [4]:
# Columns removed from the raw table before modelling.
cols_to_drop = [
    "mrna_start", "mrna_end",
    "mirna_start", "mirna_end",
    "mirna_dot_bracket_5to3", "mirna_sequence", "mirna_accession",
    "mre_region", "enst", "alignment_string",
]

df = raw_df.drop(columns=cols_to_drop)

# Separate the target column from the predictors.
y = df["label"]
X = df.drop(columns="label")

# Stratified 80/20 split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42)

# Baseline: XGBoost with default parameters, scored on the held-out split.
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
score = model.score(X_test, y_test)

print(score)

0.9832428238944918


# Scaling columns

In [5]:
cols_to_scale = ["pred_energy", "ta_log10",
                 "ta_percentile_rank", "sps_mean", "sps_mean_percentile_rank"]

# Kept so that any later cell reading `df` still sees the scaled frame.
df = scale_columns(df, cols_to_scale)

# The train/test frames were split off before scaling, so rebinding `df` alone
# never reaches the data the model is fitted on. Scale the splits themselves.
# The scaler is fitted on the training split only, so no statistics of the
# test split leak into training.
scaler = StandardScaler().fit(X_train[cols_to_scale])

X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled[cols_to_scale] = scaler.transform(X_train[cols_to_scale])
X_test_scaled[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

# NOTE(review): tree ensembles split on thresholds, so standardization is not
# expected to move the score much — confirm against the re-run output.
model.fit(X_train_scaled, y_train)
score = model.score(X_test_scaled, y_test)

print(score)

0.9832428238944918


In [6]:
# Show each feature with its importance from `model`, most important first.
report_feature_importances(model, X_train)

Unnamed: 0,feature,importance
0,pred_energy,0.7482
6,ta_log10,0.071819
8,sps_mean,0.064458
4,seed_match,0.032977
1,pred_num_basepairs,0.026946
2,pred_seed_basepairs,0.026356
5,seed_1_mismatch,0.010714
3,anchor_a,0.009916
7,ta_percentile_rank,0.008613
9,sps_mean_percentile_rank,0.0


# GridSearchCV


In [7]:
# Candidate values tried for each hyperparameter (3 x 3 x 3 combinations).
param_grid = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.01, 0.1, 0.5],
    n_estimators=[100, 500, 1000],
)

# Exhaustive search with 5-fold cross-validation on the training split,
# using all available cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    n_jobs=-1,
    cv=5,
)
grid_search.fit(X_train, y_train)

# `best_score_` is the mean cross-validated score of the best combination.
print("Best hyperparameters: ", grid_search.best_params_)
print("Accuracy: {:.2f}%".format(grid_search.best_score_ * 100))

Best hyperparameters:  {'learning_rate': 0.5, 'max_depth': 3, 'n_estimators': 100}
Accuracy: 98.34%


In [8]:
# Train a fresh classifier with the hyperparameters selected by the grid
# search, then score it on the held-out test split.
model_after_cv = xgb.XGBClassifier(
    **grid_search.best_params_).fit(X_train, y_train)

score_after_cv = model_after_cv.score(X_test, y_test)
print(f"accuracy after cv: {score_after_cv}")

accuracy after cv: 0.9841737781225757
