# miRNA ML for gliomas
Jan, 2025
Biniam



> 1 - Loading the data

> 2 - Modelling for Random Forest

> 3 - Modelling for KNN

> 4 - Modelling for XGB

Load processed Data 

In [None]:
# read miRNA data
import pandas as pd
import numpy as np
# Load the scaled miRNA matrix; the first (unnamed) CSV column becomes the
# 'pid' index.
miRNA_data_corrected = (
    pd.read_csv('combined_miRNAs_scaled.csv')
    .rename(columns={'Unnamed: 0': 'pid'})
    .set_index('pid')
)

# The last 17 rows form the external validation batch; everything before them
# is the training pool. `.copy()` makes each subset an independent frame, so
# the column assignment and drops below never act on a slice of the parent
# frame (avoids SettingWithCopyWarning and writes that may not stick).
validation_miRNA_data = miRNA_data_corrected.iloc[-17:, :].copy()
trainging_miRNA_data = miRNA_data_corrected.iloc[:-17, :].copy()

# Label from the sample id: ids containing 'H' -> 0, every other id -> 1
# (later cells display these classes as 'Healthy' and 'GBM').
trainging_miRNA_data['target'] = np.where(trainging_miRNA_data.index.str.contains('H'), 0, 1)

# Persist both subsets while they still contain the 'dataset' column.
trainging_miRNA_data.to_csv('miRNA_trainging_data.csv', index=True)
validation_miRNA_data.to_csv('miRNA_validation_data.csv', index=True)

# Drop 'dataset' so it is not used as a model feature
# (presumably a batch/source label - TODO confirm).
validation_miRNA_data = validation_miRNA_data.drop(columns=['dataset'])
trainging_miRNA_data = trainging_miRNA_data.drop(columns=['dataset'])

In [None]:
# Label vector first, so the Healthy (0) vs GBM (1) counts can be printed from it;
# the feature matrix is everything except the label column.
y = trainging_miRNA_data['target']
print(y.value_counts())
X = trainging_miRNA_data.drop(columns=['target'])

RandomForest Feature Selection Process

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, train_test_split
import numpy as np
import matplotlib.pyplot as plt 


# Stratified 80/20 hold-out split of the labelled samples.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Shuffled, stratified 5-fold CV used to score every candidate feature count.
cv_split = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Random Forest whose feature importances drive the elimination.
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Recursive feature elimination with CV: remove one feature per step down to a
# single feature, scoring each subset by ROC AUC.
rf_rfecv = RFECV(
    rf,
    step=1,
    min_features_to_select=1,
    cv=cv_split,
    scoring="roc_auc",
    n_jobs=-1,
).fit(X_train, y_train)

print(
    f"Optimal number of features: {rf_rfecv.n_features_}",
    f"Optimal features: {list(rf_rfecv.get_feature_names_out())}",
    sep="\n",
)

# Bare expression in the middle of a cell: evaluated but not displayed.
rf_rfecv.cv_results_

def plot_features_vs_cvscore_95ci(rfecv_model, cv_folds=5, annotate_at=10):
    """Plot the mean cross-validated ROC AUC against the number of selected features.

    The curve is drawn with a shaded band of
    ``mean +/- 1.96 * std / sqrt(cv_folds)`` (normal-approximation 95% interval,
    clipped to [0, 1]). The best-scoring feature count is marked in red and,
    optionally, one additional feature count is marked in green.

    Parameters
    ----------
    rfecv_model : object with a ``cv_results_`` mapping
        The mapping must contain ``"mean_test_score"`` and ``"std_test_score"``.
        If it also contains ``"n_features"``, that entry supplies the x
        positions; otherwise the scores are taken to belong to 1, 2, ..., n
        features (true for ``step=1`` and ``min_features_to_select=1``).
    cv_folds : int, default=5
        Number of CV folds behind each standard deviation; used to turn the
        standard deviation into a standard error.
    annotate_at : int or None, default=10
        Feature count to highlight in green. Ignored when ``None`` or when that
        feature count was not evaluated.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    results = rfecv_model.cv_results_
    # Coerce to arrays so the arithmetic below also works for plain lists.
    mean_scores = np.asarray(results["mean_test_score"], dtype=float)
    std_scores = np.asarray(results["std_test_score"], dtype=float)
    n_points = len(mean_scores)
    if n_points == 0:
        raise ValueError("cv_results_ contains no scores to plot")

    # Real feature counts when the model reports them; the original 1..n
    # assumption otherwise.
    if "n_features" in results:
        feature_counts = np.asarray(results["n_features"])
    else:
        feature_counts = np.arange(1, n_points + 1)
    max_features = int(feature_counts.max())

    # Calculate 95% CI half-width from the standard error of the fold scores.
    ci = 1.96 * (std_scores / np.sqrt(cv_folds))

    # Plotting
    plt.figure(figsize=(12, 8))
    plt.plot(feature_counts, mean_scores, marker='o', label='Mean ROC AUC')
    plt.fill_between(feature_counts,
                     np.clip(mean_scores - ci, 0, 1),
                     np.clip(mean_scores + ci, 0, 1),
                     color='lightblue',
                     alpha=0.4,
                     label='95% Confidence Interval')

    # Annotate best feature point (first maximum of the mean score).
    best_idx = int(np.argmax(mean_scores))
    best_score = mean_scores[best_idx]
    best_n_features = int(feature_counts[best_idx])
    plt.axvline(best_n_features, color='red', linestyle='--', label=f'Best = {best_n_features} features')
    plt.scatter(best_n_features, best_score, color='red')
    plt.text(best_n_features + 10, best_score,
             f'Best = {best_n_features}\nROC AUC = {best_score:.3f}',
             color='black', fontsize=10, bbox=dict(facecolor='white', alpha=0.6))

    # Annotate the requested reference feature count, if it was evaluated.
    if annotate_at is not None:
        hits = np.flatnonzero(feature_counts == annotate_at)
        if hits.size:
            auc_at_ref = mean_scores[hits[0]]
            plt.scatter(annotate_at, auc_at_ref, color='green')
            plt.text(annotate_at + 10, auc_at_ref,
                     f'{annotate_at} features\nROC AUC = {auc_at_ref:.3f}',
                     color='darkgreen', fontsize=8, bbox=dict(facecolor='white', alpha=0.6))

    plt.xlabel("Number of Features Selected", fontsize=12)
    plt.ylabel("Mean CV ROC_AUC", fontsize=12)
    plt.title("ROC_AUC vs Number of Features (with 95% CI)", fontsize=14)

    # Keep the 50-wide tick spacing for large feature sets; fall back to about
    # ten ticks when there are fewer than 100 features, so the axis is never
    # left with a single tick at 0.
    tick_step = 50 if max_features >= 100 else max(1, int(np.ceil(max_features / 10)))
    plt.xticks(np.arange(0, max_features + 1, step=tick_step))
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()
# Plot the RFECV curve of the Random Forest selector; cv_folds matches the
# 5 splits of `cv_split` used to fit `rf_rfecv`.
plot_features_vs_cvscore_95ci(rf_rfecv, cv_folds=5)

2a. RandomForest with Feature Selection

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report, roc_curve, auc
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, train_test_split
from sklearn.feature_selection import RFE

# Containers for the summary metrics (mean and standard deviation per metric)
# and for the two tuned hyperparameters.
AUC = []
AUC_sd = []
AC = []
AC_sd = []
Pre = []
Pre_sd = []
Re = []
Re_sd = []
f1 = []
f1_sd = []
Best1 = []
Best2 = []

# Replace the existing index of X and y with a fresh 0..n-1 RangeIndex.
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

# Stratified 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Recursive feature elimination driven by a Random Forest: keep 10 features.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
selector = RFE(rf, n_features_to_select=10).fit(X_train, y_train)

# Names of the surviving columns.
top_10_features_rf = X_train.columns[selector.support_]
print("Top 10 Selected Features:", top_10_features_rf)

# Restrict both splits to the selected features.
rf_train_selected = X_train[top_10_features_rf]
rf_test_selected = X_test[top_10_features_rf]


# Grid search (5-fold CV, accuracy) over forest size and tree depth.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [1, 2, 3, 5, 10, 20, None],
}
gs = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='accuracy',
    cv=5,
    refit=True,
    n_jobs=-1,
).fit(rf_train_selected, y_train)

# Winning hyperparameter combination.
best_params = gs.best_params_
print(f'Best Parameters: {best_params}')

# Final model: a new forest with the winning settings, fitted on the full
# training split.
model_rf = RandomForestClassifier(random_state=42, **best_params).fit(rf_train_selected, y_train)

# Hold-out predictions.
test_pred = model_rf.predict(rf_test_selected)

# Confusion matrix on the hold-out set, saved as PNG and SVG.
print("\nConfusion matrix:")
confusion_mat = confusion_matrix(y_test, test_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_mat, display_labels=['Healthy', 'GBM'])
disp.plot()
plt.savefig("RF_bestModel_ConfusionMatrix_miRNA_HG.png", format="png", dpi=600)
plt.savefig("RF_bestModel_ConfusionMatrix_miRNA_HG.svg", format="svg", dpi=600)
plt.show()

# Per-class precision / recall / F1 on the hold-out set.
print(classification_report(y_test, test_pred, target_names=['Healthy', 'GBM']))

# ROC/AUC Calculation for Cross-Validation (5 stratified folds of the training split).
#
# A fresh classifier is fitted inside every fold. Refitting `model_rf` itself
# here would leave it trained on only the last fold's training subset, and
# `model_rf` is the object later used to predict the external validation set.
cv = list(StratifiedKFold(n_splits=5).split(rf_train_selected, y_train))
fig = plt.figure(figsize=(7, 5))
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)
sd_list = []

for i, (train_idx, test_idx) in enumerate(cv):
    fold_model = RandomForestClassifier(**best_params, random_state=42)
    fold_model.fit(rf_train_selected.iloc[train_idx], y_train.iloc[train_idx])
    probas = fold_model.predict_proba(rf_train_selected.iloc[test_idx])
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], probas[:, 1], pos_label=1)
    # Interpolate onto the common FPR grid and pin the curve to (0, 0), as the
    # KNN and XGBoost cells of this notebook do.
    fold_tpr = np.interp(mean_fpr, fpr, tpr)
    fold_tpr[0] = 0.0
    mean_tpr += fold_tpr
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'ROC fold {i+1} (area = {roc_auc:.2f})')
    sd_list.append(roc_auc)

plt.plot([0, 1], [0, 1], linestyle='--', color=(0.6, 0.6, 0.6), label='Random guessing')

# Average the interpolated curves and pin the end of the mean curve to (1, 1).
mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc, mean_sd = auc(mean_fpr, mean_tpr), np.std(sd_list)

print(f'CV AUC: {mean_auc:.3f} +/- {mean_sd:.3f}')

plt.plot(mean_fpr, mean_tpr, 'k--', label=f'Mean ROC (area = {mean_auc:.2f})', lw=2)
plt.plot([0, 0, 1], [0, 1, 1], linestyle=':', color='black', label='Perfect performance')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc="lower right")
plt.tight_layout()
plt.savefig("RF_bestModel_AUC_miRNA_HG.png", format="png", dpi=600)
# File name corrected from "..._niRNA_..." so the SVG matches the PNG.
plt.savefig("RF_bestModel_AUC_miRNA_HG.svg", format="svg", dpi=600)
plt.show()

AUC.append(mean_auc)
AUC_sd.append(mean_sd)

# Nested Cross-Validation with Multiple Metrics.
# The grid search `gs` is the estimator of an outer 5-fold CV. One
# `cross_validate` call scores every outer fold with all four metrics, so the
# nested search runs once instead of once per metric.
# NOTE: the 10 features were selected on the whole training split before this
# CV, so these estimates do not include the feature-selection step.
from sklearn.model_selection import cross_validate

nested_scores = cross_validate(
    gs, rf_train_selected, y_train,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
    cv=5,
)
scores1 = nested_scores['test_accuracy']
scores2 = nested_scores['test_precision']
scores3 = nested_scores['test_recall']
scores4 = nested_scores['test_f1']

print(f'cv accuracy: {np.mean(scores1):.3f} +/- {np.std(scores1):.3f}')
print(f'cv precision: {np.mean(scores2):.3f} +/- {np.std(scores2):.3f}')
print(f'cv recall: {np.mean(scores3):.3f} +/- {np.std(scores3):.3f}')
print(f'cv f1: {np.mean(scores4):.3f} +/- {np.std(scores4):.3f}')

# Record mean / standard deviation per metric and the tuned hyperparameters.
AC.append(np.mean(scores1))
AC_sd.append(np.std(scores1))
Pre.append(np.mean(scores2))
Pre_sd.append(np.std(scores2))
Re.append(np.mean(scores3))
Re_sd.append(np.std(scores3))
f1.append(np.mean(scores4))
f1_sd.append(np.std(scores4))
Best1.append(best_params['n_estimators'])
Best2.append(best_params['max_depth'])


Predict on the external validation set (batch-corrected miRNA)

In [None]:
# Predict the external validation batch with the Random Forest model.
# `.copy()` yields an independent frame, so adding the prediction column does
# not raise SettingWithCopyWarning and never touches `validation_miRNA_data`.
validation_miRNA_data_rf = validation_miRNA_data[top_10_features_rf].copy()
validation_pred = model_rf.predict(validation_miRNA_data_rf)
# Store the predictions next to the features and write them to CSV.
validation_miRNA_data_rf['predicted'] = validation_pred
validation_miRNA_data_rf.to_csv('miRNA_validation_predictions.csv', index=True)

3a. KNN model training using top-10 features

In [None]:
# Recursive Feature Elimination
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report, roc_curve, auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, train_test_split
from sklearn.feature_selection import RFE

# Fresh, independent result lists (metric means / standard deviations and the
# tuned hyperparameters).
AUC, AUC_sd, AC, AC_sd, Pre, Pre_sd, Re, Re_sd, f1, f1_sd, Best1, Best2 = ([] for _ in range(12))

# Replace the existing index of X and y with a fresh 0..n-1 RangeIndex.
X, y = X.reset_index(drop=True), y.reset_index(drop=True)

# Stratified 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Feature ranking still comes from a Random Forest (RFE down to 10 features);
# only the final classifier in this cell is KNN.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
selector = RFE(rf, n_features_to_select=10).fit(X_train, y_train)

# Names of the surviving columns.
top_10_features = X_train.columns[selector.support_]
print("Top 10 Selected Features:", top_10_features)

# Restrict both splits to the selected features.
knn_train_selected, knn_test_selected = X_train[top_10_features], X_test[top_10_features]


# Grid search (5-fold CV, accuracy) over the neighbourhood size.
param_range1 = [2, 3, 4, 5, 7, 9]
gs = GridSearchCV(
    KNeighborsClassifier(),
    [{'n_neighbors': param_range1, 'metric': ['minkowski']}],
    scoring='accuracy',
    cv=5,
    refit=True,
    n_jobs=-1,
).fit(knn_train_selected, y_train)

# Winning neighbourhood size.
best_n_neighbors = gs.best_params_['n_neighbors']
print(f'Best n_neighbors: {best_n_neighbors}')

# Final model: a new KNN with the winning k, fitted on the full training split.
model_knn = KNeighborsClassifier(n_neighbors=best_n_neighbors)
model_knn.fit(knn_train_selected, y_train)

# Hold-out predictions.
test_pred = model_knn.predict(knn_test_selected)

# Confusion matrix on the hold-out set.
print("\nConfusion matrix:")
confusion_mat = confusion_matrix(y_test, test_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_mat, display_labels=['Healthy', 'GBM'])
disp.plot()
plt.show()

# Per-class precision / recall / F1 on the hold-out set.
print(classification_report(y_test, test_pred, target_names=['Healthy', 'GBM']))

# ROC/AUC Calculation for Cross-Validation (5 stratified folds of the training split).
#
# A fresh classifier is fitted inside every fold. Refitting `model_knn` itself
# here would leave it trained on only the last fold's training subset, and
# `model_knn` is the object later used to predict the external validation set.
cv = list(StratifiedKFold(n_splits=5).split(knn_train_selected, y_train))
fig = plt.figure(figsize=(7, 5))
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)

sd_list = []
for i, (train_idx, test_idx) in enumerate(cv):
    fold_model = KNeighborsClassifier(n_neighbors=best_n_neighbors)
    fold_model.fit(knn_train_selected.iloc[train_idx], y_train.iloc[train_idx])
    probas = fold_model.predict_proba(knn_train_selected.iloc[test_idx])
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], probas[:, 1], pos_label=1)
    # Interpolate onto the common FPR grid and pin the curve to (0, 0).
    fold_tpr = np.interp(mean_fpr, fpr, tpr)
    fold_tpr[0] = 0.0
    mean_tpr += fold_tpr
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'ROC fold {i+1} (area = {roc_auc:.2f})')
    sd_list.append(roc_auc)

plt.plot([0, 1], [0, 1], linestyle='--', color=(0.6, 0.6, 0.6), label='Random guessing')

# Average the interpolated curves and pin the end of the mean curve to (1, 1).
mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
mean_sd = np.std(sd_list)

print(f'CV AUC: {mean_auc:.3f} +/- {mean_sd:.3f}')

# Plot mean ROC
plt.plot(mean_fpr, mean_tpr, 'k--', label=f'Mean ROC (area = {mean_auc:.2f})', lw=2)
plt.plot([0, 0, 1], [0, 1, 1], linestyle=':', color='black', label='Perfect performance')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc="lower right")
plt.tight_layout()
plt.show()

AUC.append(mean_auc)
AUC_sd.append(mean_sd)

# Nested Cross-Validation with Multiple Metrics.
# The grid search `gs` is the estimator of an outer 5-fold CV. One
# `cross_validate` call scores every outer fold with all four metrics, so the
# nested search runs once instead of once per metric.
# NOTE: the 10 features were selected on the whole training split before this
# CV, so these estimates do not include the feature-selection step.
from sklearn.model_selection import cross_validate

nested_scores = cross_validate(
    gs, knn_train_selected, y_train,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
    cv=5,
)
scores1 = nested_scores['test_accuracy']
scores2 = nested_scores['test_precision']
scores3 = nested_scores['test_recall']
scores4 = nested_scores['test_f1']

print(f'cv accuracy: {np.mean(scores1):.3f} +/- {np.std(scores1):.3f}')
print(f'cv precision: {np.mean(scores2):.3f} +/- {np.std(scores2):.3f}')
print(f'cv recall: {np.mean(scores3):.3f} +/- {np.std(scores3):.3f}')
print(f'cv f1: {np.mean(scores4):.3f} +/- {np.std(scores4):.3f}')

# Record mean / standard deviation per metric and the tuned hyperparameter.
AC.append(np.mean(scores1))
AC_sd.append(np.std(scores1))
Pre.append(np.mean(scores2))
Pre_sd.append(np.std(scores2))
Re.append(np.mean(scores3))
Re_sd.append(np.std(scores3))
f1.append(np.mean(scores4))
f1_sd.append(np.std(scores4))
Best1.append(gs.best_params_['n_neighbors'])
# KNN has a single tuned hyperparameter, so the second slot is a placeholder.
Best2.append('NaN')


In [None]:
# Predict the external validation batch with the KNN model.
# `.copy()` yields an independent frame, so adding the prediction column does
# not raise SettingWithCopyWarning and never touches `validation_miRNA_data`.
validation_miRNA_data_knn = validation_miRNA_data[top_10_features].copy()
validation_pred_knn = model_knn.predict(validation_miRNA_data_knn)
# Store the predictions next to the features and write them to CSV.
validation_miRNA_data_knn['predicted'] = validation_pred_knn
validation_miRNA_data_knn.to_csv('miRNA_validation_predictions_knn.csv', index=True)

XGB Feature Selection

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, train_test_split
import xgboost as xgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


# Stratified 80/20 hold-out split of the labelled samples.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Shuffled, stratified 5-fold CV used to score every candidate feature count.
cv_split = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Gradient-boosted trees whose feature importances drive the elimination.
xgboost_model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
)

# Recursive feature elimination with CV: remove one feature per step down to a
# single feature, scoring each subset by ROC AUC.
xgb_rfecv = RFECV(
    xgboost_model,
    step=1,
    min_features_to_select=1,
    cv=cv_split,
    scoring="roc_auc",
    n_jobs=-1,
).fit(X_train, y_train)

print(
    f"Optimal number of features: {xgb_rfecv.n_features_}",
    f"Optimal features: {list(xgb_rfecv.get_feature_names_out())}",
    sep="\n",
)

# Bare expression in the middle of a cell: evaluated but not displayed.
xgb_rfecv.cv_results_

def plot_features_vs_cvscore_95ci(rfecv_model, cv_folds=5, annotate_at=None):
    """Plot the mean cross-validated ROC AUC against the number of selected features.

    NOTE: an earlier cell of this notebook defines a function with the same
    name; this definition replaces it from here on. By default this version
    does not highlight any extra feature count (``annotate_at=None``).

    The curve is drawn with a shaded band of
    ``mean +/- 1.96 * std / sqrt(cv_folds)`` (normal-approximation 95% interval,
    clipped to [0, 1]). The best-scoring feature count is marked in red.

    Parameters
    ----------
    rfecv_model : object with a ``cv_results_`` mapping
        The mapping must contain ``"mean_test_score"`` and ``"std_test_score"``.
        If it also contains ``"n_features"``, that entry supplies the x
        positions; otherwise the scores are taken to belong to 1, 2, ..., n
        features (true for ``step=1`` and ``min_features_to_select=1``).
    cv_folds : int, default=5
        Number of CV folds behind each standard deviation; used to turn the
        standard deviation into a standard error.
    annotate_at : int or None, default=None
        Optional feature count to highlight in green. Ignored when ``None`` or
        when that feature count was not evaluated.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    results = rfecv_model.cv_results_
    # Coerce to arrays so the arithmetic below also works for plain lists.
    mean_scores = np.asarray(results["mean_test_score"], dtype=float)
    std_scores = np.asarray(results["std_test_score"], dtype=float)
    n_points = len(mean_scores)
    if n_points == 0:
        raise ValueError("cv_results_ contains no scores to plot")

    # Real feature counts when the model reports them; the original 1..n
    # assumption otherwise.
    if "n_features" in results:
        feature_counts = np.asarray(results["n_features"])
    else:
        feature_counts = np.arange(1, n_points + 1)
    max_features = int(feature_counts.max())

    # Calculate 95% CI half-width from the standard error of the fold scores.
    ci = 1.96 * (std_scores / np.sqrt(cv_folds))

    # Plotting
    plt.figure(figsize=(12, 8))
    plt.plot(feature_counts, mean_scores, marker='o', label='Mean ROC AUC')
    plt.fill_between(feature_counts,
                     np.clip(mean_scores - ci, 0, 1),
                     np.clip(mean_scores + ci, 0, 1),
                     color='lightblue',
                     alpha=0.4,
                     label='95% Confidence Interval')

    # Annotate best feature point (first maximum of the mean score).
    best_idx = int(np.argmax(mean_scores))
    best_score = mean_scores[best_idx]
    best_n_features = int(feature_counts[best_idx])
    plt.axvline(best_n_features, color='red', linestyle='--', label=f'Best = {best_n_features} features')
    plt.scatter(best_n_features, best_score, color='red')
    plt.text(best_n_features + 10, best_score,
             f'Best = {best_n_features}\nROC AUC = {best_score:.3f}',
             color='black', fontsize=10, bbox=dict(facecolor='white', alpha=0.6))

    # Annotate the requested reference feature count, if any was requested and
    # it was evaluated.
    if annotate_at is not None:
        hits = np.flatnonzero(feature_counts == annotate_at)
        if hits.size:
            auc_at_ref = mean_scores[hits[0]]
            plt.scatter(annotate_at, auc_at_ref, color='green')
            plt.text(annotate_at + 10, auc_at_ref,
                     f'{annotate_at} features\nROC AUC = {auc_at_ref:.3f}',
                     color='darkgreen', fontsize=8, bbox=dict(facecolor='white', alpha=0.6))

    plt.xlabel("Number of Features Selected", fontsize=12)
    plt.ylabel("Mean CV ROC_AUC", fontsize=12)
    plt.title("ROC_AUC vs Number of Features (with 95% CI)", fontsize=14)

    # Keep the 50-wide tick spacing for large feature sets; fall back to about
    # ten ticks when there are fewer than 100 features, so the axis is never
    # left with a single tick at 0.
    tick_step = 50 if max_features >= 100 else max(1, int(np.ceil(max_features / 10)))
    plt.xticks(np.arange(0, max_features + 1, step=tick_step))
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()
# Plot the RFECV curve of the XGBoost selector; cv_folds matches the
# 5 splits of `cv_split` used to fit `xgb_rfecv`.
plot_features_vs_cvscore_95ci(xgb_rfecv, cv_folds=5)

4a. XGBoost model training using top 10 features

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split, cross_val_score
from sklearn.feature_selection import RFE
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report, roc_curve, auc
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder

# Fresh, independent result lists (metric means / standard deviations and the
# tuned hyperparameters).
AUC, AUC_sd, AC, AC_sd, Pre, Pre_sd, Re, Re_sd, f1, f1_sd, Best1, Best2 = ([] for _ in range(12))

# Fresh 0..n-1 RangeIndex; the features are additionally cast to float64.
X = X.reset_index(drop=True).astype('float64')
y = y.reset_index(drop=True)

# Stratified 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Base XGBoost classifier, used both for feature elimination and as the
# grid-search estimator.
# NOTE(review): 'mlogloss' is the multi-class log-loss metric while the target
# here is binary - confirm this is intended.
xgboost_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Recursive feature elimination with XGBoost: keep 10 features.
selector = RFE(xgboost_model, n_features_to_select=10).fit(X_train, y_train)

# Names of the surviving columns.
top_10_features_xgb = X_train.columns[selector.support_]
print("Top 10 Selected Features:", top_10_features_xgb)

# Restrict both splits to the selected features.
xgb_train_selected, xgb_test_selected = X_train[top_10_features_xgb], X_test[top_10_features_xgb]

# Grid search (5-fold CV, accuracy) over ensemble size, tree depth and
# learning rate.
param_range1 = [50, 100, 200]
gs = GridSearchCV(
    xgboost_model,
    {
        'n_estimators': param_range1,
        'max_depth': [1, 3, 5, 7, None],
        'learning_rate': [0.08, 0.15, 0.2],
    },
    scoring='accuracy',
    cv=5,
    refit=True,
    n_jobs=-1,
).fit(xgb_train_selected, y_train)

# Winning hyperparameter combination.
best_params = gs.best_params_
print(f'Best Hyperparameters: {best_params}')

# Final model: a new classifier with the winning settings, fitted on the full
# training split.
model_xgb = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', **best_params)
model_xgb.fit(xgb_train_selected, y_train)

# Hold-out predictions.
test_pred = model_xgb.predict(xgb_test_selected)

# Confusion matrix on the hold-out set.
print("\nConfusion matrix:")
confusion_mat = confusion_matrix(y_test, test_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_mat, display_labels=['Healthy', 'GBM'])
disp.plot()
plt.show()

# Per-class precision / recall / F1 on the hold-out set.
print(classification_report(y_test, test_pred, target_names=['Healthy', 'GBM']))

# ROC/AUC Calculation for Cross-Validation (5 stratified folds of the training split).
#
# A fresh classifier is fitted inside every fold. Refitting `model_xgb` itself
# here would leave it trained on only the last fold's training subset, and
# `model_xgb` is the object later used to predict the external validation set.
cv = list(StratifiedKFold(n_splits=5).split(xgb_train_selected, y_train))
fig = plt.figure(figsize=(7, 5))
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)

sd_list = []
for i, (train_idx, test_idx) in enumerate(cv):
    fold_model = xgb.XGBClassifier(**best_params, use_label_encoder=False, eval_metric='mlogloss')
    fold_model.fit(xgb_train_selected.iloc[train_idx], y_train.iloc[train_idx])
    probas = fold_model.predict_proba(xgb_train_selected.iloc[test_idx])
    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], probas[:, 1], pos_label=1)
    # Interpolate onto the common FPR grid and pin the curve to (0, 0).
    fold_tpr = np.interp(mean_fpr, fpr, tpr)
    fold_tpr[0] = 0.0
    mean_tpr += fold_tpr
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'ROC fold {i+1} (area = {roc_auc:.2f})')
    sd_list.append(roc_auc)

plt.plot([0, 1], [0, 1], linestyle='--', color=(0.6, 0.6, 0.6), label='Random guessing')

# Average the interpolated curves and pin the end of the mean curve to (1, 1).
mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
mean_sd = np.std(sd_list)

print(f'CV AUC: {mean_auc:.3f} +/- {mean_sd:.3f}')

# Plot mean ROC
plt.plot(mean_fpr, mean_tpr, 'k--', label=f'Mean ROC (area = {mean_auc:.2f})', lw=2)
plt.plot([0, 0, 1], [0, 1, 1], linestyle=':', color='black', label='Perfect performance')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc="lower right")
plt.tight_layout()
plt.show()

AUC.append(mean_auc)
AUC_sd.append(mean_sd)

# Nested Cross-Validation with Multiple Metrics.
# The grid search `gs` is the estimator of an outer 5-fold CV. One
# `cross_validate` call scores every outer fold with all four metrics, so the
# nested search runs once instead of once per metric.
# NOTE: the 10 features were selected on the whole training split before this
# CV, so these estimates do not include the feature-selection step.
from sklearn.model_selection import cross_validate

nested_scores = cross_validate(
    gs, xgb_train_selected, y_train,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
    cv=5,
)
scores1 = nested_scores['test_accuracy']
scores2 = nested_scores['test_precision']
scores3 = nested_scores['test_recall']
scores4 = nested_scores['test_f1']

print(f'cv accuracy: {np.mean(scores1):.3f} +/- {np.std(scores1):.3f}')
print(f'cv precision: {np.mean(scores2):.3f} +/- {np.std(scores2):.3f}')
print(f'cv recall: {np.mean(scores3):.3f} +/- {np.std(scores3):.3f}')
print(f'cv f1: {np.mean(scores4):.3f} +/- {np.std(scores4):.3f}')

# Record mean / standard deviation per metric and the tuned hyperparameters.
AC.append(np.mean(scores1))
AC_sd.append(np.std(scores1))
Pre.append(np.mean(scores2))
Pre_sd.append(np.std(scores2))
Re.append(np.mean(scores3))
Re_sd.append(np.std(scores3))
f1.append(np.mean(scores4))
f1_sd.append(np.std(scores4))
Best1.append(gs.best_params_['n_estimators'])
# NOTE(review): max_depth and learning_rate are tuned as well, but only a
# placeholder string is recorded here - confirm whether
# gs.best_params_['max_depth'] was intended (as in the Random Forest cell).
Best2.append('NaN')


In [None]:
# Predict the external validation batch with the XGBoost model.
# `.copy()` yields an independent frame, so adding the prediction column does
# not raise SettingWithCopyWarning and never touches `validation_miRNA_data`.
validation_miRNA_data_xgb = validation_miRNA_data[top_10_features_xgb].copy()
validation_pred_xgb = model_xgb.predict(validation_miRNA_data_xgb)
# Store the predictions next to the features and write them to CSV.
validation_miRNA_data_xgb['predicted'] = validation_pred_xgb
validation_miRNA_data_xgb.to_csv('miRNA_validation_predictions_xgb.csv', index=True)

Shuffling classes to test using best model RF

In [None]:
# Work on a copy so the real training labels stay untouched.
miRNA_data_shuffled = trainging_miRNA_data.copy()

# Draw 23 samples currently labelled 0; these will be relabelled to 1.
class0_rows = miRNA_data_shuffled.loc[miRNA_data_shuffled['target'] == 0]
O_to_1_indices = class0_rows.sample(n=23, random_state=2).index
print(O_to_1_indices)

# Draw 23 samples currently labelled 1; these will be relabelled to 0.
class1_rows = miRNA_data_shuffled.loc[miRNA_data_shuffled['target'] == 1]
I_to_O_indices = class1_rows.sample(n=23, random_state=2).index
print(I_to_O_indices)



In [None]:

# Swap the labels of the two sampled groups.
miRNA_data_shuffled.loc[O_to_1_indices, 'target'] = 1
miRNA_data_shuffled.loc[I_to_O_indices, 'target'] = 0

# Class counts after relabelling, followed by the sample ids in each class.
shuffled_counts = miRNA_data_shuffled['target'].value_counts()
print(shuffled_counts)
for label in (0, 1):
    print(miRNA_data_shuffled.loc[miRNA_data_shuffled['target'] == label].index)

# Rebuild the feature matrix and label vector from the relabelled frame.
y = miRNA_data_shuffled['target']
X = miRNA_data_shuffled.drop(columns=['target'])

print(shuffled_counts)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report, roc_curve, auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, train_test_split
from sklearn.feature_selection import RFE

# Initialize result lists
AUC, AUC_sd, AC, AC_sd, Pre, Pre_sd, Re, Re_sd, f1, f1_sd, Best1, Best2 = ([] for _ in range(12))

# Reset indices of DataFrame
X, y = X.reset_index(drop=True), y.reset_index(drop=True)

# Split data (80% train, 20% test); stratification now follows the shuffled labels.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Use only the selected features for training.
# `top_10_features` comes from the earlier KNN cell, where it was selected on
# the original (un-shuffled) labels.
X_train_selected = X_train[top_10_features]
X_test_selected = X_test[top_10_features]


# Perform GridSearchCV for hyperparameter tuning
param_range1 = [2, 3, 4, 5, 7, 9]
gs = GridSearchCV(estimator=KNeighborsClassifier(),
                  param_grid=[{'n_neighbors': param_range1, 'metric': ['minkowski']}],
                  scoring='accuracy', cv=5, refit=True, n_jobs=-1)
# Tune on this cell's own training split. The previous `knn_train_selected`
# belongs to the split made on the un-shuffled labels, so its rows do not
# correspond to the rows of this `y_train`.
gs = gs.fit(X_train_selected, y_train)

# Get the best hyperparameters
best_n_neighbors = gs.best_params_['n_neighbors']
print(f'Best n_neighbors: {best_n_neighbors}')

# Evaluate model using the best parameters
# (this rebinds `model_knn`, replacing the model trained on the real labels).
model_knn = KNeighborsClassifier(n_neighbors=best_n_neighbors)
model_knn.fit(X_train_selected, y_train)

# Test the model on the test set
test_pred = model_knn.predict(X_test_selected)

# Confusion matrix of the shuffled-label model on its hold-out set
# (class 0 is displayed as 'Healthy', class 1 as 'GBM'); saved as PNG and SVG.
print("\nConfusion matrix:")
confusion_mat = confusion_matrix(y_test, test_pred)
disp = ConfusionMatrixDisplay(confusion_mat, display_labels=['Healthy', 'GBM'])
disp.plot()
plt.savefig("knn_bestModel_shuffled_ConfusionMatrix_miRNA_HG.png", format="png", dpi=600)
plt.savefig("knn_bestModel_shuffled_ConfusionMatrix_miRNA_HG.svg", format="svg", dpi=600)
plt.show()

# Per-class precision / recall / F1.
print(classification_report(y_test, test_pred, target_names=['Healthy', 'GBM']))

# Hold-out ROC curve of the shuffled-label model.
probas = model_knn.predict_proba(X_test_selected)
fpr, tpr, _ = roc_curve(y_test, probas[:, 1], pos_label=1)
roc_auc = auc(fpr, tpr)

fig = plt.figure(figsize=(7, 5))
plt.plot(fpr, tpr, label=f'ROC (area = {roc_auc:.2f})')
# Reference curves: chance diagonal and the perfect classifier.
plt.plot([0, 1], [0, 1], linestyle='--', color=(0.6, 0.6, 0.6), label='Random guessing')
plt.plot([0, 0, 1], [0, 1, 1], linestyle=':', color='black', label='Perfect performance')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc="lower right")
plt.tight_layout()
plt.savefig("knn_bestModel_shuffled_in_AUC_miRNA_HG.png", format="png", dpi=600)
plt.savefig("knn_bestModel_shuffled_in_AUC_miRNA_HG.svg", format="svg", dpi=600)
plt.show()
