In [5]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, make_scorer, f1_score, matthews_corrcoef
from sklearn.model_selection import RandomizedSearchCV,  GridSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import plot_confusion_matrix

In [14]:
# Path stems of the embedding runs to evaluate, one per line so the list is
# easy to scan and diff. The training-loop cell appends ".emb.csv" and
# ".emb_out.csv" to each stem when it reads the files.
embeds = [
    "emb/sc_ppi_emb_d32_e1_l120_w20_k10_p1",
    "emb/sc_ppi_emb_d32_e3_l120_w10_k10_p2",
    "emb/sc_ppi_emb_d32_e3_l80_w20_k10_p0.5",
    "emb/sc_ppi_emb_d32_e3_l80_w20_k10_p1",
    "emb/sc_ppi_emb_d64_e1_l120_w20_k10_p1",
    "emb/sc_ppi_emb_d64_e3_l120_w10_k10_p1",
    "emb/sc_ppi_emb_d64_e3_l120_w20_k20_p2",
    "emb/sc_ppi_emb_d64_e3_l80_w10_k10_p2",
    "emb/sc_ppi_emb_d64_e3_l80_w20_k10_p0.5",
    "emb/sc_ppi_emb_d64_e3_l80_w20_k20_p0.5",
]

# One zero per embedding; serves as the placeholder value for every metric
# column of the results table built in the next cell.
fill = [0] * len(embeds)

In [15]:
# Placeholder results table: one row per embedding path, with every metric
# column initialised from the shared `fill` list of zeros.
metric_columns = ['Accuracy', 'Balanced Accuracy Score', 'F1 Score', 'Matthews Correlation Coefficient']
data = {'Embeddings': embeds, **dict.fromkeys(metric_columns, fill)}
results = pd.DataFrame(data, columns=['Embeddings'] + metric_columns)

# Last expression of the cell, so the notebook renders the first five rows.
results.head()

Unnamed: 0,Embeddings,Accuracy,Balanced Accuracy Score,F1 Score,Matthews Correlation Coefficient
0,emb/sc_ppi_emb_d32_e1_l120_w20_k10_p1,0,0,0,0
1,emb/sc_ppi_emb_d32_e3_l120_w10_k10_p2,0,0,0,0
2,emb/sc_ppi_emb_d32_e3_l80_w20_k10_p0.5,0,0,0,0
3,emb/sc_ppi_emb_d32_e3_l80_w20_k10_p1,0,0,0,0
4,emb/sc_ppi_emb_d64_e1_l120_w20_k10_p1,0,0,0,0


In [17]:
# Empty metrics table. The training-loop cell fills it with one row per
# embedding, keyed by the embedding path.
result_columns = [
    'Embeddings',
    'Accuracy',
    'Balanced Accuracy Score',
    'F1 Score',
    'Matthews Correlation Coefficient',
]
df = pd.DataFrame(columns=result_columns)

# Last expression of the cell: renders the (currently empty) table header.
df.head()

Unnamed: 0,Embeddings,Accuracy,Balanced Accuracy Score,F1 Score,Matthews Correlation Coefficient


In [26]:
# Hyperparameter grid for the XGBoost search. It does not depend on the
# embedding being evaluated, so it is built once instead of on every iteration.
param_grid = {
    'max_depth': [5, 6, 7],
    'learning_rate': [0.2, 0.15, 0.1],
    'min_child_weight': [1, 3, 5],
    'gamma': [1.0, 2.0, 3.0],
    'reg_lambda': [10.0, 20.0, 100.0],
    'scale_pos_weight': [1]
}

# For every embedding: load features/labels, grid-search an XGBoost classifier,
# score the best model on a held-out test split and store the metrics in `df`.
for x in embeds:
    # Features: the first CSV column is discarded.
    # NOTE(review): presumably that column is a saved index / node id — confirm.
    X = pd.read_csv(x+".emb.csv")
    X = X.drop(columns=X.columns[0])

    # NOTE(review): the label file is used as read, with no column dropped;
    # confirm it contains only the label column.
    y = pd.read_csv(x+".emb_out.csv")

    # Held-out test split, used ONLY for the final metrics below.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

    # Separate validation split for early stopping. Previously the test split
    # itself was passed as `eval_set`, so the number of boosting rounds was
    # tuned on the same data the metrics were reported on (test-set leakage,
    # optimistic scores).
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, random_state=42, stratify=y_train)

    clf = GridSearchCV(
        estimator=xgb.XGBClassifier(objective='binary:logistic',
                                    seed=42,
                                    subsample=0.9,
                                    colsample_bytree=0.5),
        param_grid=param_grid,
        scoring='roc_auc',
        verbose=2,
        n_jobs=10,
        cv=4)

    # The extra keyword arguments are forwarded by GridSearchCV to
    # XGBClassifier.fit for every candidate and for the final refit.
    # NOTE(review): newer xgboost releases expect `early_stopping_rounds` and
    # `eval_metric` on the estimator constructor instead of `fit` — confirm
    # against the installed version before upgrading.
    clf.fit(X_fit,
            y_fit,
            early_stopping_rounds=10,
            eval_metric='auc',
            eval_set=[(X_val, y_val)],
            verbose=True)

    print(clf.best_estimator_)
    print(clf.best_params_)

    # Score the refit best estimator on the untouched test split.
    predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, predictions)
    balanced_acc = balanced_accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    matt = matthews_corrcoef(y_test, predictions)

    # One row per embedding, indexed by its path; re-running the loop
    # overwrites the row rather than appending a duplicate.
    row = [x, acc, balanced_acc, f1, matt]
    df.loc[x] = row
   




In [None]:
# Write the metrics table to "results.txt" (np.savetxt's default delimiter is
# a single space), with a header row and no comment prefix on it.
np.savetxt("results.txt", df, fmt='%s', header='embed acc bal_acc f1 matt_corr', comments='')

# Keep the DataFrame as the LAST expression of the cell: a bare `df.head()`
# followed by another statement is evaluated but never displayed.
df.head()