In [1]:
import re
import os
import pickle 
import numpy as np 
import pandas as pd
import sys
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import StratifiedGroupKFold 
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


from google.colab import drive
drive.mount("/content/drive")


Mounted at /content/drive


In [2]:
# load the tab used for embedding, only the training set of course

with open("drive/MyDrive/OrlyPred/Homomer_embeds/results/embeds_Mar_22/train_set.pkl", 'rb') as f:
  overall_train_set = pickle.load(f)

# index reset is important for the stratified splitting and the saving to lists
overall_train_set.reset_index(drop=True, inplace=True)

In [3]:
# define the input, using the codes since this is convenient to later extract rows from the general table. Actually the input is the embeddings
# the labls, y, are the predifined nsub (number of subunits annotated to the relevant pdb code)
# groups - the cluster representatives, used in order to jave all the sequences from the same cluster in the same set (train/validation)

X = overall_train_set["code"]
y = overall_train_set["nsub"]
groups = overall_train_set["representative"]
X

0        5ahz_1
1        3q6m_1
2        1luq_1
3        3t6f_1
4        1srf_1
          ...  
28823    4zt1_1
28824    4a56_1
28825    5hap_1
28826    4s2l_1
28827    5faq_1
Name: code, Length: 28828, dtype: object

In [4]:
# generate groups for k-fold cross validation, used in the next few cells
# this is used when one run is carried out, for the cross validation there is a different code below 

cv = StratifiedGroupKFold(n_splits=10, shuffle=True, random_state=1)
train_lst = []
test_lst = []
for train_idxs, test_idxs in cv.split(X, y, groups):
    train_lst.append(X[train_idxs].tolist())
    test_lst.append(X[test_idxs].tolist())
    print("train_lst", train_lst)
    print("test_lst", test_lst)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
train_lst[0]

In [None]:
train_idx_df = pd.DataFrame(train_lst).transpose()
train_idx_df.rename(columns={0:"train_0", 1:"train_1", 2:"train_2", 3:"train_3", 4:"train_4", 5:"train_5", 6:"train_6", 7:"train_7", 8:"train_8", 9:"train_9"}, inplace=True)
print(train_idx_df)
test_idx_df = pd.DataFrame(test_lst).transpose()
test_idx_df.rename(columns={0:"test_0", 1:"test_1", 2:"test_2", 3:"test_3", 4:"test_4", 5:"test_5", 6:"test_6", 7:"test_7", 8:"test_8", 9:"test_9"}, inplace=True)
print(test_idx_df)
merged_train_test = pd.concat([train_idx_df, test_idx_df], axis=1, join="outer")


In [None]:
#For 5-fold cv
train_idx_df = pd.DataFrame(train_lst).transpose()
train_idx_df.rename(columns={0:"train_0", 1:"train_1", 2:"train_2", 3:"train_3", 4:"train_4"}, inplace=True)
print(train_idx_df)
test_idx_df = pd.DataFrame(test_lst).transpose()
test_idx_df.rename(columns={0:"test_0", 1:"test_1", 2:"test_2", 3:"test_3", 4:"test_4"}, inplace=True)
print(test_idx_df)
merged_train_test = pd.concat([train_idx_df, test_idx_df], axis=1, join="outer")


In [6]:
train_set = overall_train_set[overall_train_set["code"].isin(merged_train_test["train_0"])]
test_set = overall_train_set[overall_train_set["code"].isin(merged_train_test["test_0"])]

In [None]:
from sklearn.neural_network import MLPClassifier

X_train = train_set['embeddings'].tolist()
y_train = train_set['nsub']

X_test = test_set['embeddings'].tolist()
y_test = test_set['nsub']



In [None]:
 # the basic plain vanilla MLP trained on one fold, for a baseline/initial model
clf = MLPClassifier(solver='adam', random_state=1, learning_rate_init=0.001)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)




In [None]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("adjusted Balanced accuracy:", metrics.balanced_accuracy_score(y_test, y_pred, adjusted=True))
# print("roc_auc_score:", metrics.roc_auc_score(y_test, y_pred, multi_class='ovr'))
# print("PR:", metrics.precision_recall_fscore_support(y_test,y_pred))
print('Precision: %.3f' % precision_score(y_test, y_pred, average='weighted'))
print('Recall: %.3f' % recall_score(y_test, y_pred, average='weighted'))
print('F-measure: %.3f' % f1_score(y_test, y_pred, average='weighted'))
print(metrics.classification_report(y_test,y_pred))
print(metrics.confusion_matrix(y_test,y_pred))



import matplotlib.pyplot as plt 

# metrics.plot_roc_curve(clf, X_test, y_test)  
# plt.show()                                   

# metrics.RocCurveDisplay.from_predictions(clf, X_test, y_test)
# plt.show()

Accuracy: 0.6069635085369937
adjusted Balanced accuracy: 0.3066657481475936
Precision: 0.624
Recall: 0.607
F-measure: 0.609
              precision    recall  f1-score   support

         1.0       0.76      0.68      0.72      1582
         2.0       0.47      0.60      0.53       935
         3.0       0.50      0.48      0.49        93
         4.0       0.47      0.46      0.46       213
         5.0       1.00      1.00      1.00         2
         6.0       0.41      0.25      0.31        75
         8.0       0.27      0.11      0.16        27
        10.0       0.64      0.18      0.29        38
        12.0       0.36      0.36      0.36        14
        13.0       0.00      0.00      0.00         1
        14.0       1.00      0.25      0.40         4
        24.0       0.00      0.00      0.00         3

    accuracy                           0.61      2987
   macro avg       0.49      0.36      0.39      2987
weighted avg       0.62      0.61      0.61      2987

[[1073  4

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# use the DummyClassifier to examine the quality of the model
from sklearn.dummy import DummyClassifier

clf_dum = DummyClassifier(strategy='most_frequent', random_state=1)
clf_dum.fit(X_train, y_train)

y_pred = clf_dum.predict(X_test)
print("DummyClassifier Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("DummyClassifier adjusted Balanced accuracy:", metrics.balanced_accuracy_score(y_test, y_pred, adjusted=True))


DummyClassifier Accuracy: 0.5296283896886508
DummyClassifier adjusted Balanced accuracy: 0.0


In [None]:
# use the DummyClassifier to examine the quality of the model, here with a different strategy for generating the dummy classifier
from sklearn.dummy import DummyClassifier

clf_dum2 = DummyClassifier(strategy='stratified', random_state=1)
clf_dum2.fit(X_train, y_train)

y_pred = clf_dum2.predict(X_test)
print("DummyClassifier Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("DummyClassifier adjusted Balanced accuracy:", metrics.balanced_accuracy_score(y_test, y_pred, adjusted=True))


DummyClassifier Accuracy: 0.3592233009708738
DummyClassifier adjusted Balanced accuracy: -0.009255158786592813




In [None]:
# train an MLP with k-fold cross valdation (k=10)
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score



X = overall_train_set["embeddings"]
y = overall_train_set["nsub"]
groups = overall_train_set["representative"]
cv = StratifiedGroupKFold(n_splits=10)



clf = MLPClassifier(solver='adam', random_state=1, learning_rate_init=0.001)


for train_idxs, test_idxs in cv.split(X, y, groups):
    clf.fit(np.vstack(X[train_idxs]), y[train_idxs])
    print(clf.score(np.vstack(X[test_idxs]), y[test_idxs]))
    clf.fit(np.vstack(X[train_idxs]), y[train_idxs])
    y_pred = clf.predict(np.vstack(X[test_idxs]))
    print("Adjusted Balanced accuracy:", metrics.balanced_accuracy_score(y[test_idxs], y_pred, adjusted=True))
    print("Balanced accuracy:", metrics.balanced_accuracy_score(y[test_idxs], y_pred))
    print('F-measure: %.3f' % f1_score(y[test_idxs], y_pred, average='weighted'))

    # scores = cross_val_score(clf, X, y, cv=cv)

    # print("TRAIN:", X[train_idxs])
    # print("      ", y[train_idxs])
    # print(" TEST:", X[test_idxs])
    # print("      ", y[test_idxs])



*Decision tree*
xgboost


In [12]:
# original cell

X = overall_train_set["embeddings"]
y = overall_train_set["nsub"]
groups = overall_train_set["representative"]
cv = StratifiedGroupKFold(n_splits=10)

X_train = pd.DataFrame(np.vstack(train_set['embeddings']))
y_train = train_set['nsub']

X_test = pd.DataFrame(np.vstack(test_set['embeddings']))
y_test = test_set['nsub']




df = pd.DataFrame(np.vstack(X))


In [9]:
#for understanding how to work with the model

X = overall_train_set["embeddings"][:1000]
y = overall_train_set["nsub"][:1000]
groups = overall_train_set["representative"][:1000]
cv = StratifiedGroupKFold(n_splits=1000)

X_train = pd.DataFrame(np.vstack(train_set['embeddings'][:1000]))
y_train = train_set['nsub'][:1000]

X_test = pd.DataFrame(np.vstack(test_set['embeddings'][:1000]))
y_test = test_set['nsub'][:1000]




df = pd.DataFrame(np.vstack(X))


In [13]:
 data_dmatrix = xgb.DMatrix(data=df,label=y)

In [14]:
# train model from params after HPT July 2022
best_params = {'eta': 0.4,
 'max_depth': 6,
 'min_child_weight': 9,
 'n_estimators': 1500,
 'objective': 'multi:softprob',
 'tree_method': 'approx'}


xg_class = xgb.XGBClassifier(objective ='multi:softprob', eta=0.4, max_depth=6, min_child_weight=9, n_estimators=1500, tree_method="approx")

xg_class.fit(X_train,y_train)

pickle.dump(xg_class, open("drive/MyDrive/OrlyPred/Homomer_embeds/results/embeds_Mar_22/xgb_model.pkl", "wb"))
joblib.dump(xg_class, "drive/MyDrive/OrlyPred/Homomer_embeds/results/embeds_Mar_22/xgb_random.joblib")


preds = xg_class.predict(X_test)




KeyboardInterrupt: ignored

In [None]:
# flow (and params) from here: https://www.datacamp.com/community/tutorials/xgboost-in-python

xg_class = xgb.XGBClassifier(objective ='reg:logistic', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10, random_state=1)

xg_class.fit(X_train,y_train)

preds = xg_class.predict(X_test)



In [None]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))
print('Accuracy: %.3f' % accuracy_score(y_test, preds))
print('Precision: %.3f' % precision_score(y_test, preds, average='weighted'))
print('Recall: %.3f' % recall_score(y_test, preds, average='weighted'))
print('F-measure: %.3f' % f1_score(y_test, preds, average='weighted'))
print("adjusted Balanced accuracy:", metrics.balanced_accuracy_score(y_test, y_pred, adjusted=True))


RMSE: 1.852170
Accuracy: 0.608
Precision: 0.591
Recall: 0.608
F-measure: 0.570
adjusted Balanced accuracy: 0.046639279667793895


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
####### Doesnt work yet
# fit xgboost on an imbalanced classification dataset

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score, make_scorer
from xgboost import XGBClassifier
# generate dataset

X = pd.DataFrame(np.vstack(overall_train_set['embeddings']))
cv = StratifiedGroupKFold(n_splits=10)
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)


xgb_model = xgb.XGBClassifier(objective ='reg:logistic', cv=cv, groups=groups, random_state=1)
# xgb_model.fit(X, y)

# y_pred = xgb_model.predict(X)




f1_score_weighted = make_scorer(f1_score, average="weighted")
f1_score_weighted

# scores = cross_val_score(xgb_model, scoring=roc_auc_ovr_scorer, n_jobs=-1, error_score='raise')
# # original line:
scores = cross_val_score(xgb_model, X, y, cv=cv, n_jobs=-1, scoring=f1_score_weighted, error_score='raise', groups=groups) #, scoring=roc_auc_ovr_scorer #scoring='roc_auc_ovr', 
# print(y_true)
# print(y_score)

print('f1_score_weighted: %.5f' % np.mean(scores))




f1_score_weighted: 0.56044
