In [None]:
import re
import os
import pickle 
import numpy as np 
import pandas as pd
import sys
import matplotlib.pyplot as plt 

from sklearn import metrics
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedGroupKFold 
from sklearn.model_selection import cross_val_score

from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.dummy import DummyClassifier

import xgboost as xgb
from xgboost import XGBClassifier



# Mount Google Drive inside the Colab runtime at /content/drive.
# The pickle loaded in the next cell is read through the relative path
# "drive/MyDrive/..." (assumes the working directory is /content - TODO confirm).
from google.colab import drive
drive.mount("/content/drive")


Mounted at /content/drive


In [None]:
# Read the pickled table that was used for embedding (training split only).
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
train_pickle_path = "drive/MyDrive/OrlyPred/Homomer_embeds/results/embeds_Mar_22/train_set.pkl"
with open(train_pickle_path, 'rb') as pickle_file:
  overall_train_set = pickle.load(pickle_file)

# Renumber the rows 0..n-1: the stratified splitting and the saving of the
# folds to lists in the following cells rely on a clean positional index.
overall_train_set.reset_index(drop=True, inplace=True)

In [None]:
# Inputs for generating the folds:
#   X      - the "code" column. Codes are used here (instead of the embeddings, which
#            are the actual model input) because they make it convenient to pull the
#            matching rows out of the general table later on.
#   y      - the "nsub" column: the predefined labels (number of subunits annotated
#            for the relevant PDB code).
#   groups - the "representative" column: the cluster representatives, used so that all
#            sequences of one cluster end up in the same set (train or validation).
X, y, groups = (
    overall_train_set[column] for column in ("code", "nsub", "representative")
)


In [None]:
# Build the folds for k-fold cross validation; they are consumed by the next few cells.
# These folds serve the single-run workflow - the full cross validation further down
# creates its own splitter.
cv = StratifiedGroupKFold(n_splits=10, shuffle=True, random_state=1)

# Materialise the splits once, then collect the codes of every fold's train / test part.
fold_indices = list(cv.split(X, y, groups))
train_lst = [X[train_idxs].tolist() for train_idxs, _ in fold_indices]
test_lst = [X[test_idxs].tolist() for _, test_idxs in fold_indices]

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# One column per fold, holding the codes that belong to that fold's train / test part.
# pd.DataFrame(list_of_lists) puts one fold per row, so transpose to get one fold per
# column; shorter folds are padded with missing values.
# add_prefix derives the "train_<k>" / "test_<k>" names from the fold number itself, so
# the column names stay correct for any number of splits (not only the 10 used above).
train_idx_df = pd.DataFrame(train_lst).transpose().add_prefix("train_")
test_idx_df = pd.DataFrame(test_lst).transpose().add_prefix("test_")

# Side-by-side table: train_0..train_k followed by test_0..test_k.
merged_train_test = pd.concat([train_idx_df, test_idx_df], axis=1, join="outer")


In [None]:
# Fold 0 only: keep the rows of the general table whose code is listed in the
# fold's train column / test column respectively.
codes = overall_train_set["code"]
train_set = overall_train_set[codes.isin(merged_train_test["train_0"])]
test_set = overall_train_set[codes.isin(merged_train_test["test_0"])]

*Decision tree*
xgboost


In [None]:
# original cell
# Full-data setup: labels, cluster groups, CV splitter and the feature matrix.

y = overall_train_set["nsub"]
groups = overall_train_set["representative"]
cv = StratifiedGroupKFold(n_splits=10)

# Stack the per-row embeddings into a 2-D frame, one frame row per table row
# (assumes every embedding is a 1-D vector of the same length - TODO confirm).
X = pd.DataFrame(np.vstack(overall_train_set['embeddings']))

# `df` is the feature frame handed to xgb.DMatrix below. It must be a copy of X:
# calling np.vstack on a DataFrame iterates over its column labels, so the previous
# `pd.DataFrame(np.vstack(X))` produced a single column of labels instead of the features.
df = X.copy()


In [None]:
# For understanding how to work with the model: same setup as the full-data cell,
# restricted to the first N_DEBUG_ROWS rows so that experiments run quickly.
N_DEBUG_ROWS = 1000

y = overall_train_set["nsub"][:N_DEBUG_ROWS]
groups = overall_train_set["representative"][:N_DEBUG_ROWS]

# 10 folds, like every other splitter in this notebook. The previous n_splits=1000 on
# 1000 rows exceeded the member count of every class, which makes split() raise.
cv = StratifiedGroupKFold(n_splits=10)

# Fold-0 train / test features and labels, truncated to the debugging size.
X_train = pd.DataFrame(np.vstack(train_set['embeddings'][:N_DEBUG_ROWS]))
y_train = train_set['nsub'][:N_DEBUG_ROWS]

X_test = pd.DataFrame(np.vstack(test_set['embeddings'][:N_DEBUG_ROWS]))
y_test = test_set['nsub'][:N_DEBUG_ROWS]

# Stack the per-row embeddings into a 2-D frame, one frame row per table row.
X = pd.DataFrame(np.vstack(overall_train_set['embeddings'][:N_DEBUG_ROWS]))

# `df` is the feature frame handed to xgb.DMatrix below. It must be a copy of X:
# calling np.vstack on a DataFrame iterates over its column labels, so the previous
# `pd.DataFrame(np.vstack(X))` produced a single column of labels instead of the features.
df = X.copy()


In [None]:
# Display the labels cast to int. The result is not assigned, so `y` itself is unchanged.
y.astype(int)

0        3
1        6
2        4
3        4
4        4
        ..
28823    2
28824    2
28825    2
28826    2
28827    2
Name: nsub, Length: 28828, dtype: int64

In [None]:
# Wrap the feature frame and the labels in an XGBoost DMatrix.
data_dmatrix = xgb.DMatrix(df, label=y)

In [None]:
# flow (and params) from here: https://www.datacamp.com/community/tutorials/xgboost-in-python
#
# Train a small XGBoost classifier on the fold-0 training split and predict the test split.
# The objective is a multi-class classification objective: the tutorial's 'reg:logistic'
# is a regression objective and does not describe what a classifier on the multi-valued
# `nsub` labels optimises.
# NOTE(review): recent xgboost releases require class labels encoded as 0..n_classes-1;
# if fit() complains about invalid classes, encode `nsub` first - confirm against the
# installed version.
xg_class = xgb.XGBClassifier(objective='multi:softprob', colsample_bytree=0.3, learning_rate=0.1,
                             max_depth=5, alpha=10, n_estimators=10, random_state=1)

xg_class.fit(X_train, y_train)

# Predicted class label for every row of the fold-0 test split.
preds = xg_class.predict(X_test)


In [None]:
# Evaluate the fold-0 predictions: RMSE on the raw label values, the class-weighted
# precision / recall / F-measure, and the chance-adjusted balanced accuracy.
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))
# print('Accuracy: %.3f' % accuracy_score(y_test, preds))

weighted_reports = (
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F-measure: %.3f', f1_score),
)
for template, scorer in weighted_reports:
    print(template % scorer(y_test, preds, average='weighted'))

adjusted_balanced = metrics.balanced_accuracy_score(y_test, preds, adjusted=True)
print("adjusted Balanced accuracy: %.3f" % adjusted_balanced)


RMSE: 1.852170
Accuracy: 0.608
Precision: 0.591
Recall: 0.608
F-measure: 0.570
adjusted Balanced accuracy: 0.046639279667793895


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Code from here: 
# https://towardsdatascience.com/cross-validation-and-hyperparameter-tuning-how-to-optimise-your-machine-learning-model-13f005af9d7d
#
# Candidate values for the randomized hyper-parameter search of the XGBoost classifier.

# Number of trees to be used
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

# Maximum number of levels in tree
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

# Minimum number of instances needed in each node
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

# Tree construction algorithm used in XGBoost
xgb_tree_method = ['auto', 'exact', 'approx', 'hist', 'gpu_hist']

# Learning rate (rounded to one decimal to remove linspace floating-point noise)
xgb_eta = [round(float(x), 1) for x in np.linspace(0.1, 0.6, 6)]

# Minimum loss reduction required to make further partition.
# These are fractional values, so they are rounded, not truncated: int() turned
# every candidate into 0 and the search never varied gamma.
xgb_gamma = [round(float(x), 1) for x in np.linspace(0, 0.5, 6)]

# Learning objective used. The estimator is a classifier on the multi-valued `nsub`
# labels, so a multi-class classification objective is listed instead of the
# regression losses ('reg:squarederror', 'reg:squaredlogerror') from the tutorial.
xgb_objective = ['multi:softprob']

# Create the grid
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'tree_method': xgb_tree_method,
            'eta': xgb_eta,
            'gamma': xgb_gamma,
            'objective': xgb_objective}


In [None]:
# Code from here: 
# https://towardsdatascience.com/cross-validation-and-hyperparameter-tuning-how-to-optimise-your-machine-learning-model-13f005af9d7d
#
# Randomized hyper-parameter search for the XGBoost classifier over `xgb_grid`.

from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedGroupKFold

xgb_base = XGBClassifier()

# Group-aware, stratified folds: a plain `cv=10` ignores the cluster representatives,
# so sequences of one cluster would be split between the train and validation parts
# and inflate the scores.
search_cv = StratifiedGroupKFold(n_splits=10, shuffle=True, random_state=1)

# Create the random search over the XGBoost grid (60 sampled candidates)
xgb_random = RandomizedSearchCV(estimator=xgb_base, param_distributions=xgb_grid,
                                n_iter=60, cv=search_cv, verbose=2,
                                random_state=1, n_jobs=-1)

# Fit the random search model; `groups` is forwarded to the splitter
xgb_random.fit(X, y, groups=groups)

# Get the optimal parameters
xgb_random.best_params_


Fitting 10 folds for each of 60 candidates, totalling 600 fits




In [None]:
# Print the best parameter combination found by the randomized search above.
print(xgb_random.best_params_)

In [None]:
# k-fold cross validation without tuning:
# grouped, stratified 10-fold CV of an untuned XGBoost classifier on the imbalanced
# `nsub` labels, scored with the class-weighted F1.

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score, make_scorer
from xgboost import XGBClassifier

# define evaluation procedure
# shuffle=True is required whenever random_state is set; without it the splitter
# constructor raises a ValueError.
cv = StratifiedGroupKFold(n_splits=10, shuffle=True, random_state=1)

# define model
# `cv` and `groups` are not estimator parameters - they belong to cross_val_score below.
# The objective is a multi-class classification objective rather than the regression
# objective 'reg:logistic'.
xgb_model = xgb.XGBClassifier(objective='multi:softprob', random_state=1)

# Weighted F1 accounts for the class imbalance when averaging over the classes.
f1_score_weighted = make_scorer(f1_score, average="weighted")

# evaluate model; `groups` keeps all sequences of one cluster inside a single fold
scores = cross_val_score(xgb_model, X, y, cv=cv, n_jobs=-1, scoring=f1_score_weighted,
                         error_score='raise', groups=groups)

# summarize performance
print('f1_score_weighted: %.5f' % np.mean(scores))




f1_score_weighted: 0.56044


In [None]:
# scores = cross_val_score(xgb_model, X, y, scoring=roc_auc_ovr_scorer, cv=cv, n_jobs=-1, error_score='raise', groups=groups)


In [None]:
# Manual grouped, stratified 10-fold cross validation with per-fold reporting.
# Replaces two pasted loops that referenced names never defined in this notebook
# (`x`, `x_scaled`, `lr`, `clf`, `mean`, `stdev`) and fitted the model twice per fold.
from statistics import mean, stdev

cv = StratifiedGroupKFold(n_splits=10, shuffle=True, random_state=1)

# Work on plain arrays so fold indices are positional. np.vstack over the rows handles
# both a DataFrame of features and a Series holding one embedding vector per row.
features = np.vstack(X.to_numpy())
labels = y.to_numpy()

# NOTE(review): the original loops used the undefined estimators `lr` / `clf`; an untuned
# XGBoost classifier is assumed here to match this section of the notebook - confirm.
clf = xgb.XGBClassifier(objective='multi:softprob', random_state=1)

lst_accu_stratified = []
for train_idxs, test_idxs in cv.split(features, labels, groups):
    # Fit once per fold and reuse the predictions for every metric.
    clf.fit(features[train_idxs], labels[train_idxs])
    y_pred = clf.predict(features[test_idxs])

    fold_accuracy = accuracy_score(labels[test_idxs], y_pred)
    lst_accu_stratified.append(fold_accuracy)
    print(fold_accuracy)
    print("Adjusted Balanced accuracy:", metrics.balanced_accuracy_score(labels[test_idxs], y_pred, adjusted=True))
    print("Balanced accuracy:", metrics.balanced_accuracy_score(labels[test_idxs], y_pred))

# Print the output.
print('List of possible accuracy:', lst_accu_stratified)
print('\nMaximum Accuracy That can be obtained from this model is:',
      max(lst_accu_stratified)*100, '%')
print('\nMinimum Accuracy:',
      min(lst_accu_stratified)*100, '%')
print('\nOverall Accuracy:',
      mean(lst_accu_stratified)*100, '%')
print('\nStandard Deviation is:', stdev(lst_accu_stratified))
