In [2]:
from string import punctuation
from nltk import word_tokenize
from nltk.corpus import stopwords
import sent2vec


expression_unified_ds = [line.strip().split("\t") for line in open("../unified_dataset/expression.tsv")][1:]
kinaseact_unified_ds = [line.strip().split("\t") for line in open("../unified_dataset/kinaseact.tsv")][1:]

stop_words = set(stopwords.words('english'))
def preprocess_sentence(text):
    text = text.replace('/', ' / ')
    text = text.replace('.-', ' .- ')
    text = text.replace('.', ' . ')
    text = text.replace('\'', ' \' ')
    text = text.lower()

    tokens = [token for token in word_tokenize(text) if token not in punctuation and token not in stop_words]

    return ' '.join(tokens)

sentences_only_expression_data = [preprocess_sentence(row[0]) for row in expression_unified_ds]
sentences_only_kinaseact_data = [preprocess_sentence(row[0]) for row in kinaseact_unified_ds]
bio_sent_vec_model_location = "../biosentvec/model.bin"

In [3]:
model_path = bio_sent_vec_model_location
biosentvec_model = sent2vec.Sent2vecModel()
try:
    biosentvec_model.load_model(model_path)
except Exception as e:
    print(e)
print('model successfully loaded')

model successfully loaded


In [4]:
corpus_expr = biosentvec_model.embed_sentences(sentences_only_expression_data)
corpus_kinaseact = biosentvec_model.embed_sentences(sentences_only_kinaseact_data)

In [5]:
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV, cross_validate, StratifiedKFold
from models import POSSIBLE_CLASSIFIERS


def train_classifier(X, y):

    best_score = 0
    best_classifier = None
    best_params = None
    best_classifier_name = ""

    stratified_k_folds = StratifiedKFold(n_splits=5)
    
    scoring = {'precision': make_scorer(precision_score),
           'recall': make_scorer(recall_score),
           'f1': make_scorer(f1_score)}

    best_results = {}
    best_index = 0
    # Iterate over classifiers and perform grid search
    for classifier_name, classifier_info in POSSIBLE_CLASSIFIERS.items():
        random_search = RandomizedSearchCV(estimator=classifier_info['model'], n_iter=100,
                                           param_distributions=classifier_info['params'], cv=stratified_k_folds,
                                           scoring=scoring, refit='f1',
                                           verbose=1, n_jobs=-1)
        random_search.fit(X, y)

        print(f"Finished training model and fitting best hyperparameters for {classifier_name}. F1 score: "
              f"{str(random_search.best_score_)}")

        if random_search.best_score_ > best_score:
            best_score = random_search.best_score_
            best_classifier = random_search.best_estimator_
            best_params = random_search.best_params_
            best_classifier_name = classifier_name
            best_results = random_search.cv_results_
            best_index = random_search.best_index_

    # Retrieve the average precision, recall, and F1 score
    average_precision = best_results['mean_test_precision'][best_index]
    std_precision = best_results['std_test_precision'][best_index]
    average_recall = best_results['mean_test_recall'][best_index]
    std_recall = best_results['std_test_recall'][best_index]
    average_f1 = best_results['mean_test_f1'][best_index]
    std_f1 = best_results['std_test_f1'][best_index]

    # Return the trained model and performance metrics
    return best_classifier, average_precision, average_recall, average_f1, best_classifier_name, best_params, std_precision, std_recall, std_f1


Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


## Task 1

### Expression

In [6]:
import joblib

y = [int(row[1]) for row in expression_unified_ds]
classifier, precision, recall, fscore, classifier_name, params, std_precision, std_recall, std_f1 = train_classifier(corpus_expr, y)
joblib.dump(classifier, '../classifiers/all_info_expression.joblib')
print(f'precision: {str(precision)} ± {str(std_precision)}')
print(f'recall: {str(recall)} ± {str(std_recall)}')
print(f'fscore: {str(fscore)} ± {str(std_f1)}')
print(f'selected model: {classifier_name}')
print(f'selected params {str(params)}')

Fitting 5 folds for each of 100 candidates, totalling 500 fits




Finished training model and fitting best hyperparameters for LogisticRegression. F1 score: 0.7442677304743524
Fitting 5 folds for each of 100 candidates, totalling 500 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Finished training model and fitting best hyperparameters for MLPClassifier. F1 score: 0.7544612682003986
Fitting 5 folds for each of 100 candidates, totalling 500 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Finished training model and fitting best hyperparameters for SVC. F1 score: 0.7664123019906259
precision: 0.7315976530186112
recall: 0.8870967741935484
fscore: 0.7664123019906259
selected model: SVC
selected params {'C': 33.51126179747934, 'class_weight': 'balanced', 'coef0': 0.5320160202506291, 'decision_function_shape': 'ovr', 'degree': 4, 'gamma': 0.0019991354466630975, 'kernel': 'rbf', 'random_state': 74, 'shrinking': True, 'tol': 0.007020564809520105}


### Kinaseact

In [7]:
y = [int(row[1]) for row in kinaseact_unified_ds]
classifier, precision, recall, fscore, classifier_name, params, std_precision, std_recall, std_f1 = train_classifier(corpus_kinaseact, y)
joblib.dump(classifier, '../classifiers/all_info_kinase.joblib')
print(f'precision: {str(precision)} ± {str(std_precision)}')
print(f'recall: {str(recall)} ± {str(std_recall)}')
print(f'fscore: {str(fscore)} ± {str(std_f1)}')
print(f'selected model: {classifier_name}')
print(f'selected params {str(params)}')

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Finished training model and fitting best hyperparameters for LogisticRegression. F1 score: 0.7236461643591355
Fitting 5 folds for each of 100 candidates, totalling 500 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Finished training model and fitting best hyperparameters for MLPClassifier. F1 score: 0.6951654537745618
Fitting 5 folds for each of 100 candidates, totalling 500 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Finished training model and fitting best hyperparameters for SVC. F1 score: 0.7222443847661528
precision: 0.6732166781338769
recall: 0.882316384180791
fscore: 0.7236461643591355
selected model: LogisticRegression
selected params {'C': 0.02480207617940087, 'class_weight': 'balanced', 'solver': 'liblinear', 'warm_start': True}


## Task 2

### Expression

In [8]:
y = [int(row[2]) for row in expression_unified_ds]
classifier, precision, recall, fscore, classifier_name, params, std_precision, std_recall, std_f1 = train_classifier(corpus_expr, y)
joblib.dump(classifier, '../classifiers/curatable_expression.joblib')
print(f'precision: {str(precision)} ± {str(std_precision)}')
print(f'recall: {str(recall)} ± {str(std_recall)}')
print(f'fscore: {str(fscore)} ± {str(std_f1)}')
print(f'selected model: {classifier_name}')
print(f'selected params {str(params)}')

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Finished training model and fitting best hyperparameters for LogisticRegression. F1 score: 0.8097007871252793
Fitting 5 folds for each of 100 candidates, totalling 500 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Finished training model and fitting best hyperparameters for MLPClassifier. F1 score: 0.8192864717076738
Fitting 5 folds for each of 100 candidates, totalling 500 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Finished training model and fitting best hyperparameters for SVC. F1 score: 0.8135709639069628
precision: 0.8278109640465964
recall: 0.8552507836990596
fscore: 0.8192864717076738
selected model: MLPClassifier
selected params {'activation': 'relu', 'alpha': 4.311714586017449e-05, 'beta_1': 0.0010997977625716898, 'beta_2': 0.003446941002522844, 'epsilon': 6.779577014134857e-06, 'hidden_layer_sizes': (500, 500, 500), 'learning_rate': 'adaptive', 'learning_rate_init': 0.0026151790169623122, 'solver': 'adam'}


### Kinaseact

In [9]:
y = [int(row[2]) for row in kinaseact_unified_ds]
classifier, precision, recall, fscore, classifier_name, params, std_precision, std_recall, std_f1 = train_classifier(corpus_kinaseact, y)
joblib.dump(classifier, '../classifiers/curatable_kinase.joblib')
print(f'precision: {str(precision)} ± {str(std_precision)}')
print(f'recall: {str(recall)} ± {str(std_recall)}')
print(f'fscore: {str(fscore)} ± {str(std_f1)}')
print(f'selected model: {classifier_name}')
print(f'selected params {str(params)}')

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Finished training model and fitting best hyperparameters for LogisticRegression. F1 score: 0.7616277781037506
Fitting 5 folds for each of 100 candidates, totalling 500 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Finished training model and fitting best hyperparameters for MLPClassifier. F1 score: 0.7273218774880561
Fitting 5 folds for each of 100 candidates, totalling 500 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Finished training model and fitting best hyperparameters for SVC. F1 score: 0.75783392178594
precision: 0.6973410933527261
recall: 0.9123287671232877
fscore: 0.7616277781037506
selected model: LogisticRegression
selected params {'C': 0.006359428271196924, 'class_weight': 'balanced', 'solver': 'liblinear', 'warm_start': False}


## Task 3

### Expression

In [10]:
y = [int(row[3]) for row in expression_unified_ds]
classifier, precision, recall, fscore, classifier_name, params, std_precision, std_recall, std_f1 = train_classifier(corpus_expr, y)
joblib.dump(classifier, '../classifiers/language_expression.joblib')
print(f'precision: {str(precision)} ± {str(std_precision)}')
print(f'recall: {str(recall)} ± {str(std_recall)}')
print(f'fscore: {str(fscore)} ± {str(std_f1)}')
print(f'selected model: {classifier_name}')
print(f'selected params {str(params)}')

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Finished training model and fitting best hyperparameters for LogisticRegression. F1 score: 0.876896506531542
Fitting 5 folds for each of 100 candidates, totalling 500 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Finished training model and fitting best hyperparameters for MLPClassifier. F1 score: 0.8868965952037822
Fitting 5 folds for each of 100 candidates, totalling 500 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Finished training model and fitting best hyperparameters for SVC. F1 score: 0.8810825150623366
precision: 0.9008177651405764
recall: 0.8897435897435898
fscore: 0.8868965952037822
selected model: MLPClassifier
selected params {'activation': 'logistic', 'alpha': 2.716165633798464e-05, 'beta_1': 0.011200869411973721, 'beta_2': 0.006065390502538206, 'epsilon': 1.591737027128211e-05, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant', 'learning_rate_init': 0.023896775508448816, 'solver': 'adam'}


### Kinaseact

In [11]:
y = [int(row[3]) for row in kinaseact_unified_ds]
classifier, precision, recall, fscore, classifier_name, params, std_precision, std_recall, std_f1 = train_classifier(corpus_kinaseact, y)
joblib.dump(classifier, '../classifiers/language_kinase.joblib')
print(f'precision: {str(precision)} ± {str(std_precision)}')
print(f'recall: {str(recall)} ± {str(std_recall)}')
print(f'fscore: {str(fscore)} ± {str(std_f1)}')
print(f'selected model: {classifier_name}')
print(f'selected params {str(params)}')

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Finished training model and fitting best hyperparameters for LogisticRegression. F1 score: 0.9413149693397959
Fitting 5 folds for each of 100 candidates, totalling 500 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Finished training model and fitting best hyperparameters for MLPClassifier. F1 score: 0.945482729038555
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Finished training model and fitting best hyperparameters for SVC. F1 score: 0.9512353659601939
precision: 0.9628211731552525
recall: 0.9424812030075188
fscore: 0.9512353659601939
selected model: SVC
selected params {'C': 39.860641390563515, 'class_weight': 'balanced', 'coef0': 4.588030025927282, 'decision_function_shape': 'ovo', 'degree': 9, 'gamma': 0.0014902696445210633, 'kernel': 'rbf', 'random_state': 32, 'shrinking': False, 'tol': 0.0028710524642014297}
