In [36]:
from string import punctuation
from nltk import word_tokenize
from nltk.corpus import stopwords
import sent2vec

expression_unified_ds = [line.strip().split("\t") for line in open("../unified_dataset/expression.tsv")][1:]
kinaseact_unified_ds = [line.strip().split("\t") for line in open("../unified_dataset/kinaseact.tsv")][1:]

stop_words = set(stopwords.words('english'))
def preprocess_sentence(text):
    text = text.replace('/', ' / ')
    text = text.replace('.-', ' .- ')
    text = text.replace('.', ' . ')
    text = text.replace('\'', ' \' ')
    text = text.lower()

    tokens = [token for token in word_tokenize(text) if token not in punctuation and token not in stop_words]

    return ' '.join(tokens)

sentences_only_expression_data = [preprocess_sentence(row[0]) for row in expression_unified_ds]
sentences_only_kinaseact_data = [preprocess_sentence(row[0]) for row in kinaseact_unified_ds]
bio_sent_vec_model_location = "../biosentvec/model.bin"

In [37]:
model_path = bio_sent_vec_model_location
biosentvec_model = sent2vec.Sent2vecModel()
try:
    biosentvec_model.load_model(model_path)
except Exception as e:
    print(e)
print('model successfully loaded')

model successfully loaded


In [38]:
corpus_expr = biosentvec_model.embed_sentences(sentences_only_expression_data)
corpus_kinaseact = biosentvec_model.embed_sentences(sentences_only_kinaseact_data)

In [39]:
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV, cross_validate, StratifiedKFold
from models import POSSIBLE_CLASSIFIERS


def train_classifier(X, y):

    best_score = 0
    best_classifier = None
    best_params = None
    best_classifier_name = ""

    stratified_k_folds = StratifiedKFold(n_splits=5)
    
    scoring = {'precision': make_scorer(precision_score),
           'recall': make_scorer(recall_score),
           'f1': make_scorer(f1_score)}

    best_results = {}
    best_index = 0
    # Iterate over classifiers and perform grid search
    for classifier_name, classifier_info in POSSIBLE_CLASSIFIERS.items():
        random_search = RandomizedSearchCV(estimator=classifier_info['model'], n_iter=100,
                                           param_distributions=classifier_info['params'], cv=stratified_k_folds,
                                           scoring=scoring, refit='f1',
                                           verbose=1, n_jobs=-1)
        random_search.fit(X, y)

        print(f"Finished training model and fitting best hyperparameters for {classifier_name}. F1 score: "
              f"{str(random_search.best_score_)}")

        if random_search.best_score_ > best_score:
            best_score = random_search.best_score_
            best_classifier = random_search.best_estimator_
            best_params = random_search.best_params_
            best_classifier_name = classifier_name
            best_results = random_search.cv_results_
            best_index = random_search.best_index_

    # Retrieve the average precision, recall, and F1 score
    average_precision = best_results['mean_test_precision'][best_index]
    average_recall = best_results['mean_test_recall'][best_index]
    average_f1 = best_results['mean_test_f1'][best_index]

    # Return the trained model and performance metrics
    return best_classifier, average_precision, average_recall, average_f1, best_classifier_name, best_params


## Task 1

### Expression

In [40]:
import joblib

y = [int(row[1]) for row in expression_unified_ds]
classifier, precision, recall, fscore, classifier_name, params = train_classifier(corpus_expr, y)
joblib.dump(classifier, '../classifiers/all_info_expression.joblib')
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('selected model: {}'.format(classifier_name))
print('selected params {}'.format(str(params)))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for LogisticRegression. F1 score: 0.7310977872503861
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for RandomForestClassifier. F1 score: 0.34801255801255804
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for GradientBoostingClassifier. F1 score: 0.6511586887688582
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for XGBClassifier. F1 score: 0.6472743159647389
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for MLPClassifier. F1 score: 0.7291825845215676
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for SVC. F1 score:

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please

Finished training model and fitting best hyperparameters for SGDClassifier. F1 score: 0.6526044884931299
Fitting 5 folds for each of 20 candidates, totalling 100 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Finished training model and fitting best hyperparameters for Perceptron. F1 score: 0.6373678012018853
precision: 0.6995533666248405
recall: 0.8743983614951357
fscore: 0.7391292150654056
selected model: SVC
selected params {'C': 64.37642374776033, 'class_weight': 'balanced', 'coef0': 0.3684899812181808, 'decision_function_shape': 'ovo', 'degree': 1, 'gamma': 0.0017903389125618317, 'kernel': 'poly', 'random_state': 30, 'shrinking': True, 'tol': 0.0040083415667067435}


### Kinaseact

In [41]:
y = [int(row[1]) for row in kinaseact_unified_ds]
classifier, precision, recall, fscore, classifier_name, params = train_classifier(corpus_kinaseact, y)
joblib.dump(classifier, '../classifiers/all_info_kinase.joblib')
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('selected model: {}'.format(classifier_name))
print('selected params {}'.format(str(params)))

Fitting 5 folds for each of 20 candidates, totalling 100 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(aver

Finished training model and fitting best hyperparameters for LogisticRegression. F1 score: 0.7077209495661612
Fitting 5 folds for each of 20 candidates, totalling 100 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(aver

Finished training model and fitting best hyperparameters for RandomForestClassifier. F1 score: 0.38148661340285545
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for GradientBoostingClassifier. F1 score: 0.5846275343899057
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for XGBClassifier. F1 score: 0.5955191865786269
Fitting 5 folds for each of 20 candidates, totalling 100 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(aver

Finished training model and fitting best hyperparameters for MLPClassifier. F1 score: 0.6415431454779522
Fitting 5 folds for each of 20 candidates, totalling 100 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(aver

Finished training model and fitting best hyperparameters for SVC. F1 score: 0.7107365189489943
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for DecisionTreeClassifier. F1 score: 0.4703724170792074
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for KNeighborsClassifier. F1 score: 0.5950299076538904
Fitting 5 folds for each of 20 candidates, totalling 100 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(aver

Finished training model and fitting best hyperparameters for SGDClassifier. F1 score: 0.6027710195842495
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for Perceptron. F1 score: 0.576251730682177
precision: 0.6505218374783592
recall: 0.8922598870056497
fscore: 0.7107365189489943
selected model: SVC
selected params {'C': 134.70383060773347, 'class_weight': 'balanced', 'coef0': 0.5624640319470131, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 0.00015172209466466608, 'kernel': 'sigmoid', 'random_state': 36, 'shrinking': False, 'tol': 0.0022898460578036125}


## Task 2

### Expression

In [42]:
y = [int(row[2]) for row in expression_unified_ds]
classifier, precision, recall, fscore, classifier_name, params = train_classifier(corpus_expr, y)
joblib.dump(classifier, '../classifiers/curatable_expression.joblib')
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('selected model: {}'.format(classifier_name))
print('selected params {}'.format(str(params)))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for LogisticRegression. F1 score: 0.8070396912022819
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for RandomForestClassifier. F1 score: 0.6789640535704734
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for GradientBoostingClassifier. F1 score: 0.7530398071156241
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for XGBClassifier. F1 score: 0.7714985915688304
Fitting 5 folds for each of 20 candidates, totalling 100 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Finished training model and fitting best hyperparameters for MLPClassifier. F1 score: 0.8071656982212625
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for SVC. F1 score: 0.8104123264973208
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for DecisionTreeClassifier. F1 score: 0.5623456098245524
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for KNeighborsClassifier. F1 score: 0.7773921331621301
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for SGDClassifier. F1 score: 0.7881389124212538
Fitting 5 folds for each of 20 candidates, totalling 100 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Finished training model and fitting best hyperparameters for Perceptron. F1 score: 0.7585512233010898
precision: 0.7726421829173311
recall: 0.9082288401253917
fscore: 0.8104123264973208
selected model: SVC
selected params {'C': 114.41023411330886, 'class_weight': 'balanced', 'coef0': 0.47076244129353684, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 0.0007009990668551018, 'kernel': 'poly', 'random_state': 13, 'shrinking': False, 'tol': 0.002936876590278472}


### Kinaseact

In [43]:
y = [int(row[2]) for row in kinaseact_unified_ds]
classifier, precision, recall, fscore, classifier_name, params = train_classifier(corpus_kinaseact, y)
joblib.dump(classifier, '../classifiers/curatable_kinase.joblib')
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('selected model: {}'.format(classifier_name))
print('selected params {}'.format(str(params)))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for LogisticRegression. F1 score: 0.7441397247670236
Fitting 5 folds for each of 20 candidates, totalling 100 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Finished training model and fitting best hyperparameters for RandomForestClassifier. F1 score: 0.4611864725198253
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for GradientBoostingClassifier. F1 score: 0.6469834292223429
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for XGBClassifier. F1 score: 0.6892398794507287
Fitting 5 folds for each of 20 candidates, totalling 100 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Finished training model and fitting best hyperparameters for MLPClassifier. F1 score: 0.7345941809541252
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for SVC. F1 score: 0.7485967812947679
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for DecisionTreeClassifier. F1 score: 0.5511190771583243
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for KNeighborsClassifier. F1 score: 0.6373317125450879
Fitting 5 folds for each of 20 candidates, totalling 100 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Finished training model and fitting best hyperparameters for SGDClassifier. F1 score: 0.6525659229967895
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for Perceptron. F1 score: 0.622922732886914
precision: 0.7034790166244731
recall: 0.8712328767123287
fscore: 0.7485967812947679
selected model: SVC
selected params {'C': 87.43012124429951, 'class_weight': 'balanced', 'coef0': 2.1076130344459667, 'decision_function_shape': 'ovo', 'degree': 2, 'gamma': 0.000357167360705869, 'kernel': 'sigmoid', 'random_state': 45, 'shrinking': True, 'tol': 0.0037441318064746387}


## Task 3

### Expression

In [44]:
y = [int(row[3]) for row in expression_unified_ds]
classifier, precision, recall, fscore, classifier_name, params = train_classifier(corpus_expr, y)
joblib.dump(classifier, '../classifiers/language_expression.joblib')
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('selected model: {}'.format(classifier_name))
print('selected params {}'.format(str(params)))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for LogisticRegression. F1 score: 0.8792731939764987
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for RandomForestClassifier. F1 score: 0.7981126835013382
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for GradientBoostingClassifier. F1 score: 0.8505854194221735
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for XGBClassifier. F1 score: 0.856062495695776
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for MLPClassifier. F1 score: 0.8839990096469437
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for SVC. F1 score: 0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Finished training model and fitting best hyperparameters for SGDClassifier. F1 score: 0.8748457671679131
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for Perceptron. F1 score: 0.8552258748329054
precision: 0.8875662274636594
recall: 0.8938510534051936
fscore: 0.8839990096469437
selected model: MLPClassifier
selected params {'activation': 'logistic', 'alpha': 1.410611356392809e-05, 'beta_1': 0.003821723885372423, 'beta_2': 0.1908341084518872, 'epsilon': 1.8115881880847127e-06, 'hidden_layer_sizes': (500, 100, 500), 'learning_rate': 'adaptive', 'learning_rate_init': 0.010579984929116684, 'solver': 'adam'}


### Kinaseact

In [None]:
y = [int(row[3]) for row in kinaseact_unified_ds]
classifier, precision, recall, fscore, classifier_name, params = train_classifier(corpus_kinaseact, y)
joblib.dump(classifier, '../classifiers/language_kinase.joblib')
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('selected model: {}'.format(classifier_name))
print('selected params {}'.format(str(params)))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Finished training model and fitting best hyperparameters for LogisticRegression. F1 score: 0.9317180990488982
Fitting 5 folds for each of 20 candidates, totalling 100 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr