In [6]:
#@title Imports, load sequences, helper functions
import torch
import random
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.utils import resample
from scipy.stats import spearmanr
from sklearn.metrics import ndcg_score
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, normalize
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.utils import resample
from scipy.stats import spearmanr
from scipy.stats import pearsonr
from sklearn.metrics import ndcg_score
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR, SVC
from sklearn.linear_model import Ridge
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, roc_auc_score, accuracy_score
# Prefer CUDA when torch reports it as available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Full LazBF table: both columns are extracted as plain Python lists.
df = pd.read_csv('./LazBF_sequences.csv')
LazBF_sequences, LazBF_labels = df['sequences'].tolist(), df['labels'].tolist()

# LazBF sample: labels are kept as a NumPy array because the training cells
# index them with the index arrays produced by the cross-validation splitter.
df = pd.read_csv('./LazBF_sample.csv')
LazBF_sample, LazBF_sample_labels = df['sequences'].tolist(), np.array(df['labels'].values)

# Full LazDEF table, same layout as the LazBF one.
df = pd.read_csv('./LazDEF_sequences.csv')
LazDEF_sequences, LazDEF_labels = df['sequences'].tolist(), df['labels'].tolist()

# LazDEF sample, same layout as the LazBF sample.
df = pd.read_csv('./LazDEF_sample.csv')
LazDEF_sample, LazDEF_sample_labels = df['sequences'].tolist(), np.array(df['labels'].values)

# Precomputed embedding matrices. The training cells pair every lazbf_* array
# with LazBF_sample_labels and every lazdef_* array with LazDEF_sample_labels.
# NOTE(review): the suffix presumably names the data the embedding model was
# trained on (none / LazBF / LazDEF) - confirm against the embedding script.
lazbf_mlm_none = np.load("./LazBF_mlm_none.npy")
lazbf_mlm_lazbf = np.load("./LazBF_mlm_LazBF.npy")
lazbf_mlm_lazdef = np.load("./LazBF_mlm_LazDEF.npy")

lazdef_mlm_none = np.load("./LazDEF_mlm_none.npy")
lazdef_mlm_lazbf = np.load("./LazDEF_mlm_LazBF.npy")
lazdef_mlm_lazdef = np.load("./LazDEF_mlm_LazDEF.npy")

# Display names for the scores, in the order print_metrics returns them.
metric_names = [
    'Accuracy',
    'Precision',
    'Recall',
    'F1 Score',
    'Auroc',
]

def print_metrics(y_true, y_pred, flip_below_half=True):
  """Return accuracy, precision, recall, F1 and AUROC for one prediction set.

  Despite the name, nothing is printed. The scores come back as a list in the
  same order as ``metric_names``.

  Args:
    y_true: ground-truth labels.
    y_pred: predicted labels, passed unchanged to every scikit-learn scorer.
    flip_below_half: when True (the default, which keeps the historical
      behaviour) every score below 0.5 is replaced by ``1 - score``. Pass
      False to get the raw scikit-learn values.

  Returns:
    A list of five scores: [accuracy, precision, recall, F1, AUROC].
  """
  scorers = (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score)
  metrics = [scorer(y_true, y_pred) for scorer in scorers]
  if flip_below_half:
    # NOTE(review): 1 - score equals the score of the label-inverted classifier
    # for accuracy and AUROC, but not for precision, recall or F1 - confirm
    # that reflecting those three is really intended before reporting them.
    metrics = [1 - score if score < 0.5 else score for score in metrics]
  return metrics

def print_avg_performance(performances):
  """Print the mean and standard deviation of every metric across folds.

  Args:
    performances: one row of scores per fold, each row ordered like
      ``metric_names`` (as returned by ``print_metrics``).

  Prints one line per entry of ``metric_names``. Columns beyond
  ``len(metric_names)`` are silently ignored because of ``zip``.
  """
  performances = np.array(performances)
  # Column-wise statistics: axis 0 runs over the folds.
  performances_mean = np.mean(performances, axis=0)
  # np.std uses its default ddof=0 here (population standard deviation).
  performances_std = np.std(performances, axis=0)
  for met, std, name in zip(performances_mean, performances_std, metric_names):
    print(f'Average {name}: {met} +- {std}')

In [None]:
#@title Train all downstream model types with High-N condition
# Upper bound on the number of training rows kept from each CV training fold.
tr_size = 40000

# Display names, in the same order as the estimators returned by every
# factory in high_n_runs.
model_names = ['Logistic regression', 'KNN classif', 'RF', 'AdaBoost', 'SVC', 'MLP']

# One entry per embedding set:
#   (tag used in the printout, embeddings, labels, estimator factory).
# The factories keep estimator construction lazy so that only the models of
# the embedding set currently being evaluated are alive at any time.
high_n_runs = [
    (
        'lazbf_mlm_none',
        lazbf_mlm_none,
        LazBF_sample_labels,
        lambda: [
            LogisticRegression(C=0.1),
            KNeighborsClassifier(n_neighbors=25, weights='distance'),
            RandomForestClassifier(criterion='log_loss', n_estimators=500),
            AdaBoostClassifier(learning_rate=1, n_estimators=500),
            SVC(C=5),
            MLPClassifier(hidden_layer_sizes=1000),
        ],
    ),
    (
        'lazdef_mlm_none',
        lazdef_mlm_none,
        LazDEF_sample_labels,
        lambda: [
            LogisticRegression(C=0.1),
            KNeighborsClassifier(n_neighbors=50),
            RandomForestClassifier(n_estimators=500),
            AdaBoostClassifier(learning_rate=1, n_estimators=500),
            SVC(C=5),
            MLPClassifier(hidden_layer_sizes=1000),
        ],
    ),
    (
        'lazbf_mlm_lazbf',
        lazbf_mlm_lazbf,
        LazBF_sample_labels,
        lambda: [
            LogisticRegression(C=0.1, penalty=None),
            KNeighborsClassifier(n_neighbors=25, weights='distance'),
            RandomForestClassifier(),
            AdaBoostClassifier(learning_rate=0.1, n_estimators=500),
            SVC(C=10),
            MLPClassifier(hidden_layer_sizes=1000),
        ],
    ),
    (
        'lazdef_mlm_lazbf',
        lazdef_mlm_lazbf,
        LazDEF_sample_labels,
        lambda: [
            LogisticRegression(C=0.1),
            KNeighborsClassifier(n_neighbors=50, weights='distance'),
            RandomForestClassifier(criterion='log_loss', n_estimators=500),
            AdaBoostClassifier(learning_rate=1, n_estimators=500),
            SVC(C=5),
            MLPClassifier(hidden_layer_sizes=1000),
        ],
    ),
    (
        'lazbf_mlm_lazdef',
        lazbf_mlm_lazdef,
        LazBF_sample_labels,
        lambda: [
            LogisticRegression(C=0.1, penalty=None),
            KNeighborsClassifier(n_neighbors=50),
            RandomForestClassifier(criterion='log_loss', n_estimators=500),
            AdaBoostClassifier(learning_rate=1, n_estimators=500),
            SVC(C=5),
            MLPClassifier(hidden_layer_sizes=1000),
        ],
    ),
    (
        'lazdef_mlm_lazdef',
        lazdef_mlm_lazdef,
        LazDEF_sample_labels,
        lambda: [
            LogisticRegression(C=10, penalty=None),
            KNeighborsClassifier(n_neighbors=50),
            RandomForestClassifier(n_estimators=500),
            AdaBoostClassifier(learning_rate=1, n_estimators=500),
            SVC(C=1),
            MLPClassifier(hidden_layer_sizes=1000),
        ],
    ),
]

for emb_tag, embs, labels, build_models in high_n_runs:
    model_list = build_models()
    for model_type, name in zip(model_list, model_names):
        # A fresh, unseeded shuffled 5-fold split is drawn for every model.
        kf = StratifiedKFold(n_splits=5, shuffle=True)
        print("---")
        print(f'{name} performance on {emb_tag}')
        performances = []
        for train_index, test_index in kf.split(embs, labels):
            # Train on at most tr_size rows of the fold; test on the full test fold.
            # NOTE(review): scikit-learn appears to return train_index in
            # ascending order, so this slice keeps the lowest-index rows rather
            # than a random subset - confirm that is intended.
            X_train, X_test = embs[train_index][:tr_size], embs[test_index]
            y_train, y_test = labels[train_index][:tr_size], labels[test_index]

            # Standardise, project onto 50 principal components, then classify.
            steps = [
                ('scaler', StandardScaler()),
                ('pca', PCA(n_components=50)),
                ('classifier', model_type)
            ]
            pipeline = Pipeline(steps)
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)
            # Map the predictions to plain 0/1 ints (presumably the labels are
            # already 0/1, which makes this a no-op on the values - confirm).
            integer_predictions = [1 if p >= 0.5 else 0 for p in y_pred]

            performances.append(print_metrics(y_test, integer_predictions))
        print_avg_performance(performances)

In [None]:
#@title Train all downstream model types with Medium-N condition
# Upper bound on the number of training rows kept from each CV training fold.
tr_size = 1000

# Display names, in the same order as the estimators returned by every
# factory in medium_n_runs.
model_names = ['SVC', 'MLP', 'Logistic regression', 'RF', 'AdaBoost', 'KNN classif']

# One entry per embedding set:
#   (tag used in the printout, embeddings, labels, estimator factory).
# The factories keep estimator construction lazy so that only the models of
# the embedding set currently being evaluated are alive at any time.
medium_n_runs = [
    (
        'lazbf_mlm_none',
        lazbf_mlm_none,
        LazBF_sample_labels,
        lambda: [
            SVC(C=10, kernel='linear'),
            MLPClassifier(hidden_layer_sizes=500),
            LogisticRegression(C=10, penalty=None),
            RandomForestClassifier(n_estimators=200),
            AdaBoostClassifier(learning_rate=1, n_estimators=200),
            KNeighborsClassifier(n_neighbors=50, weights='distance'),
        ],
    ),
    (
        'lazdef_mlm_none',
        lazdef_mlm_none,
        LazDEF_sample_labels,
        lambda: [
            SVC(C=1),
            MLPClassifier(hidden_layer_sizes=100),
            LogisticRegression(C=0.1, penalty=None),
            RandomForestClassifier(criterion='entropy', n_estimators=200),
            AdaBoostClassifier(learning_rate=0.1, n_estimators=500),
            KNeighborsClassifier(n_neighbors=50),
        ],
    ),
    (
        'lazbf_mlm_lazbf',
        lazbf_mlm_lazbf,
        LazBF_sample_labels,
        lambda: [
            SVC(C=0.1, kernel='linear'),
            MLPClassifier(activation='tanh', hidden_layer_sizes=50),
            LogisticRegression(C=0.1),
            RandomForestClassifier(criterion='entropy', n_estimators=200),
            AdaBoostClassifier(learning_rate=0.1, n_estimators=200),
            KNeighborsClassifier(n_neighbors=25),
        ],
    ),
    (
        'lazdef_mlm_lazbf',
        lazdef_mlm_lazbf,
        LazDEF_sample_labels,
        lambda: [
            SVC(C=1, kernel='linear'),
            MLPClassifier(activation='tanh', hidden_layer_sizes=100),
            LogisticRegression(C=0.1),
            RandomForestClassifier(n_estimators=500),
            AdaBoostClassifier(learning_rate=1, n_estimators=200),
            KNeighborsClassifier(n_neighbors=25),
        ],
    ),
    (
        'lazbf_mlm_lazdef',
        lazbf_mlm_lazdef,
        LazBF_sample_labels,
        lambda: [
            SVC(C=0.1, kernel='linear'),
            MLPClassifier(hidden_layer_sizes=500),
            LogisticRegression(C=0.1),
            RandomForestClassifier(criterion='log_loss', n_estimators=500),
            AdaBoostClassifier(learning_rate=1, n_estimators=200),
            KNeighborsClassifier(n_neighbors=50),
        ],
    ),
    (
        'lazdef_mlm_lazdef',
        lazdef_mlm_lazdef,
        LazDEF_sample_labels,
        lambda: [
            SVC(C=1),
            MLPClassifier(hidden_layer_sizes=100),
            LogisticRegression(C=5, penalty=None),
            RandomForestClassifier(criterion='entropy', n_estimators=50),
            AdaBoostClassifier(learning_rate=0.1, n_estimators=500),
            KNeighborsClassifier(n_neighbors=25),
        ],
    ),
]

for emb_tag, embs, labels, build_models in medium_n_runs:
    model_list = build_models()
    for model_type, name in zip(model_list, model_names):
        # A fresh, unseeded shuffled 5-fold split is drawn for every model.
        kf = StratifiedKFold(n_splits=5, shuffle=True)
        print("---")
        print(f'{name} performance on {emb_tag}')
        performances = []
        for train_index, test_index in kf.split(embs, labels):
            # Train on at most tr_size rows of the fold; test on the full test fold.
            # NOTE(review): scikit-learn appears to return train_index in
            # ascending order, so this slice keeps the lowest-index rows rather
            # than a random subset - confirm that is intended.
            X_train, X_test = embs[train_index][:tr_size], embs[test_index]
            y_train, y_test = labels[train_index][:tr_size], labels[test_index]

            # Standardise, project onto 50 principal components, then classify.
            steps = [
                ('scaler', StandardScaler()),
                ('pca', PCA(n_components=50)),
                ('classifier', model_type)
            ]
            pipeline = Pipeline(steps)
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)
            # Map the predictions to plain 0/1 ints (presumably the labels are
            # already 0/1, which makes this a no-op on the values - confirm).
            integer_predictions = [1 if p >= 0.5 else 0 for p in y_pred]

            performances.append(print_metrics(y_test, integer_predictions))
        print_avg_performance(performances)

In [None]:
#@title Train all downstream model types with Low-N condition
# Upper bound on the number of training rows kept from each CV training fold.
tr_size = 100

# Display names, in the same order as the estimators returned by every
# factory in low_n_runs.
model_names = ['SVC', 'MLP', 'Logistic regression', 'RF', 'AdaBoost', 'KNN classif']

# One entry per embedding set:
#   (tag used in the printout, embeddings, labels, estimator factory).
# The factories keep estimator construction lazy so that only the models of
# the embedding set currently being evaluated are alive at any time.
low_n_runs = [
    (
        'lazbf_mlm_none',
        lazbf_mlm_none,
        LazBF_sample_labels,
        lambda: [
            SVC(C=5),
            MLPClassifier(hidden_layer_sizes=100),
            LogisticRegression(C=0.1, penalty=None),
            RandomForestClassifier(criterion='entropy'),
            AdaBoostClassifier(learning_rate=1, n_estimators=500),
            KNeighborsClassifier(),
        ],
    ),
    (
        'lazdef_mlm_none',
        lazdef_mlm_none,
        LazDEF_sample_labels,
        lambda: [
            SVC(C=0.1, kernel='linear'),
            MLPClassifier(hidden_layer_sizes=1000),
            LogisticRegression(C=0.1),
            RandomForestClassifier(criterion='entropy', n_estimators=500),
            AdaBoostClassifier(learning_rate=1, n_estimators=200),
            KNeighborsClassifier(n_neighbors=10),
        ],
    ),
    (
        'lazbf_mlm_lazbf',
        lazbf_mlm_lazbf,
        LazBF_sample_labels,
        lambda: [
            SVC(C=0.1),
            MLPClassifier(hidden_layer_sizes=500),
            LogisticRegression(C=0.1),
            RandomForestClassifier(criterion='entropy', n_estimators=200),
            AdaBoostClassifier(learning_rate=5, n_estimators=200),
            KNeighborsClassifier(),
        ],
    ),
    (
        'lazdef_mlm_lazbf',
        lazdef_mlm_lazbf,
        LazDEF_sample_labels,
        lambda: [
            SVC(C=5),
            MLPClassifier(hidden_layer_sizes=750),
            LogisticRegression(C=0.1),
            RandomForestClassifier(n_estimators=200),
            AdaBoostClassifier(learning_rate=1, n_estimators=500),
            KNeighborsClassifier(n_neighbors=50, weights='distance'),
        ],
    ),
    (
        'lazbf_mlm_lazdef',
        lazbf_mlm_lazdef,
        LazBF_sample_labels,
        lambda: [
            SVC(C=0.1, kernel='linear'),
            MLPClassifier(hidden_layer_sizes=750),
            LogisticRegression(C=0.1),
            RandomForestClassifier(criterion='log_loss', n_estimators=50),
            AdaBoostClassifier(learning_rate=1, n_estimators=500),
            KNeighborsClassifier(n_neighbors=10),
        ],
    ),
    (
        'lazdef_mlm_lazdef',
        lazdef_mlm_lazdef,
        LazDEF_sample_labels,
        lambda: [
            SVC(C=1),
            MLPClassifier(activation='tanh', hidden_layer_sizes=500),
            LogisticRegression(C=0.1),
            RandomForestClassifier(criterion='log_loss', n_estimators=200),
            AdaBoostClassifier(learning_rate=0.1),
            KNeighborsClassifier(n_neighbors=50),
        ],
    ),
]

for emb_tag, embs, labels, build_models in low_n_runs:
    model_list = build_models()
    for model_type, name in zip(model_list, model_names):
        # A fresh, unseeded shuffled 5-fold split is drawn for every model.
        kf = StratifiedKFold(n_splits=5, shuffle=True)
        print("---")
        print(f'{name} performance on {emb_tag}')
        performances = []
        for train_index, test_index in kf.split(embs, labels):
            # Train on at most tr_size rows of the fold; test on the full test fold.
            # NOTE(review): scikit-learn appears to return train_index in
            # ascending order, so this slice keeps the lowest-index rows rather
            # than a random subset - confirm that is intended.
            X_train, X_test = embs[train_index][:tr_size], embs[test_index]
            y_train, y_test = labels[train_index][:tr_size], labels[test_index]

            # Standardise, project onto 50 principal components, then classify.
            steps = [
                ('scaler', StandardScaler()),
                ('pca', PCA(n_components=50)),
                ('classifier', model_type)
            ]
            pipeline = Pipeline(steps)
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)
            # Map the predictions to plain 0/1 ints (presumably the labels are
            # already 0/1, which makes this a no-op on the values - confirm).
            integer_predictions = [1 if p >= 0.5 else 0 for p in y_pred]

            performances.append(print_metrics(y_test, integer_predictions))
        print_avg_performance(performances)