In [1]:
#pip uninstall simpletransformers transformers torch

In [None]:
import simpletransformers
from simpletransformers.classification import ClassificationModel, ClassificationArgs

import pandas as pd
import torch
import logging
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
import logging 
import os
# Set TensorFlow's native log-level variable for this process.
# NOTE(review): this runs after the library imports above; if any of them has
# already loaded TensorFlow the setting may come too late — confirm.
os.environ.update(TF_CPP_MIN_LOG_LEVEL='3')

# Raw author-annotation sheet; every later cell derives its data from this frame.
author_sheet_path = "./data/training/1229_subset_authorinfo.xlsx"
df_author = pd.read_excel(author_sheet_path, engine="openpyxl")

In [None]:
# One two-column view per author role: FA-prefixed columns go to df_first,
# LA-prefixed columns go to df_last. Each view holds that role's affiliation
# text and its domain label.
first_author_cols = ["FA AFFILIATION", "FA domain"]
last_author_cols = ["LA AFFILIATION", "LA domain"]

df_first = df_author.loc[:, first_author_cols]
df_last = df_author.loc[:, last_author_cols]

In [None]:
# Give both author roles the same two-column schema so they can be stacked.
df_first = df_first.rename(columns={'FA AFFILIATION': 'AFFILIATION', 'FA domain': 'DOMAIN'})
df_last = df_last.rename(columns={'LA AFFILIATION': 'AFFILIATION', 'LA domain': 'DOMAIN'})

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# produces the same stacked rows with a fresh 0..n-1 index.
df_combined = pd.concat([df_first, df_last], ignore_index=True)

# Collapse annotation typos and ambiguous entries onto the canonical labels.
# Matching is on the exact cell value (no regex), so "." only hits cells that
# are literally a single dot.
DOMAIN_FIXES = {
    "Both?": "Domain expert",
    "DOmain expert": "Domain expert",
    "Doma+K278": "Domain expert",
    ".": "other",
}
# Unlabelled rows (NaN) are folded into the catch-all "other" class.
df_combined["DOMAIN"] = df_combined["DOMAIN"].replace(DOMAIN_FIXES).fillna("other")

# Displayed as the cell output: sanity-check that only the expected labels remain.
df_combined["DOMAIN"].unique()

In [None]:
import numpy as np
# Encode the DOMAIN strings as integer category codes; the codes are the
# classification targets and the affiliation column is the model input.
df_combined["DOMAIN"] = df_combined["DOMAIN"].astype("category")
df_combined["DOMAIN_cat"] = df_combined["DOMAIN"].cat.codes
X = df_combined["AFFILIATION"]
y = df_combined["DOMAIN_cat"].astype(int)

## train valid split
# Five stratified 80/20 shuffles are generated with a fixed seed, but only the
# last one is used for training and validation.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
all_splits = list(sss.split(X, y))
train_index, test_index = all_splits[-1]

X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_valid, y_valid = X.iloc[test_index], y.iloc[test_index]

# Two-column frames named "text" / "labels", joined on the shared row index.
df_train = X_train.rename("text").to_frame().join(y_train.rename("labels"))
df_valid = X_valid.rename("text").to_frame().join(y_valid.rename("labels"))


In [None]:
import wandb
wandb.login()

# Candidate values for each hyperparameter the sweep is allowed to vary.
search_space = {
    'learning_rate': [3e-5, 2e-5],
    'num_train_epochs': [32, 64, 128, 256],
    'train_batch_size': [32, 64, 128, 256],
    'eval_batch_size': [8, 16, 32],
}

# Random search over the grid above, minimising the logged eval_loss.
# Other supported methods: grid, bayes.
sweep_config = dict(
    method='random',
    metric=dict(name='eval_loss', goal='minimize'),
    parameters={
        param: {'values': candidates}
        for param, candidates in search_space.items()
    },
)

# Register the sweep; the returned id is what the agent later attaches to.
sweep_id = wandb.sweep(sweep_config, project="author_biases_in_ml-0221")

# INFO for everything, but only warnings and above from the "transformers" logger.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [None]:
# Training arguments shared by every sweep trial. Hyperparameters that the
# sweep varies are not set here.
model_args = ClassificationArgs()

fixed_options = {
    # Rebuild features and reuse the output directory on every run.
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    # Evaluate on the eval frame while training.
    "evaluate_during_training": True,
    "use_multiprocessing": False,
    "lazy_loading": False,
    # Do not write optimizer state or intermediate checkpoints to disk.
    "save_optimizer_and_scheduler": False,
    "save_eval_checkpoints": False,
    "save_model_every_epoch": False,
    "wandb_project": "author_biases_in_ml-0221",
}
for option_name, option_value in fixed_options.items():
    setattr(model_args, option_name, option_value)

# Options that are currently disabled, kept for reference:
#model_args.num_train_epochs = 60
#model_args.use_early_stopping = True
#model_args.early_stopping_delta = 0.1
#model_args.early_stopping_metric = "mcc"
#model_args.early_stopping_metric_minimize = False
#model_args.early_stopping_patience = 5
#model_args.manual_seed = 4


def train():
    """Run one sweep trial: fine-tune BioBERT on df_train and evaluate on df_valid.

    Called by the W&B agent with no arguments. The trial's hyperparameters are
    read from ``wandb.config`` and handed to the model as ``sweep_config``; all
    other settings come from the module-level ``model_args``. The data frames
    ``df_train`` / ``df_valid`` are also read from module scope.

    The W&B run is always closed and GPU memory is always released, even when
    training or evaluation raises; the exception is then re-raised to the agent.
    """
    import gc

    wandb.init()

    model = None
    exit_code = 1  # reported to W&B unless the trial reaches the end
    try:
        # NOTE(review): num_labels is fixed at 3 while the label codes are derived
        # from the data — confirm the cleaned DOMAIN column has exactly 3 classes.
        model = ClassificationModel(
            "bert",
            "monologg/biobert_v1.1_pubmed",
            num_labels=3,
            args=model_args,
            use_cuda=True,
            sweep_config=wandb.config,
        )

        # Train the model, evaluating on the validation frame as it goes.
        model.train_model(df_train, eval_df=df_valid)

        # Final evaluation on the validation frame.
        model.eval_model(df_valid)
        exit_code = 0
    finally:
        # wandb.finish() replaces the deprecated wandb.join(); a non-zero exit
        # code marks the run as failed instead of leaving it open.
        wandb.finish(exit_code=exit_code)

        # Drop the model reference *before* emptying the cache: empty_cache()
        # only returns memory that no live tensor still occupies.
        del model
        gc.collect()
        torch.cuda.empty_cache()


wandb.agent(sweep_id, function=train)

In [None]:
"""#Alternative
def train():
    wandb.init()
    
    # Training arguments
    train_args = {
        'num_train_epochs' : 30,
        'evaluate_during_training': True,
        'save_eval_checkpoints': False,
        'overwrite_output_dir': True,
        'use_multiprocessing' : True,
        'reprocess_input_data' : True,
        'fp16': False,
        'use_early_stopping' : True,
        'early_stopping_delta' : 0.1,
        'early_stopping_metric' : "mcc",
        'early_stopping_metric_minimize' : False,
        'early_stopping_patience' : 5,
        'train_batch_size': wandb.config.train_batch_size,
        'eval_batch_size': wandb.config.eval_batch_size,
        'learning_rate': wandb.config.learning_rate,
        'wandb_project': "author_biases_in_ml-0221"
    }
    
    model = simpletransformers.classification.ClassificationModel("bert", "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext", num_labels=3, args=train_args, use_cuda=True)
 
    # Train the model
    model.train_model(df_train, eval_df = df_train)
    #wandb.save('model.h5')
    #PATH = wandb.run._run_id + '.h5'
    #torch.save(model, PATH)
    
    # Evaluate the model
    model.eval_model(df_valid)
    #wandb.save('model.h5')
 
wandb.agent(sweep_id,function=train)
"""