In [None]:
# set WORKDIR to the top of experiment repository
# NOTE(review): `%cd ..` is relative to the current directory, so re-running this cell
# without restarting the kernel moves up one more level each time.
# Presumably the relative data path and the `src` imports in later cells depend on this — TODO confirm.
%cd ..
# show the working directory that is now in effect
%pwd

In [None]:
import random

import mlflow
import pandas as pd
import torch
from sklearn.metrics import matthews_corrcoef
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
from transformers.pipelines.text_classification import ClassificationFunction

from src.settings import (
    MLFLOW_TRACKING_USERNAME,
    EXPERIMENT_NAME,
    )
from src.utils import timestamp


# Device index used when building the pipeline: 0 if CUDA is available, -1 if it is not
device = -1 if not torch.cuda.is_available() else 0

# Candidate models for evaluation, keyed by a short alias
all_models = dict([
    ('all-mpnet-base-v2', 'sentence-transformers/all-mpnet-base-v2'),
    ('stsb-roberta-large', 'cross-encoder/stsb-roberta-large'),
    ('stsb-roberta-base', 'cross-encoder/stsb-roberta-base'),
    ('Legal-BERT', 'nlpaueb/legal-bert-base-uncased'),
    ('EURLEX-BERT', 'nlpaueb/bert-base-uncased-eurlex'),
    ('SciBERT', 'allenai/scibert_scivocab_uncased'),
])

# The model id evaluated by the rest of the notebook; change the alias to switch models
selected_model = all_models['stsb-roberta-base']


In [None]:
# read the test split from parquet
df_test = pd.read_parquet('data/test_clean.parquet')

# ground-truth labels, in row order
labels_true = df_test['label'].tolist()

# (text, text_b) tuples, one per row
sentence_pairs = list(zip(*(df_test[column].tolist() for column in ('text', 'text_b'))))

# the same pairs as a list of dicts, the input format passed to the transformers pipeline
sentence_pairs_lods = [
    {"text": first, "text_pair": second}
    for first, second in sentence_pairs
]


In [None]:
# Load the selected model and its matching tokenizer
model = AutoModelForSequenceClassification.from_pretrained(selected_model)
tokenizer = AutoTokenizer.from_pretrained(selected_model)

# Text-classification pipeline on the chosen device; SIGMOID is requested as the
# function applied to the model output, and each prediction exposes a 'score' entry.
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    padding=True,
    truncation=True,
    # max_length = 512,
    device=device,
    function_to_apply=ClassificationFunction.SIGMOID,
)

# Run inference over every sentence pair of the test set
predictions = pipe(sentence_pairs_lods)

# Pull the scores out once instead of re-reading each prediction dict per threshold
scores = [prediction['score'] for prediction in predictions]

# binarization: sweep candidate decision thresholds and report the MCC for each,
# labelling every line so a value can be traced back to its threshold
for threshold in [
    0.33,
    0.53,
    0.66,
    0.75,
    0.85,
]:
    # scores at or below the threshold map to class 0, everything else to class 1
    labels_pred = [0 if score <= threshold else 1 for score in scores]
    matthews_corrcoef_values = matthews_corrcoef(y_true=labels_true, y_pred=labels_pred)
    print(f'threshold={threshold}: MCC={matthews_corrcoef_values}')


In [None]:
# Placeholder run configuration; every entry is logged to MLflow as a run parameter
config = {
    'test_key': 'test_value',
    'num_epochs': 5,
}

# init mlflow experiment (use existing one)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    # An unknown experiment name yields None; fail here with a clear message
    # instead of an AttributeError on `experiment.experiment_id` below.
    raise RuntimeError(
        f'MLflow experiment "{EXPERIMENT_NAME}" was not found; create it before running this cell.'
    )

# Run name = last path component of the model id + timestamp.
# Built outside the f-string so no quotes are nested inside it (reusing the same
# quote type inside an f-string is a SyntaxError before Python 3.12); [-1] also
# handles model ids that contain no '/'.
model_short_name = selected_model.split('/')[-1]
run_name = f'{model_short_name}_{timestamp()}'

# run experiment; the `with` block ends the run on exit, so no explicit end_run() is needed
with mlflow.start_run(experiment_id=experiment.experiment_id, log_system_metrics=True) as run:
    # set run name
    mlflow.set_tag(key='mlflow.runName', value=run_name)

    # log parameters from config
    mlflow.log_params(config)

    for epoch in range(config['num_epochs']):
        # model.train()
        # placeholder metric: epoch index plus a random integer in [1, 100]
        mlflow.log_metric("random_metric", epoch + random.randint(1, 100), step=epoch)


- mlflow
- Implement a classic transformer-based classifier.
    - Split the training data into train and validation sets.
    - Fine-tune the model.
    - Test it on the test data.
- Obtain quality metrics on the test set. Include the F1 score and the [Matthews correlation coefficient](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.matthews_corrcoef.html).