In [1]:
import os

import pandas as pd

data_dir = "data/readability"
model_out_dir = os.path.join(data_dir, "model_out")
train_csv = pd.read_csv(os.path.join(data_dir, "train.csv"))

In [2]:
import torch
from sklearn import model_selection
from torch.utils.data import Dataset
from transformers import AutoTokenizer


class ClassificationDataset(Dataset):
    def __init__(self, data, tokenizer):
        super().__init__()
        self.tokens = tokenizer(
            data["excerpt"].tolist(),
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        if "target" in data.columns:
            self.lables = data["target"].tolist()

    def __len__(self):
        return len(self.tokens["input_ids"])

    def __getitem__(self, index):
        return_dict = {
            "input_ids": self.tokens["input_ids"][index],
            "attention_mask": self.tokens["attention_mask"][index],
        }
        if "token_type_ids" in self.tokens:
            return_dict.update({"token_type_ids": self.tokens["token_type_ids"][index]})
        if hasattr(self, "lables"):
            return_dict.update({"labels": torch.tensor(self.lables[index])})
        return return_dict

In [3]:
from torch import nn


class BertBasedClassifier(nn.Module):
    def __init__(self, bert, dropout=0.3):
        super().__init__()
        self.bert = bert
        bert_output_size = bert.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        self.attention = nn.Sequential(
            nn.Linear(bert_output_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1),
        )
        self.regressor = nn.Linear(bert_output_size, 1)

    def forward(
        self, input_ids=None, token_type_ids=None, attention_mask=None, labels=None
    ):
        if token_type_ids is None:
            bert_output = self.bert(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )
        elif attention_mask is None:
            bert_output = self.bert(input_ids=input_ids, token_type_ids=token_type_ids)
        else:
            bert_output = self.bert(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )
        last_layer_hidden_states = bert_output.last_hidden_state
        attention = self.attention(last_layer_hidden_states)
        output = torch.sum(attention * last_layer_hidden_states, dim=1)
        logits = self.regressor(self.dropout(output))
        if labels is None:
            return logits
        loss_fn = torch.nn.MSELoss()
        loss = loss_fn(logits.reshape(labels.shape), labels)
        return loss, logits

In [4]:
from transformers import AutoModel, Trainer, TrainerCallback, TrainingArguments
import numpy as np


class CollateMetricsCallback(TrainerCallback):
    def __init__(self):
        super().__init__()
        self.rmse = 0.0

    def on_train_end(self, args, state, control, **kwargs):
        self.rmse += state.best_metric


def train(
    model,
    model_name,
    train_data,
    eval_data=None,
    output_dir=None,
    model_save_dir=None,
    logging_dir=None,
    batch_size=8,
    num_train_epochs=5,
    learning_rate=5e-6,
    trainer_callback=CollateMetricsCallback(),
):
    torch.cuda.empty_cache()
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, clean_up_tokenization_spaces=True
    )
    output_dir = os.path.join(
        model_out_dir, f"{model_name}-output" if output_dir is None else output_dir
    )
    model_save_dir = os.path.join(
        model_out_dir,
        f"{model_name}-model" if model_save_dir is None else model_save_dir,
    )
    logging_dir = os.path.join(
        model_out_dir, f"{model_name}-runs" if logging_dir is None else logging_dir
    )
    print(
        f"output_dir: {output_dir}, model_save_dir: {model_save_dir}, logging_dir: {logging_dir}"
    )
    train_dataset = ClassificationDataset(train_data, tokenizer)
    if eval_data is not None:
        eval_dataset = ClassificationDataset(eval_data, tokenizer)
        args = TrainingArguments(
            output_dir=output_dir,
            overwrite_output_dir=True,
            logging_dir=logging_dir,
            report_to=["tensorboard"],
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=num_train_epochs,
            learning_rate=learning_rate,
            fp16=True,
            eval_strategy="steps",
            logging_steps=0.01,
            save_steps=0.2,
            eval_steps=0.2,
            # torch_empty_cache_steps=1,
            weight_decay=0.01,
            load_best_model_at_end=True,
            greater_is_better=False,
            metric_for_best_model="eval_rmse",
            save_only_model=True,
            lr_scheduler_type="cosine",
            warmup_ratio=0.1,
        )
        trainer = Trainer(
            args=args,
            model=model,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=lambda x: {
                "rmse": np.sqrt(
                    np.mean(
                        (x.predictions.reshape(x.label_ids.shape) - x.label_ids) ** 2
                    )
                )
            },
            callbacks=[trainer_callback],
        )
    else:
        args = TrainingArguments(
            output_dir=output_dir,
            overwrite_output_dir=True,
            logging_dir=logging_dir,
            report_to=["tensorboard"],
            per_device_train_batch_size=batch_size,
            num_train_epochs=num_train_epochs,
            learning_rate=learning_rate,
            fp16=True,
            logging_steps=0.01,
            eval_steps=0.2,
            save_steps=0.2,
            # torch_empty_cache_steps=1,
            weight_decay=0.01,
            save_only_model=True,
            lr_scheduler_type="cosine",
            warmup_ratio=0.1,
        )
        trainer = Trainer(
            args=args,
            model=model,
            train_dataset=train_dataset,
        )
    trainer.train()
    trainer.save_model(model_save_dir)
    return trainer_callback.rmse

In [5]:
def wrapped_bert(bert_name, hidden_dropout_prob=None):
    def init_model():
        if hidden_dropout_prob is not None:
            bert = AutoModel.from_pretrained(
                bert_name,
                hidden_dropout_prob=hidden_dropout_prob,
            )
        else:
            bert = AutoModel.from_pretrained(bert_name)
        return BertBasedClassifier(bert)

    return init_model


train_data, eval_data = model_selection.train_test_split(
    train_csv, test_size=0.2, random_state=42
)

hyperparameters = {
    "bert-large-cased": {
        "model_init": wrapped_bert("bert-large-cased", hidden_dropout_prob=0.0),
        "train_params": {
            "batch_size": 8,
            "num_train_epochs": 3,
        },
    },
    "albert-xxlarge-v2": {
        "model_init": wrapped_bert("albert-xxlarge-v2", hidden_dropout_prob=0.0),
    },
    "roberta-large": {
        "model_init": wrapped_bert("roberta-large", hidden_dropout_prob=0.0),
    },
    "deberta-v2-xlarge": {
        "model_name": "microsoft/deberta-v2-xlarge",
        "model_init": wrapped_bert(
            "microsoft/deberta-v2-xlarge", hidden_dropout_prob=0.0
        ),
        "train_params": {
            "batch_size": 2,
            "num_train_epochs": 5,
            "output_dir": "deberta-v2-xlarge-output",
            "model_save_dir": "deberta-v2-xlarge-model",
            "logging_dir": "deberta-v2-xlarge-runs",
        },
    },
    "muppet-roberta-large": {
        "model_name": "facebook/muppet-roberta-large",
        "model_init": wrapped_bert(
            "facebook/muppet-roberta-large", hidden_dropout_prob=0.0
        ),
        "train_params": {
            "output_dir": "muppet-roberta-large-output",
            "model_save_dir": "muppet-roberta-large-model",
            "logging_dir": "muppet-roberta-large-runs",
        },
    },
}


def model_select(hyperparameters):
    metrics = {}
    for key, value in hyperparameters.items():
        print(f"Training {key}....")
        model = value["model_init"]()
        model_name = value.get("model_name", key)
        train_params = value.get("train_params", {})
        train(
            model,
            model_name,
            train_data,
            eval_data,
            **train_params,
        )
    return pd.DataFrame(metrics.items(), columns=["model", "best_rmse"])


select_results = model_select(hyperparameters)

Training bert-large-cased....
output_dir: data/readability/model_out/bert-large-cased-output, model_save_dir: data/readability/model_out/bert-large-cased-model, logging_dir: data/readability/model_out/bert-large-cased-runs


Step,Training Loss,Validation Loss,Rmse
171,0.4393,0.393092,0.62697
342,0.2219,0.315312,0.561527
513,0.1885,0.300294,0.547991
684,0.1901,0.276998,0.526306


Training albert-xxlarge-v2....
output_dir: data/readability/model_out/albert-xxlarge-v2-output, model_save_dir: data/readability/model_out/albert-xxlarge-v2-model, logging_dir: data/readability/model_out/albert-xxlarge-v2-runs


Step,Training Loss,Validation Loss,Rmse
284,0.3898,0.337839,0.581239
568,0.2921,0.283436,0.532387
852,0.2037,0.267793,0.517488
1136,0.128,0.260339,0.510235
1420,0.0929,0.259897,0.509801


Training roberta-large....


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


output_dir: data/readability/model_out/roberta-large-output, model_save_dir: data/readability/model_out/roberta-large-model, logging_dir: data/readability/model_out/roberta-large-runs


Step,Training Loss,Validation Loss,Rmse
284,0.3679,0.308041,0.555014
568,0.2749,0.24011,0.49001
852,0.1257,0.242153,0.49209
1136,0.0598,0.246342,0.496328
1420,0.0579,0.244904,0.494877


Training deberta-v2-xlarge....
output_dir: data/readability/model_out/deberta-v2-xlarge-output, model_save_dir: data/readability/model_out/deberta-v2-xlarge-model, logging_dir: data/readability/model_out/deberta-v2-xlarge-runs


Step,Training Loss,Validation Loss,Rmse
1134,0.3361,0.442573,0.665261
2268,0.2601,0.370421,0.608622
3402,0.2002,0.24632,0.496307
4536,0.1273,0.225542,0.474913
5670,0.083,0.22181,0.470967


Training muppet-roberta-large....


Some weights of RobertaModel were not initialized from the model checkpoint at facebook/muppet-roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


output_dir: data/readability/model_out/muppet-roberta-large-output, model_save_dir: data/readability/model_out/muppet-roberta-large-model, logging_dir: data/readability/model_out/muppet-roberta-large-runs


Step,Training Loss,Validation Loss,Rmse
284,0.4278,0.311388,0.558021
568,0.2782,0.280583,0.529701
852,0.1564,0.270674,0.520263
1136,0.0731,0.256173,0.506135
1420,0.0523,0.254979,0.504954


In [6]:
from safetensors.torch import load_model
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
labels = eval_data["target"].values
for key, value in hyperparameters.items():
    model = value["model_init"]()
    model_name = value.get("model_name", key)
    model_save_dir = os.path.join(model_out_dir, f"{key}-model")
    load_model(model, os.path.join(model_save_dir, "model.safetensors"))
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, clean_up_tokenization_spaces=True
    )
    eval_dataloader = DataLoader(
        ClassificationDataset(eval_data, tokenizer), batch_size=2, shuffle=False
    )
    model.to(device)
    all_logits = []
    for inputs in eval_dataloader:
        inputs.pop("labels")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            logits = model(**inputs)
            all_logits.append(logits.detach().cpu().numpy())
    all_logits = np.concatenate(all_logits).flatten()
    pd.DataFrame({"labels": labels, "target": all_logits}).to_csv(
        os.path.join(model_out_dir, f"{key}-eval.csv"), index=False
    )
    del model
    torch.cuda.empty_cache()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at facebook/muppet-roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from scipy.optimize import minimize
from sklearn.metrics import root_mean_squared_error

all_logits = []
for key, value in hyperparameters.items():
    eval_csv = pd.read_csv(os.path.join(model_out_dir, f"{key}-eval.csv"))
    eval_logits = eval_csv["target"].values
    all_logits.append(eval_logits)


def ensemble(weights, *logits):
    return np.sum(
        np.concatenate([w * np.array(l) for w, l in zip(weights, logits)], axis=1),
        axis=0,
    )


def weight_sum_loss(weights):
    return root_mean_squared_error(labels, ensemble(weights, all_logits))


init_guess = [1.0 / len(all_logits)] * len(all_logits)
minimize(weight_sum_loss, init_guess, method="Nelder-Mead")

       message: Optimization terminated successfully.
       success: True
        status: 0
           fun: 0.461673542701431
             x: [ 2.076e-01  2.064e-01  2.027e-01  1.911e-01  1.932e-01]
           nit: 49
          nfev: 98
 final_simplex: (array([[ 2.076e-01,  2.064e-01, ...,  1.911e-01,
                         1.932e-01],
                       [ 2.076e-01,  2.064e-01, ...,  1.912e-01,
                         1.932e-01],
                       ...,
                       [ 2.076e-01,  2.064e-01, ...,  1.912e-01,
                         1.932e-01],
                       [ 2.077e-01,  2.064e-01, ...,  1.912e-01,
                         1.932e-01]]), array([ 4.617e-01,  4.617e-01,  4.617e-01,  4.617e-01,
                        4.617e-01,  4.617e-01]))

In [8]:
for key, value in hyperparameters.items():
    model_name = value.get("model_name", key)
    AutoModel.from_pretrained(model_name).save_pretrained(
        os.path.join(model_out_dir, f"{key}-structure")
    )
    AutoTokenizer.from_pretrained(model_name).save_pretrained(
        os.path.join(model_out_dir, f"{key}-tokenizer")
    )

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at facebook/muppet-roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
import os

ensemble_classifier_dir = os.path.join(model_out_dir, "bert-based-classifier")
if not os.path.exists(ensemble_classifier_dir):
    os.mkdir(ensemble_classifier_dir)
for key in hyperparameters.keys():
    os.rename(
        os.path.join(model_out_dir, f"{key}-model", "model.safetensors"),
        os.path.join(ensemble_classifier_dir, f"{key}.safetensors"),
    )