## Загрузим данные

In [None]:
! pip install evaluate
! pip install sentence-transformers

In [7]:
import pandas as pd
import numpy as np
import torch
import wandb

In [8]:
ds_train = pd.read_csv("../input/authorstexts/train_data.csv", usecols=[0, 2])
ds_test = pd.read_csv("../input/authorstexts/test_data.csv", usecols=[0, 2])

ds_train = ds_train.sample(frac=1)
ds_test = ds_test.sample(frac=1)

In [9]:
ds_train.shape, ds_test.shape

((47162, 2), (18139, 2))

In [10]:
wr2ids = {k: i for i, k in enumerate(sorted(ds_train.writer.unique()))}
wr2ids

{'Akunin': 0,
 'Averchenko': 1,
 'Belyaev': 2,
 'Bulgakov': 3,
 'Bunin': 4,
 'Chekhov': 5,
 'Dostoevsky': 6,
 'Dovlatov': 7,
 'Fadeev': 8,
 'Fray': 9,
 'Furmanov': 10,
 'Gaydar': 11,
 'Gogol': 12,
 'Goncharov': 13,
 'Gorky': 14,
 'Grin': 15,
 'Ilf_petrov': 16,
 'Kataev': 17,
 'Kazantsev': 18,
 'Kuprin': 19,
 'Leskov': 20,
 'Lukyanenko': 21,
 'Ostrovsky': 22,
 'Pasternak': 23,
 'Paustovskiy': 24,
 'Pelevin': 25,
 'Pikul': 26,
 'Prishvin': 27,
 'Pushkin': 28,
 'Saltykov-schedrin': 29,
 'Serafimovich': 30,
 'Sergeev-Thsenskiy': 31,
 'Shukshin': 32,
 'Solzhenitsin': 33,
 'Struhgatskie': 34,
 'Tolstoy': 35,
 'Turgenev': 36,
 'Zoschenko': 37}

In [11]:
ids2wr = {v: k for k, v in wr2ids.items()}

In [12]:
ds_train.reset_index(drop=True, inplace=True)
ds_test.reset_index(drop=True, inplace=True)

## SBERT with Sentence-Transformers

In [13]:
from sentence_transformers.readers import InputExample
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.datasets import SentenceLabelDataset
from torch.utils.data import Dataset, DataLoader

In [14]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [15]:
import evaluate

f1 = evaluate.load("f1")
recall = evaluate.load("recall")
precision = evaluate.load("precision")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    
    predictions = np.argmax(logits, axis=-1)
    f1_macro = f1.compute(predictions=predictions, references=labels, average='macro')
    f1_micro = f1.compute(predictions=predictions, references=labels, average='micro')
    rec_macro = recall.compute(predictions=predictions, references=labels, average='macro')
    rec_micro = recall.compute(predictions=predictions, references=labels, average='micro')
    prec_macro = precision.compute(predictions=predictions, references=labels, average='macro')
    prec_micro = precision.compute(predictions=predictions, references=labels, average='micro')
    return {"f1_macro": f1_macro["f1"], "f1_micro": f1_micro["f1"], 
            "recall_macro": rec_macro["recall"], "recall_micro": rec_micro["recall"], 
            "precision_macro": prec_macro["precision"], "precision_micro": prec_micro["precision"]}

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

In [11]:
huggingface_name = "sberbank-ai/ruBert-base"
model = SentenceTransformer(huggingface_name)

train_loss = losses.BatchHardTripletLoss(model=model)

Downloading (…)bb2a0/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)34f3cbb2a0/README.md:   0%|          | 0.00/392 [00:00<?, ?B/s]

Downloading (…)f3cbb2a0/config.json:   0%|          | 0.00/590 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/716M [00:00<?, ?B/s]

Downloading (…)34f3cbb2a0/vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/sberbank-ai_ruBert-base were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Для обучения модели используется триплетная функция потерь. Для ее вычисления требуется 3 объекта из датасета: якорь, позитивный пример (пример текста того же класса, что и якорь), и негативный пример (пример текста другого класса).

Для обучения модели создадим объект SentenceLabelDataset, который принимает на вход тексты и метки классов для них и формирует из них тройки - якорь, позитивный и негативный примеры.

In [11]:
train_examples = [InputExample(texts=[t], label=wr2ids[l]) for t, l in zip(ds_train.text, ds_train.writer)]

train_dataset = SentenceLabelDataset(train_examples)
train_dataloader = DataLoader(train_dataset, batch_size=8)

Для оценки качества модели нужен похожий объект, но в виде словаря. Для этого напишем класс TripletDataset, который возвращает словарь с объектами для тестирования качества модели.

In [11]:
class TripletDataset(Dataset):
    def __init__(self, df, return_input_example=False):
        super().__init__()
        self.df = df
        self.return_input_example = return_input_example

    def __getitem__(self, idx):
        ancor_idx = self.df.iloc[idx]
        pos_idx = self.df[self.df.writer == ancor_idx.writer].sample(n=1).iloc[0]
        neg_idx = self.df[self.df.writer != ancor_idx.writer].sample(n=1).iloc[0]

        if self.return_input_example:
            return InputExample(texts=[ancor_idx.text.lower(), pos_idx.text.lower(), neg_idx.text.lower()])
        
        return {"ancor": ancor_idx.text.lower(), \
            "pos": pos_idx.text.lower(), \
            "neg": neg_idx.text.lower()}

    def __len__(self):
        return self.df.shape[0]

test_triplets_dataset = TripletDataset(ds_test)

In [12]:
test_ancors, test_pos, test_neg = [], [], []
for ex in test_triplets_dataset:
    test_ancors.append(ex["ancor"])
    test_pos.append(ex["pos"])
    test_neg.append(ex["neg"])

Обучим модель на тренировочных данный с использованием функции потерь BatchHardTripletLoss, которая подбирает батчи так, чтобы в них попадали наиболее сложные для модели сочетания якоря и позитивных/негативных примеров.

Оценка качества будет проводится с помощью доли примеров, для которых расстояние между якорем и позитивным примером меньше, чем соответствующее расстояние от негативного примера.

In [None]:
from sentence_transformers import evaluation

evaluator = evaluation.TripletEvaluator(test_ancors, test_pos, test_neg)

In [None]:
model.fit([(train_dataloader, train_loss)], show_progress_bar=True, epochs=2, 
          evaluator=evaluator, evaluation_steps=3000, output_path="sbert")

Сохраним модель в виде артефакта wandb

In [8]:
wandb.init(project="Diploma")

artifact = wandb.Artifact('sbert', type='model')
artifact.add_dir('sbert/')
wandb.log_artifact(artifact)

### Оценка качества получившейся модели

In [16]:
run = wandb.init()
artifact = run.use_artifact('sava_ml/uncategorized/sbert:v2', type='model')
artifact_dir = artifact.download()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


[34m[1mwandb[0m: Downloading large artifact sbert:v2, 685.50MB. 12 files... 
[34m[1mwandb[0m:   12 of 12 files downloaded.  
Done. 0:0:36.0


In [10]:
model = SentenceTransformer('artifacts/sbert:v2')

In [17]:
embeddings_anchors = model.encode(test_ancors, batch_size=8, show_progress_bar=False, convert_to_numpy=True)
embeddings_positives = model.encode(test_pos, batch_size=8, show_progress_bar=False, convert_to_numpy=True)
embeddings_negatives = model.encode(test_neg, batch_size=8, show_progress_bar=False, convert_to_numpy=True)

In [18]:
from sklearn.metrics.pairwise import paired_cosine_distances

pos_cos_distances = paired_cosine_distances(embeddings_anchors, embeddings_positives)
neg_cos_distances = paired_cosine_distances(embeddings_anchors, embeddings_negatives)

print("Доля объектов с правильным соотношением расстояний:",(pos_cos_distances < neg_cos_distances).mean())

Доля объектов с правильным соотношением расстояний: 0.9658746347648712


## Classificator on SBERT

Теперь обучим классификатор, который будет получать на вход эмбеддинги текстов, полученные с помощью предыдущей модели:

In [17]:
import torch.nn as nn
import torch.nn.functional as F

class Classifier(nn.Module):
    def __init__(self, feature_extractor: SentenceTransformer, n_classes: int, label2ids: dict, emb_dim: int = 768):
        super().__init__()
        self.sbert = feature_extractor
        self.sbert.requires_grad = False
        self.dropout = nn.Dropout(p=0.1)
        self.fc = nn.Linear(emb_dim, n_classes)
        self.loss = nn.CrossEntropyLoss()
        self.label2ids = label2ids
        
    def forward(self, text, label):
        emb = self.sbert.encode(text, convert_to_tensor=True, show_progress_bar=False)
            
        out = self.dropout(emb)
        out = self.fc(out)
        logits = F.log_softmax(out, dim=-1)
        
        loss = self.loss(logits, label.cuda())
        return loss, logits                               

In [18]:
BATCH_SIZE = 8

In [19]:
feature_extractor = SentenceTransformer('artifacts/sbert:v2').to(device)
classifier = Classifier(feature_extractor, n_classes=38, label2ids=wr2ids).to(device)

In [20]:
class TextDataset(torch.utils.data.Dataset):
  def __init__(self, df, label2ids):
    super().__init__()
    self.texts = df.text
    self.labels = df.writer.apply(lambda w: label2ids[w])

  def __getitem__(self, idx):
    return self.texts.iloc[idx], torch.Tensor([self.labels.iloc[idx]]).long()

  def __len__(self):
    return len(self.labels)

train_dataset = TextDataset(ds_train, wr2ids)
test_dataset = TextDataset(ds_test, wr2ids)

In [21]:
 def example_collator(example_list):
   
    batched_examples = {"text": [ex[0] for ex in example_list], "label": torch.tensor([ex[1] for ex in example_list])} 
    return batched_examples

In [22]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
        output_dir="clf_trainer",
        evaluation_strategy="steps",
        eval_steps=3000,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        report_to=None,
        num_train_epochs=3,
        save_strategy="steps",
        save_steps=3000,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro"
)


trainer = Trainer(
    model=classifier,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    data_collator=example_collator
)

In [23]:
wandb.init(project="Diploma")
trainer.train()

VBox(children=(Label(value='0.001 MB of 0.016 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.069003…

***** Running training *****
  Num examples = 47162
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 17688
  Number of trainable parameters = 178336550
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,F1 Macro,F1 Micro,Recall Macro,Recall Micro,Precision Macro,Precision Micro
3000,0.2408,1.083274,0.693886,0.726115,0.709775,0.726115,0.734884,0.726115
6000,0.2083,1.133242,0.701601,0.731959,0.718067,0.731959,0.73805,0.731959
9000,0.1822,1.16902,0.699656,0.731793,0.717695,0.731793,0.737901,0.731793
12000,0.1779,1.191084,0.7008,0.73251,0.718661,0.73251,0.738915,0.73251
15000,0.1718,1.195446,0.701615,0.732896,0.718945,0.732896,0.739721,0.732896


***** Running Evaluation *****
  Num examples = 18139
  Batch size = 8
Saving model checkpoint to clf_trainer/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 18139
  Batch size = 8
Saving model checkpoint to clf_trainer/checkpoint-6000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 18139
  Batch size = 8
Saving model checkpoint to clf_trainer/checkpoint-9000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Deleting older checkpoint [clf_trainer/checkpoint-3000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 18139
  Batch size = 8
Saving model checkpoint to clf_trainer/checkpoint-12000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Deleting older checkpoint [clf_trainer/checkpoint-9000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples

TrainOutput(global_step=17688, training_loss=0.2865021856377859, metrics={'train_runtime': 4647.1397, 'train_samples_per_second': 30.446, 'train_steps_per_second': 3.806, 'total_flos': 0.0, 'train_loss': 0.2865021856377859, 'epoch': 3.0})

In [24]:
trainer.save_model("clf_trainer")

artifact = wandb.Artifact('sbert-clf', type='model')
artifact.add_dir('clf_trainer/')
wandb.log_artifact(artifact)

Saving model checkpoint to clf_trainer
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
[34m[1mwandb[0m: Adding directory to artifact (./clf_trainer)... Done. 8.5s


<wandb.sdk.wandb_artifacts.Artifact at 0x7f3ea4dc9450>

### Оценка качества классификации (раздельное обучение SBERT и классификатора)

In [None]:
run = wandb.init(project="Diploma")
artifact = run.use_artifact('sava_ml/Diploma/sbert-clf:v0', type='model')
artifact_dir = artifact.download()

feature_extractor = SentenceTransformer('artifacts/sbert:v2')
model = Classifier(feature_extractor, n_classes=38, label2ids=wr2ids)
model.load_state_dict(torch.load('artifacts/sbert-clf:v0/checkpoint-15000/pytorch_model.bin'))

In [27]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    data_collator=example_collator
)

trainer.evaluate()

***** Running Evaluation *****
  Num examples = 18139
  Batch size = 8


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


{'eval_loss': 1.1954458951950073,
 'eval_f1_macro': 0.7016149416728183,
 'eval_f1_micro': 0.7328959700093721,
 'eval_recall_macro': 0.7189454475515519,
 'eval_recall_micro': 0.7328959700093721,
 'eval_precision_macro': 0.7397205518411758,
 'eval_precision_micro': 0.7328959700093721,
 'eval_runtime': 361.2154,
 'eval_samples_per_second': 50.217,
 'eval_steps_per_second': 6.279}

## SBERT + классификатор одновременно

Фреймворк SentenceTransformer позволяет обучать одновременно несколько моделей. 

В нашем случае нужно обучить SBERT формировать хорошие ембеддинги текстов и затем эти ембеддинги использовать в классификаторе. Для каждой из задач требуется своя функция потерь, обучающие данные при этом совпадают.

При обучении классификатора мы будем настраивать функцию потерь так, чтобы градиент тек только через слои классификатора, но не через слои SBERT.  

In [28]:
import torch.nn as nn
import torch.nn.functional as F
from sentence_transformers.readers import InputExample
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.datasets import SentenceLabelDataset
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from transformers import Trainer, TrainingArguments
wandb.login()



True

In [29]:
class Classifier(nn.Module):
    def __init__(self, n_classes: int, label2ids: dict, emb_dim: int = 768):
        super().__init__()
        self.dropout = nn.Dropout(p=0.1)
        self.fc = nn.Linear(emb_dim, n_classes)
        self.loss = nn.CrossEntropyLoss()
        self.label2ids = label2ids
        
    def forward(self, sent_emb: torch.Tensor):          
        out = self.dropout(sent_emb)
        out = self.fc(out)
        logits = F.log_softmax(out, dim=-1)
        return logits
    
clf_model = Classifier(n_classes=38, label2ids=wr2ids)

In [30]:
class CrossEntropyClassificatorLoss(nn.Module):
    def __init__(self, emb_model: nn.Module, clf_model: nn.Module):
        super().__init__()
        self.embeddings = emb_model
        self.classifier = clf_model
        
    def forward(self, inp, label):
        sent_emb = self.embeddings(inp[0])["sentence_embedding"]
        # sent_emb = self.embeddings.encode(inp, convert_to_tensor=True, show_progress_bar=False)
        logits = self.classifier(sent_emb.detach())
        
        loss = self.classifier.loss(logits, label)
        
        return loss

In [None]:
huggingface_name = "sberbank-ai/ruBert-base"
sbert_model = SentenceTransformer(huggingface_name)
tokenizer = AutoTokenizer.from_pretrained(huggingface_name)
sbert_model._model_config["label2id"] = wr2ids
sbert_model._model_config["id2label"] = ids2wr

triplet_loss = losses.BatchHardTripletLoss(model=sbert_model)

ce_loss = CrossEntropyClassificatorLoss(emb_model=sbert_model, clf_model=clf_model)

In [32]:
from typing import List

def collate_tokenized_examples(examples: List[InputExample]):
    tokenzed_list = [tokenizer(ex.texts, return_tensors="pt", max_length=512, truncation=True, padding="max_length") for ex in examples]
    result = {"input_ids": torch.vstack([tok["input_ids"] for tok in tokenzed_list]), 
              "attention_mask": torch.vstack([tok["attention_mask"] for tok in tokenzed_list])}
    result["label"] = torch.tensor([ex.label for ex in examples])
    return result

In [33]:
train_examples = [InputExample(texts=[t], label=sbert_model._model_config["label2id"][l]) for t, l in zip(ds_train.text, ds_train.writer)]

train_dataset = SentenceLabelDataset(train_examples)
train_dataloader = DataLoader(train_dataset, batch_size=8, collate_fn=collate_tokenized_examples)

In [34]:
test_examples = [InputExample(texts=[t], label=sbert_model._model_config["label2id"][l]) for t, l in zip(ds_test.text, ds_test.writer)]

test_dataset = SentenceLabelDataset(test_examples)
test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=collate_tokenized_examples)

In [35]:
# Проверка dataloader и sentence_transformers

for ex in train_dataloader:
    print(ex["input_ids"].size())
    print(sbert_model(ex).keys())
    break

torch.Size([8, 512])
dict_keys(['input_ids', 'attention_mask', 'label', 'token_embeddings', 'sentence_embedding'])


In [36]:
class SBERTClassifier(nn.Module):
    def __init__(self, emb_model, clf_model):
        super().__init__()
        self.embeddings = emb_model
        self.classifier = clf_model
        self.loss = nn.CrossEntropyLoss()
        
    def forward(self, tokenized_text, label=None):
        # emb = self.embeddings.encode(tokenized_text, convert_to_tensor=True, show_progress_bar=False)
        emb = self.embeddings((tokenized_text[0]))["sentence_embedding"]
        logits = self.classifier(emb)
        
        if label is None:
            return None, {"output": logits}
        
        loss = self.loss(logits, label)
        return loss, {"output": logits}

full_model = SBERTClassifier(sbert_model, clf_model)

In [37]:
from sentence_transformers.evaluation import SentenceEvaluator
from sentence_transformers.util import batch_to_device
import csv
import os
import evaluate

class F1Evaluator(SentenceEvaluator):
    """
    Evaluate a model based on its f1 score on a labeled dataset
    This requires a model with LossFunction.SOFTMAX
    The results are written in a CSV. If a CSV already exists, then values are appended.
    """

    def __init__(self, dataloader: DataLoader, name: str = "", softmax_model = None, write_csv: bool = True, mode="macro"):
        """
        Constructs an evaluator for the given dataset
        :param dataloader:
            the data for the evaluation
        """
        self.dataloader = dataloader
        self.name = name
        self.softmax_model = softmax_model
        self.f1 = evaluate.load("f1")

        if name:
            name = "_" + name
 
        self.write_csv = write_csv
        self.csv_file = "f1_evaluation_" + mode + "_" + name +"_results.csv"
        self.csv_headers = ["epoch", "steps", "f1_" + mode]
        self.mode = mode

    def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
        model.eval()
        self.softmax_model.eval()

        if epoch != -1:
            if steps == -1:
                out_txt = " after epoch {}:".format(epoch)
            else:
                out_txt = " in epoch {} after {} steps:".format(epoch, steps)
        else:
            out_txt = ":"

        # logger.info("Evaluation on the " + self.name + " dataset" + out_txt)
        print("Evaluation on the " + self.name + " dataset" + out_txt)
        
        self.dataloader.collate_fn = model.smart_batching_collate
        
        predictions, labels = [], []
        
        for step, batch in enumerate(self.dataloader):
            features, label_ids = batch
            
            for idx in range(len(features)):
                features[idx] = batch_to_device(features[idx], model.device)
            
            label_ids = label_ids.to(model.device)
            with torch.no_grad():
                logits = self.softmax_model(features)[1]["output"]
                predictions.extend(torch.argmax(logits, dim=-1).tolist())
                labels.extend(label_ids)
                
                
        f1_macro = self.f1.compute(predictions=predictions, references=labels, average='macro')["f1"]
        f1_micro = self.f1.compute(predictions=predictions, references=labels, average='micro')["f1"]

       # logger.info("f1-macro: {:.4f}, f1-micro: {:.4f}\n".format(f1_macro, f1_micro)
        print("f1-macro: {:.4f}, f1-micro: {:.4f}\n".format(f1_macro, f1_micro))
        
        f1 = f1_macro if self.mode == "macro" else f1_micro
        
        if output_path is not None and self.write_csv:
            csv_path = os.path.join(output_path, self.csv_file)
            if not os.path.isfile(csv_path):
                with open(csv_path, newline='', mode="w", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(self.csv_headers)
                    writer.writerow([epoch, steps, f1])
            else:
                with open(csv_path, newline='', mode="a", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow([epoch, steps, f1])

        return f1

evaluator = F1Evaluator(test_dataloader, softmax_model=full_model)

In [38]:
sbert_model.fit([(train_dataloader, triplet_loss), (train_dataloader, ce_loss)], show_progress_bar=True, epochs=8, 
          evaluator=evaluator, evaluation_steps=3000, output_path="sbert_clf_together")

Epoch:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5896 [00:00<?, ?it/s]

Evaluation on the  dataset in epoch 0 after 3000 steps:


Configuration saved in sbert_clf_together/config.json


f1-macro: 0.1065, f1-micro: 0.1638



Model weights saved in sbert_clf_together/pytorch_model.bin
tokenizer config file saved in sbert_clf_together/tokenizer_config.json
Special tokens file saved in sbert_clf_together/special_tokens_map.json


Evaluation on the  dataset after epoch 0:


Configuration saved in sbert_clf_together/config.json


f1-macro: 0.4275, f1-micro: 0.4643



Model weights saved in sbert_clf_together/pytorch_model.bin
tokenizer config file saved in sbert_clf_together/tokenizer_config.json
Special tokens file saved in sbert_clf_together/special_tokens_map.json


Iteration:   0%|          | 0/5896 [00:00<?, ?it/s]

Evaluation on the  dataset in epoch 1 after 3000 steps:


Configuration saved in sbert_clf_together/config.json


f1-macro: 0.5368, f1-micro: 0.5569



Model weights saved in sbert_clf_together/pytorch_model.bin
tokenizer config file saved in sbert_clf_together/tokenizer_config.json
Special tokens file saved in sbert_clf_together/special_tokens_map.json


Evaluation on the  dataset after epoch 1:


Configuration saved in sbert_clf_together/config.json


f1-macro: 0.6007, f1-micro: 0.6258



Model weights saved in sbert_clf_together/pytorch_model.bin
tokenizer config file saved in sbert_clf_together/tokenizer_config.json
Special tokens file saved in sbert_clf_together/special_tokens_map.json


Iteration:   0%|          | 0/5896 [00:00<?, ?it/s]

Evaluation on the  dataset in epoch 2 after 3000 steps:
f1-macro: 0.5666, f1-micro: 0.5856

Evaluation on the  dataset after epoch 2:


Configuration saved in sbert_clf_together/config.json


f1-macro: 0.6356, f1-micro: 0.6590



Model weights saved in sbert_clf_together/pytorch_model.bin
tokenizer config file saved in sbert_clf_together/tokenizer_config.json
Special tokens file saved in sbert_clf_together/special_tokens_map.json


Iteration:   0%|          | 0/5896 [00:00<?, ?it/s]

Evaluation on the  dataset in epoch 3 after 3000 steps:
f1-macro: 0.5905, f1-micro: 0.5941

Evaluation on the  dataset after epoch 3:


Configuration saved in sbert_clf_together/config.json


f1-macro: 0.6442, f1-micro: 0.6594



Model weights saved in sbert_clf_together/pytorch_model.bin
tokenizer config file saved in sbert_clf_together/tokenizer_config.json
Special tokens file saved in sbert_clf_together/special_tokens_map.json


Iteration:   0%|          | 0/5896 [00:00<?, ?it/s]

Evaluation on the  dataset in epoch 4 after 3000 steps:


Configuration saved in sbert_clf_together/config.json


f1-macro: 0.6634, f1-micro: 0.6843



Model weights saved in sbert_clf_together/pytorch_model.bin
tokenizer config file saved in sbert_clf_together/tokenizer_config.json
Special tokens file saved in sbert_clf_together/special_tokens_map.json


Evaluation on the  dataset after epoch 4:


Configuration saved in sbert_clf_together/config.json


f1-macro: 0.6754, f1-micro: 0.7073



Model weights saved in sbert_clf_together/pytorch_model.bin
tokenizer config file saved in sbert_clf_together/tokenizer_config.json
Special tokens file saved in sbert_clf_together/special_tokens_map.json


Iteration:   0%|          | 0/5896 [00:00<?, ?it/s]

Evaluation on the  dataset in epoch 5 after 3000 steps:


Configuration saved in sbert_clf_together/config.json


f1-macro: 0.6888, f1-micro: 0.7136



Model weights saved in sbert_clf_together/pytorch_model.bin
tokenizer config file saved in sbert_clf_together/tokenizer_config.json
Special tokens file saved in sbert_clf_together/special_tokens_map.json


Evaluation on the  dataset after epoch 5:
f1-macro: 0.6840, f1-micro: 0.7044



Iteration:   0%|          | 0/5896 [00:00<?, ?it/s]

Evaluation on the  dataset in epoch 6 after 3000 steps:
f1-macro: 0.6872, f1-micro: 0.7134

Evaluation on the  dataset after epoch 6:


Configuration saved in sbert_clf_together/config.json


f1-macro: 0.6934, f1-micro: 0.7164



Model weights saved in sbert_clf_together/pytorch_model.bin
tokenizer config file saved in sbert_clf_together/tokenizer_config.json
Special tokens file saved in sbert_clf_together/special_tokens_map.json


Iteration:   0%|          | 0/5896 [00:00<?, ?it/s]

Evaluation on the  dataset in epoch 7 after 3000 steps:


Configuration saved in sbert_clf_together/config.json


f1-macro: 0.6968, f1-micro: 0.7195



Model weights saved in sbert_clf_together/pytorch_model.bin
tokenizer config file saved in sbert_clf_together/tokenizer_config.json
Special tokens file saved in sbert_clf_together/special_tokens_map.json


Evaluation on the  dataset after epoch 7:


Configuration saved in sbert_clf_together/config.json


f1-macro: 0.7095, f1-micro: 0.7299



Model weights saved in sbert_clf_together/pytorch_model.bin
tokenizer config file saved in sbert_clf_together/tokenizer_config.json
Special tokens file saved in sbert_clf_together/special_tokens_map.json


In [None]:
import torch
import wandb
wandb.init(project="Diploma")

torch.save(full_model.state_dict(), "full/")

artifact = wandb.Artifact("sbert_clf_together", type="model")
artifact.add_dir("full/")
wandb.log_artifact(artifact)

### Оценка качества SBERT + Classifier Together

In [None]:
tokenizer = AutoTokenizer.from_pretrained(huggingface_name)
model = SBERTClassifier(sbert_model, clf_model)
model.load_state_dict(torch.load('artifacts/sbert_clf_together'))

trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.evaluate(test_dataset)
