## Загрузим данные

In [1]:
import pandas as pd
import numpy as np

In [2]:
ds_train = pd.read_csv("../input/authorstexts/train_data.csv", usecols=[0, 2])
ds_test = pd.read_csv("../input/authorstexts/test_data.csv", usecols=[0, 2])

ds_train = ds_train.sample(frac=1)
ds_test = ds_test.sample(frac=1)

In [3]:
ds_train.shape, ds_test.shape

((47162, 2), (18139, 2))

In [4]:
wr2ids = {k: i for i, k in enumerate(ds_train.writer.unique())}
wr2ids

{'Saltykov-schedrin': 0,
 'Solzhenitsin': 1,
 'Chekhov': 2,
 'Goncharov': 3,
 'Ostrovsky': 4,
 'Paustovskiy': 5,
 'Prishvin': 6,
 'Belyaev': 7,
 'Kuprin': 8,
 'Turgenev': 9,
 'Leskov': 10,
 'Pelevin': 11,
 'Sergeev-Thsenskiy': 12,
 'Kataev': 13,
 'Serafimovich': 14,
 'Pasternak': 15,
 'Gorky': 16,
 'Dostoevsky': 17,
 'Dovlatov': 18,
 'Zoschenko': 19,
 'Furmanov': 20,
 'Gogol': 21,
 'Fray': 22,
 'Akunin': 23,
 'Kazantsev': 24,
 'Lukyanenko': 25,
 'Averchenko': 26,
 'Tolstoy': 27,
 'Bulgakov': 28,
 'Bunin': 29,
 'Pushkin': 30,
 'Ilf_petrov': 31,
 'Gaydar': 32,
 'Struhgatskie': 33,
 'Grin': 34,
 'Fadeev': 35,
 'Pikul': 36,
 'Shukshin': 37}

In [5]:
ids2wr = {v: k for k, v in wr2ids.items()}

In [6]:
ds_train.reset_index(drop=True, inplace=True)
ds_test.reset_index(drop=True, inplace=True)

## Создадим даталоадер

In [7]:
import torch

In [8]:
class TextDataset(torch.utils.data.Dataset):
  def __init__(self, df, tokenizer, label2id):
    super().__init__()
    self.texts = df.text
    self.labels = df.writer.apply(lambda s: label2id[s])

  def __getitem__(self, idx):
    examples = tokenizer(self.texts[idx].lower(),
                         truncation=True, padding="max_length", max_length=512)
    examples["label"] = torch.tensor(self.labels[idx])
    return examples

  def __len__(self):
    return len(self.labels)


## Дообучим BERT для задачи классификации авторов

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

In [10]:
import wandb
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
! pip install evaluate

In [12]:
import evaluate
import os

In [13]:
f1 = evaluate.load("f1")
acc = evaluate.load("accuracy")
recall = evaluate.load("recall")
precision = evaluate.load("precision")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    f1_macro = f1.compute(predictions=predictions, references=labels, average='macro')
    f1_micro = f1.compute(predictions=predictions, references=labels, average='micro')
    acc_score = acc.compute(predictions=predictions, references=labels)
    rec_macro = recall.compute(predictions=predictions, references=labels, average='macro')
    rec_micro = recall.compute(predictions=predictions, references=labels, average='micro')
    prec_macro = precision.compute(predictions=predictions, references=labels, average='macro')
    prec_micro = precision.compute(predictions=predictions, references=labels, average='micro')
    return {"f1_macro": f1_macro["f1"], "f1_micro": f1_micro["f1"], 
            "recall_macro": rec_macro["recall"], "recall_micro": rec_micro["recall"], 
            "precision_macro": prec_macro["precision"], "precision_micro": prec_micro["precision"]}

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

In [None]:
huggingface_name = "sberbank-ai/ruBert-base"
batch_size = 6
num_train_epochs = 5

tokenizer = AutoTokenizer.from_pretrained(huggingface_name)
model = AutoModelForSequenceClassification.from_pretrained(huggingface_name,
                                                           return_dict=True, num_labels=38,
                                                           ignore_mismatched_sizes=True)     
model.config.id2label = ids2wr
model.config.label2id = wr2ids

wandb.init(project="hw-nlp")
wandb.watch(model)

train_dataset = TextDataset(ds_train, tokenizer, model.config.label2id)
test_dataset = TextDataset(ds_test, tokenizer, model.config.label2id)


training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    # evaluation_strategy="steps",
    # eval_steps=30,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    save_strategy="epoch",
    # save_strategy="steps",
    save_steps=3000,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [None]:
trainer.save_model("model")

In [None]:
artifact = wandb.Artifact('baseline38-balanced', type='model')
artifact.add_dir('model/')
wandb.log_artifact(artifact)

#### Evaluation of baseline BERT

In [15]:
run = wandb.init()
artifact = run.use_artifact('sava_ml/hw-nlp/baseline38:v1', type='model')
artifact_dir = artifact.download()

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

[34m[1mwandb[0m: Downloading large artifact baseline38:v1, 680.37MB. 3 files... Done. 0:0:30.7


In [17]:
tokenizer = AutoTokenizer.from_pretrained(huggingface_name)
model = AutoModelForSequenceClassification.from_pretrained('artifacts/baseline38:v1') 

train_dataset = TextDataset(ds_train, tokenizer, model.config.label2id)
test_dataset = TextDataset(ds_test, tokenizer, model.config.label2id)

trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.evaluate(test_dataset)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/sberbank-ai/ruBert-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3ff2b30ffd2e83991ada1f23ca4d7adad284baa741ea21704f02d83b72405c79.b7ac951e56a7d9c2e7e295337ac13c91834fc4cd1578bc46e5ebc1fb8ac81fb5
Model config BertConfig {
  "_name_or_path": "sberbank-ai/ruBert-base",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head

Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


{'eval_loss': 1.7595490217208862,
 'eval_f1_macro': 0.7433203875114444,
 'eval_f1_micro': 0.7805281437785986,
 'eval_recall_macro': 0.7494152757331275,
 'eval_recall_micro': 0.7805281437785986,
 'eval_precision_macro': 0.7780588645159315,
 'eval_precision_micro': 0.7805281437785986,
 'eval_runtime': 361.3039,
 'eval_samples_per_second': 50.204,
 'eval_steps_per_second': 6.277}