In [51]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn.metrics as skmetrics
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, precision_recall_fscore_support
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback

In [38]:
data = pd.read_csv("/kaggle/input/natural-disaster/train.csv")

In [40]:
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_positio

In [41]:
text = list(data["text"])
target = list(data["target"])

In [115]:
# Разделяет выборку на train, val, test.
x_train, x_rest, y_train, y_rest = train_test_split(text, target, test_size=0.2)

x_val, x_test, y_val, y_test = train_test_split(x_rest, y_rest, test_size=0.5)

# Получает токены сообщений, а также приводит к одному размеру - 512 токенов.
x_train_tokenized = tokenizer(x_train, padding=True, truncation=True, max_length=512)
x_val_tokenized = tokenizer(x_val, padding=True, truncation=True, max_length=512)
x_test_tokenized = tokenizer(x_test, padding=True, truncation=True, max_length=512)

In [122]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [123]:
train_dataset = Dataset(x_train_tokenized, y_train)
val_dataset = Dataset(x_val_tokenized, y_val)
test_dataset = Dataset(x_test_tokenized, y_test)

In [124]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [153]:
args = TrainingArguments(
    output_dir="output",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    seed=0,
    load_best_model_at_end=True,
    report_to="none",
)

PyTorch: setting up devices


In [156]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [157]:
trainer.train()

***** Running training *****
  Num examples = 6090
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 955
  Number of trainable parameters = 109483778


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,0.0655,0.884343,0.817346,0.783151,0.784375,0.781931


***** Running Evaluation *****
  Num examples = 761
  Batch size = 32
Saving model checkpoint to output/checkpoint-500
Configuration saved in output/checkpoint-500/config.json
Model weights saved in output/checkpoint-500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from output/checkpoint-500 (score: 0.8843425512313843).


TrainOutput(global_step=955, training_loss=0.05029099848882066, metrics={'train_runtime': 326.0471, 'train_samples_per_second': 93.391, 'train_steps_per_second': 2.929, 'total_flos': 1314424721484000.0, 'train_loss': 0.05029099848882066, 'epoch': 5.0})

In [160]:
test_trainer = Trainer(model)

# Получает предсказания
raw_pred, _, _ = test_trainer.predict(test_dataset)

# Переводит предсказания в [0, 1]
y_pred = np.argmax(raw_pred, axis=1)


No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 762
  Batch size = 16


In [161]:
print("Bert-based-model accuracy:", accuracy_score(y_pred, y_test))
print("Precision", precision_score(y_pred, y_test))
print("Recall", recall_score(y_pred, y_test))
print("F1 Score:", f1_score(y_pred, y_test))

Bert-based-model accuracy: 0.8097112860892388
Precision 0.7225609756097561
Recall 0.8144329896907216
F1 Score: 0.7657512116316639
