https://amaru-ai.com/entry/2022/10/15/202442

In [2]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
import json
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from transformers import AutoTokenizer

datas = []
path = "../tsv/first-match-scatter/42/test.json"
with open(path) as f:
    works = json.load(f)
for work in tqdm(works[:100]):
    label = work["label"]
    contents = [c["paragraph"] for c in work["contents"]]
    if len(contents) == 5:
        for c in contents:
            datas.append({'label': label, 'text': c})

df = pd.DataFrame.from_dict(datas)
ds = Dataset.from_pandas(df)

100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 251607.92it/s]


In [4]:
tokenizer= AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')

In [5]:
def preprocess_func(exp):
    MAX_LENGTH = 512
    return tokenizer(exp['text'])

tkn_ds = ds.map(preprocess_func, batched=True)
tkn_ds = tkn_ds.train_test_split(test_size=0.5, shuffle=False)
tkn_ds



  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 245
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 245
    })
})

In [6]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    print(pred.predictions)
    preds = pred.predictions.argmax(-1)
    print(preds)
    f1 = f1_score(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {'acc': acc, 'f1': f1}

In [13]:
from torch import nn
from torch import Tensor
import torch.nn.functional as F
import numpy as np
from transformers import BertPreTrainedModel, BertModel
from transformers.modeling_outputs import SequenceClassifierOutput

class ModifiedBERT(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.linear = nn.Linear(in_features=768, out_features=2)

        
    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        outputs = self.bert(
            input_ids,
            attention_mask = attention_mask,
            token_type_ids = token_type_ids,
            **kwargs
        )
        
        #print(outputs)
        pooled_output = outputs.pooler_output
        logits = self.linear(pooled_output)
        #sigmoid = self.sigmoid(logits)
        
        loss = None
        #print(logits, labels)
        #print(logits)
        #print(labels.view(-1, 1))
        #print(labels.view(-1, 1))
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, 2), labels.view(-1))
       
        return SequenceClassifierOutput(
            loss = loss,
            logits = logits,
            hidden_states = outputs.hidden_states,
            attentions = outputs.attentions
        )

In [14]:

t = Tensor([-0.2]).long()
t

tensor([0])

In [15]:
modBERT = ModifiedBERT.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking", num_labels=2)

loading configuration file https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking/resolve/main/config.json from cache at /home/realive333/.cache/huggingface/transformers/573af37b6c39d672f2df687c06ad7d556476cbe43e5bf7771097187c45a3e7bf.abeb707b5d79387dd462e8bfb724637d856e98434b6931c769b8716c6f287258
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertJapaneseTokenizer",
  "transformers_version": "4.20.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file https://huggingface.co/cl-tohok

In [16]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    learning_rate=2e-5,
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    num_train_epochs=5,
    weight_decay=0.01,
    no_cuda=True
)

trainer = Trainer(
    model=modBERT,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tkn_ds['train'],
    eval_dataset=tkn_ds['test'],
    tokenizer=tokenizer
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [17]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `ModifiedBERT.forward` and have been ignored: text. If text are not expected by `ModifiedBERT.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 245
  Batch size = 5


[[ 0.22259602 -0.17254454]
 [ 0.17542663 -0.21671806]
 [ 0.20553018 -0.17237909]
 [ 0.16029926 -0.17865528]
 [ 0.12426941 -0.27894285]
 [ 0.16356932 -0.29384774]
 [ 0.20987979 -0.21027187]
 [ 0.16317041 -0.2278499 ]
 [ 0.15296105 -0.18783979]
 [ 0.16717957 -0.25884706]
 [ 0.24715096 -0.36734647]
 [ 0.19021347 -0.27587086]
 [ 0.20275828 -0.21029288]
 [ 0.11849014 -0.10607896]
 [ 0.19197662 -0.17708513]
 [ 0.11456984 -0.26773995]
 [ 0.1565733  -0.2611313 ]
 [ 0.1407225  -0.24672553]
 [ 0.17275597 -0.18219   ]
 [ 0.14043903 -0.2108825 ]
 [ 0.22719377 -0.2356379 ]
 [ 0.1796119  -0.3084199 ]
 [ 0.22149532 -0.2747362 ]
 [ 0.15914002 -0.21918902]
 [ 0.25033507 -0.30006173]
 [ 0.10000671 -0.2876736 ]
 [ 0.04721767 -0.33631372]
 [ 0.12548195 -0.44280824]
 [ 0.08745516 -0.29494348]
 [ 0.16396481 -0.30280012]
 [ 0.17226423 -0.1914744 ]
 [ 0.19006017 -0.12175032]
 [ 0.14195564 -0.25572148]
 [ 0.31102318 -0.23210433]
 [ 0.1663292  -0.11969356]
 [ 0.18495393 -0.2800516 ]
 [ 0.22508663 -0.10476996]
 

{'eval_loss': 0.726432204246521,
 'eval_acc': 0.46938775510204084,
 'eval_f1': 0.2998866213151927,
 'eval_runtime': 35.8,
 'eval_samples_per_second': 6.844,
 'eval_steps_per_second': 1.369}

In [18]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `ModifiedBERT.forward` and have been ignored: text. If text are not expected by `ModifiedBERT.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 245
  Num Epochs = 5
  Instantaneous batch size per device = 5
  Total train batch size (w. parallel, distributed & accumulation) = 5
  Gradient Accumulation steps = 1
  Total optimization steps = 245


Epoch,Training Loss,Validation Loss,Acc,F1
1,0.6227,0.630379,0.669388,0.639269
2,0.4224,0.844188,0.62449,0.562313
3,0.2651,1.30312,0.661224,0.619598
4,0.1161,1.135657,0.734694,0.732248
5,0.0633,1.261988,0.722449,0.717378


The following columns in the evaluation set don't have a corresponding argument in `ModifiedBERT.forward` and have been ignored: text. If text are not expected by `ModifiedBERT.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 245
  Batch size = 5


[[-0.18422766 -0.02392418]
 [-0.7690176   0.40212002]
 [-0.9395913   0.789773  ]
 [-0.92106295  0.62297213]
 [-1.2343438   0.9134669 ]
 [-0.73916507  0.54012644]
 [-0.9729583   0.7029116 ]
 [-0.6682817   0.34819847]
 [-0.7332393   0.64105177]
 [-0.75347567  0.44547963]
 [-0.11343681  0.1767581 ]
 [-0.21878912 -0.00928942]
 [-0.8414858   0.37840563]
 [-0.71305597  0.45616663]
 [-0.45178807  0.34574285]
 [-0.8701881   0.50339377]
 [-0.6039699   0.2729867 ]
 [-0.42661035  0.11725542]
 [-0.8039466   0.42055923]
 [-0.41922313  0.33518487]
 [-0.76949364  0.5419472 ]
 [-0.49190167  0.23999475]
 [-0.5015595   0.19639625]
 [ 0.1324305  -0.36464655]
 [-0.18643665  0.00720864]
 [-0.48565492 -0.03303291]
 [-0.23113887 -0.33555177]
 [-0.65303564  0.10717308]
 [-0.7215058   0.29896232]
 [-0.77357906  0.3147174 ]
 [-0.91192657  0.55309725]
 [-0.8346343   0.6091405 ]
 [-0.81762064  0.5588626 ]
 [-0.5609174   0.5872177 ]
 [-1.0315299   0.6163721 ]
 [ 0.19927162 -0.38666594]
 [ 0.26484752 -0.17144327]
 

Saving model checkpoint to ./results/checkpoint-49
Configuration saved in ./results/checkpoint-49/config.json
Model weights saved in ./results/checkpoint-49/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-49/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-49/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-245] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `ModifiedBERT.forward` and have been ignored: text. If text are not expected by `ModifiedBERT.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 245
  Batch size = 5


[[-1.6893302   1.3480521 ]
 [-1.7332569   1.206369  ]
 [-1.6644455   1.5721453 ]
 [-1.57027     1.3339268 ]
 [-1.937268    1.6827809 ]
 [-1.6735774   1.3865886 ]
 [-1.7203604   1.315596  ]
 [-1.7484027   1.3750411 ]
 [-1.4319322   1.2390214 ]
 [-1.8161647   1.4041978 ]
 [ 0.02831004  0.0029079 ]
 [-1.1922603   0.6572461 ]
 [-1.5456493   1.1324886 ]
 [-1.9452654   1.4821885 ]
 [-0.586359    0.5297572 ]
 [-0.91640514  0.52937424]
 [-0.8459028   0.4248477 ]
 [-0.34189478  0.12664036]
 [-1.090938    0.61343586]
 [-0.47584864  0.5099321 ]
 [-1.4051937   1.1167371 ]
 [-0.9655845   0.6695713 ]
 [-0.8759373   0.58886003]
 [-0.5515181   0.0616713 ]
 [-0.5561867   0.21191657]
 [-1.3518337   0.46778074]
 [-1.1240594   0.17885773]
 [-0.99174654  0.2786719 ]
 [-1.6194605   0.7802106 ]
 [-1.6925019   0.9293732 ]
 [-1.9282149   1.4754137 ]
 [-1.8954467   1.5798992 ]
 [-1.8906763   1.5278839 ]
 [-1.5151955   1.3371962 ]
 [-1.9648455   1.4469523 ]
 [ 0.21086027 -0.43385896]
 [-0.56584895  0.35346493]
 

Saving model checkpoint to ./results/checkpoint-98
Configuration saved in ./results/checkpoint-98/config.json
Model weights saved in ./results/checkpoint-98/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-98/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-98/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-49] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `ModifiedBERT.forward` and have been ignored: text. If text are not expected by `ModifiedBERT.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 245
  Batch size = 5


[[-2.64629579e+00  2.23285747e+00]
 [-2.73580432e+00  2.35524201e+00]
 [-2.57963395e+00  2.53773189e+00]
 [-2.69229245e+00  2.42897820e+00]
 [-2.75487542e+00  2.37720656e+00]
 [-2.23538136e+00  2.07268929e+00]
 [-2.66616321e+00  2.38621163e+00]
 [-2.76533055e+00  2.23199344e+00]
 [-2.41356897e+00  2.29216909e+00]
 [-2.90280271e+00  2.41748309e+00]
 [ 1.11520994e+00 -9.21943128e-01]
 [-2.76715922e+00  2.09753680e+00]
 [-2.79556537e+00  2.29385376e+00]
 [-2.90694475e+00  2.34095407e+00]
 [-2.63248515e+00  2.19323421e+00]
 [-2.08535171e+00  1.68021274e+00]
 [-1.75430799e+00  1.25373781e+00]
 [-5.38875639e-01  2.81259060e-01]
 [-1.90994084e+00  1.47515321e+00]
 [-2.20739174e+00  1.77331865e+00]
 [-2.30743361e+00  2.17036939e+00]
 [-2.09320426e+00  1.70408237e+00]
 [-1.74595845e+00  1.39501727e+00]
 [-1.12776101e+00  5.45561671e-01]
 [-1.66418469e+00  1.46193671e+00]
 [-2.08758712e+00  1.30757082e+00]
 [-1.83073485e+00  7.50984788e-01]
 [-1.42032397e+00  6.73268497e-01]
 [-2.52970457e+00  1

Saving model checkpoint to ./results/checkpoint-147
Configuration saved in ./results/checkpoint-147/config.json
Model weights saved in ./results/checkpoint-147/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-147/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-147/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-98] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `ModifiedBERT.forward` and have been ignored: text. If text are not expected by `ModifiedBERT.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 245
  Batch size = 5


[[-2.525224    2.1666327 ]
 [-2.6527774   2.2134376 ]
 [-2.9000955   2.8544583 ]
 [-2.8305154   2.6899724 ]
 [-2.9341908   2.5825133 ]
 [-2.4303281   2.3162696 ]
 [-2.993988    2.7121534 ]
 [-2.5369205   2.1020966 ]
 [-2.6624923   2.5274596 ]
 [-2.7696266   2.2752004 ]
 [ 2.6276278  -2.2923183 ]
 [ 2.3515358  -1.8748531 ]
 [-2.7328453   2.2694132 ]
 [-3.1374633   2.6324954 ]
 [ 2.6450377  -2.1962729 ]
 [ 0.19532463 -0.46814403]
 [ 1.5856807  -1.4163955 ]
 [ 1.5473635  -1.352669  ]
 [ 0.70042354 -0.8845026 ]
 [ 0.7734192  -0.60793316]
 [-1.7915225   1.7241237 ]
 [-0.8608972   0.5202828 ]
 [ 0.75322354 -0.9111084 ]
 [ 2.8158484  -2.5701604 ]
 [ 1.7533467  -1.6360075 ]
 [-0.5784782  -0.187953  ]
 [ 0.4151382  -1.057307  ]
 [-0.5992885   0.02200147]
 [-2.602274    1.7639489 ]
 [-2.8329668   2.0030088 ]
 [-3.1189706   2.5914276 ]
 [-3.147646    2.839989  ]
 [-3.035884    2.704626  ]
 [-2.7604017   2.6186626 ]
 [-3.1845727   2.6638727 ]
 [ 2.4413056  -2.5121026 ]
 [ 2.6346383  -2.2160316 ]
 

Saving model checkpoint to ./results/checkpoint-196
Configuration saved in ./results/checkpoint-196/config.json
Model weights saved in ./results/checkpoint-196/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-196/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-196/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-147] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `ModifiedBERT.forward` and have been ignored: text. If text are not expected by `ModifiedBERT.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 245
  Batch size = 5


[[-2.8704853   2.5359514 ]
 [-3.0169063   2.682099  ]
 [-3.01154     2.9771152 ]
 [-3.0382278   2.9288857 ]
 [-3.2204473   2.893593  ]
 [-2.6156876   2.4847739 ]
 [-3.095073    2.8540275 ]
 [-2.865057    2.4584134 ]
 [-2.8306122   2.7115786 ]
 [-3.113618    2.7180595 ]
 [ 2.7870739  -2.4515471 ]
 [ 0.69405186 -0.7103902 ]
 [-3.0822103   2.665393  ]
 [-3.3110476   2.889919  ]
 [ 2.3870027  -1.943113  ]
 [-0.5873542   0.26392156]
 [ 1.329631   -1.2385254 ]
 [ 1.5566748  -1.4053786 ]
 [ 0.15540749 -0.43285218]
 [-0.14675678  0.14790958]
 [-2.247231    2.1944664 ]
 [-1.7055906   1.3436321 ]
 [ 0.12686571 -0.39576   ]
 [ 2.871619   -2.6335332 ]
 [ 1.3140045  -1.2524933 ]
 [-1.3599225   0.5981521 ]
 [-0.25786185 -0.58318824]
 [-0.93712986  0.29875752]
 [-2.9143627   2.1382558 ]
 [-3.078953    2.3577793 ]
 [-3.2673817   2.8105538 ]
 [-3.2623749   3.0028875 ]
 [-3.198977    2.9082983 ]
 [-2.8677914   2.7438235 ]
 [-3.3264322   2.8956606 ]
 [ 2.5449502  -2.5869534 ]
 [ 2.2456057  -1.8212746 ]
 

Saving model checkpoint to ./results/checkpoint-245
Configuration saved in ./results/checkpoint-245/config.json
Model weights saved in ./results/checkpoint-245/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-245/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-245/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-196] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=245, training_loss=0.2979059297211316, metrics={'train_runtime': 888.0866, 'train_samples_per_second': 1.379, 'train_steps_per_second': 0.276, 'total_flos': 235743908381100.0, 'train_loss': 0.2979059297211316, 'epoch': 5.0})

In [None]:
pred_result = trainer.predict(tkn_ds['test'], ignore_keys=['loss', 'last_hidden_state', 'hidden_states', 'attentions'])
pred_label= pred_result.predictions.argmax(axis=1).tolist()
print(pred_label)
from sklearn.metrics import classification_report
print(classification_report(tkn_ds['test']['label'], pred_label, target_names=['False', 'True']))