<a href="https://colab.research.google.com/github/Romira915/jds_compe/blob/valid%2Fgpu/bsc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
from transformers import AdamW

In [13]:
torch.cuda.empty_cache()

In [14]:
train_path = "train.csv"
test_path = "test.csv"
compe_path = "compe.csv"

try:
  from google.colab import drive
  drive.mount('/content/drive')

  compe_dir = "/content/drive/My Drive/Documents/compe/"

  train_path = compe_dir + train_path
  test_path = compe_dir + test_path
  compe_path = compe_dir + compe_path

except ImportError:
  pass

In [15]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
compe_df = pd.read_csv(compe_path)

train_docs = train_df["text"].tolist()
test_docs = test_df["text"].tolist()
y = train_df["label"].tolist()
test_y = test_df["label"].tolist()
compe_text = compe_df["text"].tolist()

In [16]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [17]:
model_name = "cl-tohoku/bert-large-japanese"
# model_name = "cl-tohoku/bert-base-japanese-v2"

model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model = model.to(device)
tokenizer = BertTokenizer.from_pretrained(model_name)

loading configuration file https://huggingface.co/cl-tohoku/bert-large-japanese/resolve/main/config.json from cache at /home/romira/.cache/huggingface/transformers/d49f3356a181491359f3f12fe5d60f7441304c3a4796734cbc22de66c1765f0a.0c1afb078c78251713eee5aa225d29cf9899eb0f0bd8d2ba7c4967f586db290c
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertJapaneseTokenizer",
  "transformers_version": "4.21.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32768
}

loading weights file https://huggingface.co/cl-tohoku/bert-large-jap

In [18]:
max_len = 32

train_encodings = tokenizer(train_docs, return_tensors='pt', padding=True, truncation=True, max_length=max_len).to(device)
test_encodings = tokenizer(test_docs, return_tensors='pt', padding=True, truncation=True, max_length=max_len).to(device)

In [19]:
import torch

class JpSentiDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = JpSentiDataset(train_encodings, y)
test_dataset = JpSentiDataset(test_encodings, test_y)

In [20]:
# To calculate additional metrics in addition to the loss, you can also define your own compute_metrics function and pass it to the trainer.

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted', zero_division=0)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [21]:
from transformers import Trainer, TrainingArguments

torch.cuda.empty_cache()

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=4,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    save_total_limit=1,              # limit the total amount of checkpoints. Deletes the older checkpoints.
    dataloader_pin_memory=False,  # Whether you want to pin memory in data loaders or not. Will default to True
    # evaluation_strategy="epoch",     # Evaluation is done at the end of each epoch.
    evaluation_strategy="steps",
    logging_steps=50,
    logging_dir='./logs'
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,             # evaluation dataset
    compute_metrics=compute_metrics  # The function that will be used to compute metrics at evaluation
)

trainer.train()

using `logging_steps` to initialize `eval_steps` to 50
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 10000
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5000
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.5328,0.352498,0.868,0.866248,0.883431,0.868
100,0.3408,0.345393,0.872,0.870049,0.890385,0.872
150,0.2904,0.286115,0.883,0.882374,0.888449,0.883
200,0.3681,0.286218,0.877,0.876514,0.880656,0.877
250,0.4109,0.295228,0.883,0.882105,0.891422,0.883
300,0.375,0.423521,0.877,0.874832,0.899136,0.877
350,0.3831,0.30736,0.882,0.881993,0.881994,0.882
400,0.3075,0.328772,0.885,0.883501,0.900934,0.885
450,0.3641,0.325707,0.87,0.869953,0.872166,0.87
500,0.3921,0.267755,0.894,0.893916,0.894413,0.894


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
  item = {key: torch.tensor(val[idx

TrainOutput(global_step=5000, training_loss=0.30589715824127195, metrics={'train_runtime': 2254.588, 'train_samples_per_second': 17.742, 'train_steps_per_second': 2.218, 'total_flos': 2329828408320000.0, 'train_loss': 0.30589715824127195, 'epoch': 4.0})

In [22]:
# evaluation のみ実行
trainer.evaluate(eval_dataset=test_dataset)

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 0.3201843798160553,
 'eval_accuracy': 0.914,
 'eval_f1': 0.9138073072892197,
 'eval_precision': 0.9160044345898004,
 'eval_recall': 0.914,
 'eval_runtime': 3.057,
 'eval_samples_per_second': 327.116,
 'eval_steps_per_second': 5.234,
 'epoch': 4.0}