In [50]:
# データセット作成
import glob
import os
import re

id = 1
datasets = []
# ファイル名の先頭のlabel番号を抜き出す正規表現
pattern = r'^\D*(\d+)'

dataset_path_list = glob.glob("./dataset/*")
for dataset_path in dataset_path_list:
    with open(dataset_path) as f:
        for line in f:
            datasets.append(
                {
                    "id": str(id),
                    "label": re.match(pattern, dataset_path).group(1),
                    "text": line.rstrip('\n'),
                    "orignal": dataset_path
                }
            )
            id+=1

In [51]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# デバイス判定
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device("mps")

# モデルとトークナイザーの読み込み
model_name = 'tohoku-nlp/bert-base-japanese-v3'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at tohoku-nlp/bert-base-japanese-v3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
# データセット読み込み

import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset


train, valid = train_test_split(datasets, test_size=0.1)

# データセットクラスの定義
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = tokenizer.model_max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]['text']
        label = int(self.data[idx]['label'])

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

train_dataset = CustomDataset(train)
val_dataset = CustomDataset(valid)

In [53]:
# Sanity check: display the first encoded training sample
# (its 'input_ids', 'attention_mask' and 'labels' tensors).
train_dataset[0]

{'input_ids': tensor([    2, 16936, 19509,  7161,   474,   464, 15579,   430, 14846,   439,
           494,   456, 12483, 12515, 13037,   385, 16402, 24580,   500, 13338,
           441,   456, 27184,  7139, 12995,   429,    46,     3,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [54]:
# Move the model to `device` (it was already moved there when it was loaded).
# As the cell's last expression, this also displays the model architecture.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32768, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [57]:
from transformers import TrainingArguments, Trainer
from transformers import default_data_collator

data_collator = default_data_collator
training_args = TrainingArguments(
    output_dir="./output/20240508",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    load_best_model_at_end=True,
)

In [58]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()


                                                    

[A[A                               
  1%|          | 13/1400 [17:34<12:35:34, 32.69s/it]
[A
[A

{'eval_loss': nan, 'eval_runtime': 3.224, 'eval_samples_per_second': 3.722, 'eval_steps_per_second': 0.31, 'epoch': 1.0}



                                                    

[A[A                               
  1%|          | 13/1400 [20:43<12:35:34, 32.69s/it]
[A
[A

{'eval_loss': nan, 'eval_runtime': 3.5103, 'eval_samples_per_second': 3.418, 'eval_steps_per_second': 0.285, 'epoch': 2.0}



                                                    

[A[A                               
  1%|          | 13/1400 [24:17<12:35:34, 32.69s/it]
[A
[A

{'eval_loss': nan, 'eval_runtime': 3.6499, 'eval_samples_per_second': 3.288, 'eval_steps_per_second': 0.274, 'epoch': 3.0}


                                                    
100%|██████████| 21/21 [10:29<00:00, 29.96s/it]s/it]

{'train_runtime': 629.1134, 'train_samples_per_second': 0.515, 'train_steps_per_second': 0.033, 'train_loss': 0.0, 'epoch': 3.0}





TrainOutput(global_step=21, training_loss=0.0, metrics={'train_runtime': 629.1134, 'train_samples_per_second': 0.515, 'train_steps_per_second': 0.033, 'total_flos': 85250278158336.0, 'train_loss': 0.0, 'epoch': 3.0})

In [None]:
train_model = AutoModelForSequenceClassification.from_pretrained("./output/20240508/checkpoint-18")
train_tokenizer = AutoTokenizer.from_pretrained("./output/20240508/checkpoint-18")

In [59]:
text = "プリンターがインクが切れたと警告していますが、交換の手順が不明です。案内していただけますか？"
encoding = train_tokenizer.encode_plus(text,
            add_special_tokens=True,
            max_length=train_tokenizer.model_max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

with torch.no_grad():
    logits = train_model(**encoding).logits

print(logits)


tensor([[nan, nan, nan, nan, nan]])


In [None]:
# Run the forward pass again, this time outside torch.no_grad(), and display
# the complete model output object rather than only the logits.
train_model(**encoding)

SequenceClassifierOutput(loss=None, logits=tensor([[nan, nan, nan, nan, nan]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)