In [1]:
class Work:
    def __init__(self, i, t, c, l):
        self.title = t
        self.content = c
        self.labels = l
        self.id = i
    def __str__(self):
        return f"id: \"{self.id}\"\ntitle: \"{self.title}\"\ncontent: \"{self.content}\"\nlabels: {self.labels}\n"

In [2]:
def load_jsonl(path):
    data=[]
    with open(path, 'r', encoding='utf-8') as reader:
        for line in reader:
            data.append(json.loads(line))
    return data

In [3]:
def load_csv(path):
    data=[]
    with open(path, 'r', encoding='utf-8') as file:
        rows = csv.reader(file)
        for row in rows:
            label = {"id": row[0], "name": row[1]}
            data.append(label)
    return data

In [4]:
def add_str(datas):
    string = ""
    for data in datas:
        t_str = data['body']
        t_str = t_str.replace(u'\u3000', u'')
        string += t_str
    return string

In [5]:
def create_work(data):
    if data['labels'] != None:
        t_labels = create_label_vector(labels, data['labels'])
    else:
        t_labels = create_label_vector(labels, [""])
    
    w = Work(data['id'], data['metadata']['title'], add_str(data['content'])[:512], t_labels)
    return w

In [6]:
def create_label_vector(total_labels, target_labels):
    return_label = []
    for i, label in enumerate(total_labels):
        for t_label in target_labels:
            if label['name'] == t_label:
                return_label.append(label['id'])
    return return_label

In [7]:
import csv
labels = load_csv("label_list_cleaned.csv")
print(labels[15:25])

[{'id': '11', 'name': 'TS'}, {'id': '12', 'name': 'スキル'}, {'id': '13', 'name': '夫婦'}, {'id': '14', 'name': 'ステータス'}, {'id': '15', 'name': '学生'}, {'id': '16', 'name': '後輩'}, {'id': '17', 'name': '少年'}, {'id': '18', 'name': '社会人'}, {'id': '18', 'name': 'サラリーマン'}, {'id': '19', 'name': '大学生'}]


In [8]:
unsortIds = [label['id'] for label in labels]
ids = []
for id in unsortIds:
    if id not in ids:
        ids.append(id)
print(len(ids))

312


In [9]:
id2label = {label['id']:label['name'] for label in labels}
label2id = {label['name']:label['id'] for label in labels}
print(label2id)

{'男主人公': '1', '男性主人公': '1', '女主人公': '2', '女性主人公': '2', 'チート': '3', '主人公最強': '4', '最強': '4', '剣と魔法': '5', '剣': '6', '魔術': '7', '悪役令嬢': '8', '令嬢': '9', 'お嬢様': '9', '探偵': '10', '性転換': '11', 'TS': '11', 'スキル': '12', '夫婦': '13', 'ステータス': '14', '学生': '15', '後輩': '16', '少年': '17', '社会人': '18', 'サラリーマン': '18', '大学生': '19', '高校生': '20', '中学生': '21', '小学生': '22', 'おっさん': '23', 'ヒーロー': '24', '聖女': '25', '記憶喪失': '26', '狂気': '27', 'オタク': '28', '女装': '29', 'シスコン': '30', '恋人': '31', '英雄': '32', 'ぼっち': '33', '陰キャ': '34', '陰陽師': '35', '変身': '36', '作家': '37', '教師': '38', '先生': '38', '魔法': '39', '学園': '40', 'ダンジョン': '41', '超能力': '42', '異能': '42', '異能力': '42', '能力': '42', '異能力バトル': '42', '異能バトル': '42', 'ミリタリー': '43', '音楽': '44', '学校': '45', '高校': '45', '大学': '45', '料理': '46', '部活': '47', '海': '48', '宇宙': '49', 'スポーツ': '50', 'スマホ': '51', '桜': '52', '銃': '53', '喫茶店': '54', '野球': '55', '月': '56', 'ピアノ': '57', '宇宙人': '58', '神社': '59', '花': '60', '錬金術': '61', '病院': '62', '電車': '63', '直観': '64', '図書館': '65', '戦

In [10]:
import json
import tqdm
import time
from os import walk

works = []
st_time = time.time()
filenames = next(walk("testdata"),  (None, None, []))[2]
for filename in filenames:
    datas = load_jsonl(f"testdata/{filename}")
    
    print(f"{filename} original lines: {len(datas)}")
    for data in datas:
        if data['labels'] == None:
            datas.remove(data)
    print(f"{filename} cleaned lines: {len(datas)}")
    
    for data in tqdm.tqdm(datas):
        w = create_work(data)
        work = {"id": w.id, "title": w.title, "content": w.content, "labels": w.labels}
        works.append(work)
        
print(f"Time: {time.time()-st_time}")

117735405488A-512.jsonl original lines: 9585
117735405488A-512.jsonl cleaned lines: 9236


100%|████████████████████████████████████████████████████████████████████████████| 9236/9236 [00:02<00:00, 3152.47it/s]


117735405488B-512.jsonl original lines: 6480
117735405488B-512.jsonl cleaned lines: 6253


100%|████████████████████████████████████████████████████████████████████████████| 6253/6253 [00:02<00:00, 2129.04it/s]


117735405488C-512.jsonl original lines: 8156
117735405488C-512.jsonl cleaned lines: 7898


100%|████████████████████████████████████████████████████████████████████████████| 7898/7898 [00:03<00:00, 2392.05it/s]


117735405489A-512.jsonl original lines: 7418
117735405489A-512.jsonl cleaned lines: 7168


100%|████████████████████████████████████████████████████████████████████████████| 7168/7168 [00:03<00:00, 1838.33it/s]


117735405489B-512.jsonl original lines: 6471
117735405489B-512.jsonl cleaned lines: 6322


100%|████████████████████████████████████████████████████████████████████████████| 6322/6322 [00:03<00:00, 1800.31it/s]


11773540549-512.jsonl original lines: 7539
11773540549-512.jsonl cleaned lines: 7254


100%|████████████████████████████████████████████████████████████████████████████| 7254/7254 [00:03<00:00, 1890.70it/s]


1177354055-512.jsonl original lines: 2958
1177354055-512.jsonl cleaned lines: 2861


100%|████████████████████████████████████████████████████████████████████████████| 2861/2861 [00:01<00:00, 2334.15it/s]


1681641041-512.jsonl original lines: 466
1681641041-512.jsonl cleaned lines: 447


100%|██████████████████████████████████████████████████████████████████████████████| 447/447 [00:00<00:00, 1689.68it/s]


1681645221-512.jsonl original lines: 6774
1681645221-512.jsonl cleaned lines: 6607


100%|████████████████████████████████████████████████████████████████████████████| 6607/6607 [00:02<00:00, 2722.94it/s]


1681645222-512.jsonl original lines: 3886
1681645222-512.jsonl cleaned lines: 3749


100%|████████████████████████████████████████████████████████████████████████████| 3749/3749 [00:01<00:00, 2498.90it/s]


1681670018-512.jsonl original lines: 9
1681670018-512.jsonl cleaned lines: 9


100%|██████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 1635.77it/s]


1681670042-512.jsonl original lines: 9254
1681670042-512.jsonl cleaned lines: 8837


100%|████████████████████████████████████████████████████████████████████████████| 8837/8837 [00:02<00:00, 3300.51it/s]


1681692761-512.jsonl original lines: 12
1681692761-512.jsonl cleaned lines: 12


100%|████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 2665.87it/s]


1681692785-512.jsonl original lines: 1788
1681692785-512.jsonl cleaned lines: 1744


100%|████████████████████████████████████████████████████████████████████████████| 1744/1744 [00:00<00:00, 5091.09it/s]


1681692786-512.jsonl original lines: 171
1681692786-512.jsonl cleaned lines: 168


100%|██████████████████████████████████████████████████████████████████████████████| 168/168 [00:00<00:00, 7465.39it/s]


4852201425-512.jsonl original lines: 1087
4852201425-512.jsonl cleaned lines: 1062


100%|████████████████████████████████████████████████████████████████████████████| 1062/1062 [00:00<00:00, 1508.27it/s]

Time: 88.33619332313538





In [None]:
"""
print(len(works))
for work in tqdm.tqdm(works):
    if work['labels'] == []:
        works.remove(work)
print(len(works))
#print(works[6])
"""

69627


 85%|███████████████████████████████████████████████████████████████           | 59328/69627 [00:54<00:09, 1087.34it/s]

59328





In [75]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
"""
CUT DOWN WORKS
"""
works = works[:512]
print(len(works))
works_id = [work['id'] for work in works]
works_title = [work['title'] for work in works]
works_content = [work['content'] for work in works]

#mlb = MultiLabelBinarizer(classes=(ids))
works_labels = [work['labels'] for work in works]
works_labels = mlb.fit_transform(works_labels)
works_labels = np.array(works_labels, dtype=np.float)
#print(works_labels)
works_df = pd.DataFrame({'id': works_id, 'title': works_title, 'content': works_content, 'labels': works_labels.tolist()})
#works_df = pd.DataFrame({'id': works_id, 'title': works_title, 'content': works_content, 'labels': works_labels})
works_df.head()

512


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  works_labels = np.array(works_labels, dtype=np.float)


Unnamed: 0,id,title,content,labels
0,1177354054880199356,彼女は頭の上にミカンを乗せていた。ミカンセイ空間にようこそ,web小説ですし、出来るだけくだけた書き方でいきます。この作品はノンセンス(荒唐無稽な物事を...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1177354054880199370,ノー・イヤー・ヒーロー,常識が変わる瞬間を見たことがあるか？例えばクラハムベルがはじめて遠距離通信を行った瞬間とか、...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1177354054880199563,星の代わりに,今は午後一時半。お腹に少しは物を入れている。タバコもいっぷくすませたし、あとにすることは限ら...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1177354054880199764,長耳のベアラー,必要とされる場所には自然と人は集まってくる。人が集まれば活気が出来上がり、夜でも賑やかな街に...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1177354054880199944,海の底へ,「玲さんは、なんでこんなところまで来たの？」拓真に聞かれる。「…だから、拓真に会いにだよ。」...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [76]:
import datasets
from datasets import Dataset
dataset = Dataset.from_pandas(works_df)
train_testvalid = dataset.train_test_split(test_size=0.2)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
dataset = datasets.DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']
})

In [77]:
print(type(dataset['train'][0]['labels'][0]))

<class 'float'>


In [78]:
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking', mecab_kwargs={"mecab_dic": "unidic", "mecab_option": None})

def preprocess_data(work):
    text = work['content']
    encoding = tokenizer(text, max_length=512, truncation=True, padding="max_length")
    encoding['labels'] = work['labels']
    #print(encoding)
    return encoding

loading configuration file https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking/resolve/main/config.json from cache at C:\Users\User/.cache\huggingface\transformers\573af37b6c39d672f2df687c06ad7d556476cbe43e5bf7771097187c45a3e7bf.abeb707b5d79387dd462e8bfb724637d856e98434b6931c769b8716c6f287258
Model config BertConfig {
  "_name_or_path": "cl-tohoku/bert-base-japanese-whole-word-masking",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertJapaneseTokenizer",
  "transformers_version": "4.19.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_s

In [None]:
encoding_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
exp = encoding_dataset['train'][0]
print(exp)
print(type(exp['labels'][0]))
tokenizer.decode(exp['input_ids'])

In [48]:
encoding_dataset.set_format("torch")

In [49]:
print(encoding_dataset['train'][0])

{'labels': tensor([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0

In [50]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "cl-tohoku/bert-base-japanese-whole-word-masking", 
    problem_type="multi_label_classidication", 
    num_labels=len(ids),
    id2label=id2label,
    label2id=label2id
)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

In [51]:
batch_size = 4
metric_name = "f1"

In [52]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-sem_eval-japanese",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = 2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name
)

In [53]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

def multi_label_metrics(pred, labels, threshold=0.5):
    sigmoid = torch.nn.sigmoid()
    probs = sigmoid(torch.Tensor(pred))
    
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(predictions=preds, labels=p.label_ids)
    return result

In [54]:
encoding_dataset['train'][0]['labels'].type()

'torch.FloatTensor'

In [55]:
encoding_dataset['train']['input_ids'][0]

tensor([    2,  6950,   324,     7,     6,    52,   108,     5,   390,    14,
         1991,     8,   390,     9, 21010,   276,    11,  8803,     6,  9871,
        29087,    18,  2149,     5,   109,     7,    52,   181,     5,   324,
           11, 14138,     8, 13059,     7, 15344, 28790,     6,   319,     7,
            9,  3477,     8, 17923, 17356,    13,    15,    10,    59,  4307,
            7,     6,  7672, 28713,     9,  9109,   704,     7,  1285,     5,
         2436,    11,  2773,    10,     8,    36, 30239, 30239, 12077,  8268,
          679,    38, 11283,  1381,    11,   501,    20,     6,  7672, 28713,
            9,  1042,  7263,    15,    16,    21,    10, 14726,    11,   311,
            7, 13043,    16,  2679,    11,  7043,     8,  1076, 29273,     5,
          174,     7,  2021,     6,  7672, 28713,     5,   283,    11,   212,
           16,    33,     5,     9, 14833,   813,     5,  2633,  2845,     8,
         1257,   129,     7,   840,    15,     6,  2095,    28, 

In [56]:
outputs = model(input_ids=encoding_dataset['train']['input_ids'][0].unsqueeze(0), labels=encoding_dataset['train'][0]['labels'].unsqueeze(0))
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-0.5101,  0.5825,  0.0387,  0.6508,  0.2820, -0.5747, -0.0511,  0.2326,
         -0.3519,  0.0852,  0.5187,  0.5181,  0.1006, -0.6930, -0.0943, -0.4156,
         -0.4098, -0.6603, -0.1107,  0.1329,  0.2678,  0.1894, -0.0331,  0.6075,
          0.6987, -0.5988, -0.3474, -0.1392,  0.1285, -0.3040, -0.3538, -0.2726,
         -0.1459,  0.3945,  0.1916, -0.3962, -0.3057, -0.4619, -0.1243, -0.2535,
          0.2153, -0.3325,  0.1384,  0.4546, -0.1798, -0.2306, -0.1453,  0.3275,
         -0.2980, -0.1530,  0.1781, -0.2820, -0.5926, -0.0785,  0.0161, -0.1399,
          0.2831, -0.0034, -0.6102, -0.3194,  0.6924, -0.2350,  0.3996, -0.1663,
          0.1804, -0.1801,  0.2892,  0.5647, -0.0252, -0.0055,  0.2606,  0.5206,
         -0.1807,  0.1469, -0.0511,  0.0678,  0.1408, -0.1781, -0.0155, -0.0674,
          0.1103,  1.2215,  0.0389,  0.5585, -0.2788,  0.1012,  0.1716,  0.1601,
         -0.0604,  0.4992,  0.1670, -0.0608, -0.0327, -0.2313,  0.

In [57]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoding_dataset["train"],
    eval_dataset=encoding_dataset["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [58]:
import torch
torch.cuda.empty_cache()

In [59]:
trainer.train()

***** Running training *****
  Num examples = 409
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 515


KeyError: 'loss'