In [1]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from dataclasses import dataclass
from typing import Dict, Sequence
from datasets.dataset_dict import DatasetDict as ddict
# from pyarrow.dataset import dataset
from datasets.arrow_dataset import Dataset
import pyarrow as pa
from transformers import (
    BertForSequenceClassification,
    AutoTokenizer,
    HfArgumentParser,
    PreTrainedTokenizer,
    TrainingArguments,
    Trainer,
)
from transformers import DataCollatorWithPadding
from transformers.hf_argparser import HfArg
import json
import os

In [2]:
def get_datasets(dir):
    """Recursively collect the paths of all files below ``dir``.

    Args:
        dir: Directory to scan (relative or absolute path).

    Returns:
        list[str]: One path per non-directory entry found under ``dir``,
        each prefixed with ``dir`` and joined with the platform's path
        separator. Entries are visited in sorted name order so the result
        is reproducible across runs and file systems.

    Raises:
        OSError: If ``dir`` or one of its sub-directories cannot be listed.
    """
    res = []
    # Build paths with os.path.join instead of os.chdir-ing into each
    # directory: the process working directory is never modified, so an
    # exception cannot strand the kernel in a sub-folder, nested inputs such
    # as 'a/b' work, and the separator is correct on every platform.
    for entry in sorted(os.listdir(dir)):
        path = os.path.join(dir, entry)
        if os.path.isdir(path):
            res.extend(get_datasets(path))
        else:
            res.append(path)
    return res

In [3]:
# Collect the path of every file found (recursively) under the dataset root.
dataset_names = get_datasets('face2_zh_json')

In [4]:
def modify_dataset(names):
    """Build a labelled text dataset from a list of JSON file paths.

    Files whose path contains ``'human'`` are read as a sequence of records
    with ``'input'`` and ``'output'`` keys and are labelled 0. Every other
    file is read as a mapping whose ``'input'`` and ``'output'`` entries are
    indexed by the string form of the row number, and is labelled 1. Each
    example's text is ``input + '[SEP]' + output``.

    Args:
        names: Iterable of JSON file paths.

    Returns:
        Dataset: A dataset with two columns, ``text`` (string) and
        ``label`` (int64).
    """
    sentence = []
    labels = []
    for name in names:
        # Context manager closes the handle deterministically; explicit UTF-8
        # avoids depending on the platform's default text encoding.
        with open(name, encoding='utf-8') as fh:
            raw = json.load(fh)
        if 'human' in name:
            for record in raw:
                sentence.append(record['input'] + '[SEP]' + record['output'])
                labels.append(0)
        else:
            inputs, outputs = raw['input'], raw['output']
            for i in range(len(outputs)):
                key = str(i)  # rows are keyed by their index as a string
                sentence.append(inputs[key] + '[SEP]' + outputs[key])
                labels.append(1)
    # Build the two columns directly instead of materialising one dict per row
    # and converting a struct array; explicit types keep the schema fixed even
    # when the lists are empty.
    table = pa.table({
        'text': pa.array(sentence, type=pa.string()),
        'label': pa.array(labels, type=pa.int64()),
    })
    return Dataset(table)


<!--
table = modify_dataset(dataset_names)
# raw = json.load(open(dataset_names[0]))

# # 构造数据
# sentence = []

# for i in range(len(raw['output'])):
#     sentence.append(raw['input'][str(i)]+'[SEP]'+raw['output'][str(i)])

# table = pa.table(
#             pa.array( [{'text': data,
#                          'label': 1 } for data in sentence],
#             type=pa.struct([('text',pa.string()),
#                             ('label',pa.int64())])
#             )
#     ) 
datas = ddict({'news':Dataset(table)})
tokenizer = AutoTokenizer.from_pretrained('model')
datas['news'][0]
-->


In [5]:
# Load the tokenizer from the 'model' checkpoint (the same identifier the
# classifier is loaded from later in this notebook).
tokenizer = AutoTokenizer.from_pretrained('model')

In [6]:
def choose_name(names,types):
    """Pick the entries of ``names`` that mention any of the requested types.

    ``types`` is a hyphen-separated string (e.g. ``'news-wiki'``). For each
    part, in order, every name containing that part as a substring is
    emitted; a name matching several parts therefore appears once per match.
    """
    return [
        candidate
        for part in types.split('-')
        for candidate in names
        if part in candidate
    ]

In [7]:
# Dataset configurations: the three single domains plus each pairwise
# combination (hyphen-joined names are split apart by choose_name).
types = [
    'news',
    'webnovel',
    'wiki',
    'webnovel-wiki',
    'news-wiki',
    'news-webnovel',
]
# One dataset per configuration, keyed by the configuration name.
datas = ddict({
    domain: modify_dataset(choose_name(dataset_names, domain))
    for domain in types
})

In [19]:
# Tokenize the 'text' column of every split in batches, letting the tokenizer
# truncate over-long inputs.
datas = datas.map(
    lambda examples: tokenizer(examples['text'], truncation=True),
    batched=True,
)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [20]:
# Batch collator built from the tokenizer; it is handed to the Trainer so that
# examples are padded when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [21]:
from transformers import AutoModelForSequenceClassification

# Sequence classifier with a two-class head, matching the 0/1 labels assigned
# in modify_dataset.
model = AutoModelForSequenceClassification.from_pretrained("model", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at model and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
from transformers import TrainingArguments, Trainer

In [23]:
training_args = TrainingArguments(
    # Output location; nothing is pushed to the Hub.
    output_dir='output',
    push_to_hub=False,
    # Optimisation hyper-parameters.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    # when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

<!--import evaluate
import numpy as np

accuracy = evaluate.load('accuracy')
# If you have problem connecting to huggingface, you can git clone the evaluate repo https://github.com/huggingface/evaluate.git
# and copy the `metrics/accuracy` folder to your current directory

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)
-->

In [24]:
import evaluate
import numpy as np

# Load the evaluation metrics used by compute_metrics.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")

def compute_metrics(eval_pred):
    """Score a batch of predictions with accuracy, F1, precision and recall.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; the predicted class is
            taken as the argmax of ``predictions`` along axis 1.

    Returns:
        dict: The merged outputs of the four metric objects, with F1,
        precision and recall computed using ``average="binary"``.
    """
    scores_per_class, references = eval_pred
    predicted = np.argmax(scores_per_class, axis=1)  # predicted class per example

    # Accuracy takes no averaging mode; the other three share the binary one.
    merged = dict(accuracy_metric.compute(predictions=predicted, references=references))
    for metric in (f1_metric, precision_metric, recall_metric):
        merged.update(
            metric.compute(predictions=predicted, references=references, average="binary")
        )
    return merged

In [25]:
# Cross-domain setup: fine-tune on the 'news' split and evaluate on 'webnovel'.
# NOTE(review): recent transformers releases deprecate `tokenizer=` in favour
# of `processing_class=` — confirm against the installed version before
# changing this argument.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datas['news'],
    eval_dataset=datas['webnovel'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [26]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 