# Fine-tuning a model with the Trainer API or Keras

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Load the "mrpc" configuration of the "glue" dataset; the recorded output of
# this cell shows train / validation / test splits being generated.
raw_datasets = load_dataset("glue", "mrpc")
# Checkpoint name shared by the tokenizer below and the models loaded in later cells.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the "sentence1"/"sentence2" fields of ``example`` as a pair.

    Args:
        example: Mapping that provides "sentence1" and "sentence2". It is used
            with ``Dataset.map(..., batched=True)`` in this notebook, so each
            value is presumably a list of strings — confirm against the
            caller if reused elsewhere.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the sentence pair,
        called with ``truncation=True``.
    """
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)


# Apply tokenize_function to every split; batched=True hands the function
# batches of examples instead of one example at a time.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Collator built from the tokenizer; it is passed to Trainer as `data_collator`
# in later cells.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [5]:
import zipfile
import os

# Location of the ZIP archive and the directory it is extracted into.
zip_file_path = '/content/data.zip'  # archive to extract
extract_to_dir = '/content/data'  # destination directory for the extracted files

# Create the destination directory if it is missing. exist_ok=True makes this a
# no-op when it already exists, avoiding a separate check-then-create step.
os.makedirs(extract_to_dir, exist_ok=True)

# Extract every member of the archive into the destination directory.
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to_dir)

print(f"文件已成功解压到 {extract_to_dir}")


文件已成功解压到 /content/data


In [33]:
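# Text preprocessing: run the Hugging Face TweetNLP (cardiffnlp) NER model over the tweets
# and collect the recognised entities as aspects.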
import os
import json
# Use a pipeline as a high-level helper
from transformers import pipeline

# Token-classification (NER) pipeline for tweets. The recorded output of this
# cell shows one dict per tagged sub-word token with the keys
# 'entity', 'score', 'index', 'word', 'start' and 'end'.
ner_model = pipeline("token-classification", model="cardiffnlp/twitter-roberta-base-ner7-latest")


def _aspect_span(start_idx, end_idx):
    """Return ``[start, end]`` for a multi-token entity, ``[start]`` for a single token."""
    if start_idx < end_idx:
        return [start_idx, end_idx]
    return [start_idx]


def extract_aspects(entities):
    """Group BIO-tagged tokens into aspects.

    Args:
        entities: Iterable of token dicts as produced by ``ner_model``; the
            keys 'entity' (label such as 'B-person' / 'I-person'), 'index'
            (token position) and 'word' (token text) are read.

    Returns:
        dict with two parallel lists:
            "aspect_spans": ``[start, end]`` token indices per entity, or
                ``[start]`` when the entity is a single token.
            "aspect_texts": the list of token strings of each entity.
    """
    aspects = {"aspect_spans": [], "aspect_texts": []}
    tokens = []
    start_idx = end_idx = None

    for item in entities:
        label = item['entity']
        token_idx = item['index']

        # An 'I-' token only extends the open entity when it directly follows
        # the entity's last token; otherwise unrelated tokens further along in
        # the tweet would be glued onto the previous entity.
        is_continuation = (
            label.startswith('I-')
            and bool(tokens)
            and token_idx == end_idx + 1
        )

        if is_continuation:
            tokens.append(item['word'])
            end_idx = token_idx
        elif label.startswith(('B-', 'I-')):
            # 'B-' starts a new entity. A stray 'I-' (no open entity, or not
            # adjacent to it) is treated the same way, so a span can never be
            # recorded with a missing (None) start index.
            if tokens:
                aspects["aspect_spans"].append(_aspect_span(start_idx, end_idx))
                aspects["aspect_texts"].append(tokens)
            tokens = [item['word']]
            start_idx = end_idx = token_idx

    # Flush the entity that was still open when the tokens ran out.
    if tokens:
        aspects["aspect_spans"].append(_aspect_span(start_idx, end_idx))
        aspects["aspect_texts"].append(tokens)

    return aspects


folder_path = './data/data/'
result = {}

# Sort the listing so the subset selected by MAX_FILES is reproducible;
# os.listdir() returns entries in arbitrary order.
filenames = sorted(os.listdir(folder_path))
file_num = len(filenames)

# Cap on the number of directory entries examined (non-.txt entries count too).
# NOTE(review): the original loop stopped before the 10th entry, i.e. it looked
# at 9 entries — confirm whether 10 (or no cap at all) was intended.
MAX_FILES = 9

for cur_num, filename in enumerate(filenames[:MAX_FILES], start=1):
    if not filename.endswith(".txt"):
        continue

    file_path = os.path.join(folder_path, filename)
    # Only hold the file open while reading; inference happens afterwards.
    with open(file_path, 'r', encoding='utf-8') as file:
        tweet = file.read()

    entities = ner_model(tweet)
    # Debug trace of the raw per-token predictions.
    for item in entities:
        print(item)

    # Results are keyed by the file name without its extension.
    cur_idx = os.path.splitext(filename)[0]
    result[cur_idx] = extract_aspects(entities)


# Destination of the JSON dump written by the following cell.
output_path = '/content/aspects.json'
print('NER finished.')

{'entity': 'B-corporation', 'score': 0.5751563, 'index': 3, 'word': '@', 'start': 1, 'end': 2}
{'entity': 'I-corporation', 'score': 0.8865936, 'index': 4, 'word': 'BBC', 'start': 2, 'end': 5}
{'entity': 'I-corporation', 'score': 0.924744, 'index': 5, 'word': '_', 'start': 5, 'end': 6}
{'entity': 'I-corporation', 'score': 0.8834677, 'index': 6, 'word': 'Travel', 'start': 6, 'end': 12}
{'entity': 'B-location', 'score': 0.8820649, 'index': 25, 'word': 'Canada', 'start': 81, 'end': 87}
{'entity': 'B-event', 'score': 0.4296548, 'index': 1, 'word': 'C', 'start': 0, 'end': 1}
{'entity': 'B-person', 'score': 0.9581105, 'index': 12, 'word': 'Ġ@', 'start': 46, 'end': 47}
{'entity': 'I-person', 'score': 0.97571534, 'index': 13, 'word': 'Matthew', 'start': 47, 'end': 54}
{'entity': 'I-person', 'score': 0.9885982, 'index': 14, 'word': 'K', 'start': 54, 'end': 55}
{'entity': 'I-person', 'score': 0.9891704, 'index': 15, 'word': 'ell', 'start': 55, 'end': 58}
{'entity': 'I-person', 'score': 0.9863468,

In [31]:
# Save the `result` dictionary as a JSON file at `output_path`.
# ensure_ascii=False writes non-ASCII characters as-is; indent=4 pretty-prints.
with open(output_path, 'w', encoding='utf-8') as json_file:
    json.dump(result, json_file, ensure_ascii=False, indent=4)

print(f"文件已成功保存到 {output_path}")

文件已成功保存到 /content/aspects.json


In [None]:
from transformers import TrainingArguments

# Only the first positional argument is given ("test-trainer", the output
# directory in the Transformers API); every other setting keeps its default.
training_args = TrainingArguments("test-trainer")

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint for sequence classification, configured for 2 labels.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
from transformers import Trainer

# Bundle the model, training arguments, train/validation splits and collator
# into a Trainer. The tokenizer is passed as `processing_class`, the replacement
# for the deprecated `tokenizer=` keyword in current Transformers releases
# (the notebook installs Transformers unpinned, i.e. the latest version).
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
)

In [None]:
# Run training with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Run the trainer over the validation split. The result exposes the model
# outputs (`predictions`) and the reference labels (`label_ids`); their shapes
# are printed as a sanity check.
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

(408, 2) (408,)

In [None]:
import numpy as np

# Predicted class = index of the largest value along the last axis.
preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
import evaluate

# Load the metric for GLUE/MRPC and score the predicted classes against the
# reference labels; the recorded output reports 'accuracy' and 'f1'.
metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.8578431372549019, 'f1': 0.8996539792387542}

In [None]:
def compute_metrics(eval_preds):
    """Score a batch of evaluation predictions with the GLUE/MRPC metric.

    Args:
        eval_preds: Pair ``(logits, labels)``; it is unpacked into exactly
            two values.

    Returns:
        The result of the metric's ``compute`` call, given the argmax of
        ``logits`` over the last axis as predictions and ``labels`` as
        references.
    """
    # The metric is loaded on every call, before the inputs are unpacked.
    glue_metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predicted_classes = np.argmax(logits, axis=-1)
    return glue_metric.compute(predictions=predicted_classes, references=labels)

In [None]:
# Same setup as before, but evaluate at the end of every epoch and report
# metrics via compute_metrics. `eval_strategy` is the current name of the
# deprecated `evaluation_strategy` argument, and `processing_class` replaces the
# deprecated `tokenizer=` keyword (the notebook installs Transformers unpinned,
# i.e. the latest version).
training_args = TrainingArguments("test-trainer", eval_strategy="epoch")
# Start again from the pretrained checkpoint rather than the model trained above.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Train with the Trainer configured in the previous cell.
trainer.train()