In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset
import numpy as np

def fine_tune_risk_classifier(texts, labels):
    
    # 加载分词器和预训练模型
    tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-multilingual-cased",
        num_labels=3  # 3种风险类型：利率风险、政治风险、经济风险
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    # 创建数据集类
    class RiskDataset(Dataset):
        def __init__(self, texts, labels, tokenizer, max_length=512):
            self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")
            self.labels = labels
            
        def __getitem__(self, idx):
            item = {key: val[idx] for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx])
            return item
        
        def __len__(self):
            return len(self.labels)
    
    # 将数据集分为训练集和验证集
    from sklearn.model_selection import train_test_split
    train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.1, random_state=42)
    
    # 创建训练集和验证集
    train_dataset = RiskDataset(train_texts, train_labels, tokenizer)
    val_dataset = RiskDataset(val_texts, val_labels, tokenizer)
    
    # 定义评估函数
    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        
        # 计算准确率
        accuracy = (predictions == labels).mean()
        
        return {"accuracy": accuracy}
    
    # 定义训练参数
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=8,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )
    
    # 创建Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )
    
    # 开始微调
    trainer.train()
    
    # 保存模型和分词器
    model_path = "./fine_tuned_risk_classifier"
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)
    
    return model, tokenizer

In [17]:
texts=[]
with open("country_classify.txt", "r") as f:
    for line in f.readlines():
        texts.append(line.strip())

In [18]:
exchange_rate_labels=[0 for i in range(53)]
political_labels=[1 for i in range(23)]
economic_labels=[2 for i in range(51)]
lebels=exchange_rate_labels+political_labels+economic_labels
lebels

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2]

In [19]:
len(lebels),len(texts)

(127, 128)

In [20]:
texts[-1]

''

In [21]:
texts.remove(texts[-1])

In [22]:
len(texts)

127

In [23]:
fine_tune_risk_classifier(texts=texts,labels=lebels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  9%|▉         | 11/120 [00:01<00:17,  6.11it/s]

{'loss': 1.0955, 'grad_norm': 12.26965618133545, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.67}


                                                
 12%|█▎        | 15/120 [00:02<00:16,  6.20it/s]

{'eval_loss': 1.0697728395462036, 'eval_accuracy': 0.38461538461538464, 'eval_runtime': 0.1103, 'eval_samples_per_second': 117.873, 'eval_steps_per_second': 18.134, 'epoch': 1.0}


 18%|█▊        | 21/120 [00:05<00:25,  3.90it/s]

{'loss': 1.0774, 'grad_norm': 3.790902614593506, 'learning_rate': 2.0000000000000003e-06, 'epoch': 1.33}


 25%|██▌       | 30/120 [00:06<00:15,  5.87it/s]

{'loss': 1.0799, 'grad_norm': 12.577013969421387, 'learning_rate': 3e-06, 'epoch': 2.0}


                                                
 25%|██▌       | 30/120 [00:06<00:15,  5.87it/s]

{'eval_loss': 1.025092363357544, 'eval_accuracy': 0.38461538461538464, 'eval_runtime': 0.0388, 'eval_samples_per_second': 334.895, 'eval_steps_per_second': 51.522, 'epoch': 2.0}


 34%|███▍      | 41/120 [00:10<00:14,  5.50it/s]

{'loss': 1.0192, 'grad_norm': 6.5642008781433105, 'learning_rate': 4.000000000000001e-06, 'epoch': 2.67}


                                                
 38%|███▊      | 45/120 [00:11<00:12,  5.93it/s]

{'eval_loss': 0.8873974084854126, 'eval_accuracy': 0.6923076923076923, 'eval_runtime': 0.1062, 'eval_samples_per_second': 122.381, 'eval_steps_per_second': 18.828, 'epoch': 3.0}


 42%|████▎     | 51/120 [00:14<00:17,  3.84it/s]

{'loss': 0.9318, 'grad_norm': 5.5779924392700195, 'learning_rate': 5e-06, 'epoch': 3.33}


 50%|█████     | 60/120 [00:15<00:10,  5.93it/s]

{'loss': 0.8934, 'grad_norm': 9.594331741333008, 'learning_rate': 6e-06, 'epoch': 4.0}


                                                
 50%|█████     | 60/120 [00:15<00:10,  5.93it/s]

{'eval_loss': 0.7587037682533264, 'eval_accuracy': 0.7692307692307693, 'eval_runtime': 0.042, 'eval_samples_per_second': 309.829, 'eval_steps_per_second': 47.666, 'epoch': 4.0}


 59%|█████▉    | 71/120 [00:19<00:08,  5.62it/s]

{'loss': 0.7948, 'grad_norm': 9.109843254089355, 'learning_rate': 7.000000000000001e-06, 'epoch': 4.67}


                                                
 62%|██████▎   | 75/120 [00:19<00:07,  6.05it/s]

{'eval_loss': 0.6326590180397034, 'eval_accuracy': 0.9230769230769231, 'eval_runtime': 0.109, 'eval_samples_per_second': 119.23, 'eval_steps_per_second': 18.343, 'epoch': 5.0}


 68%|██████▊   | 81/120 [00:22<00:09,  4.05it/s]

{'loss': 0.7012, 'grad_norm': 8.35789680480957, 'learning_rate': 8.000000000000001e-06, 'epoch': 5.33}


 75%|███████▌  | 90/120 [00:24<00:04,  6.06it/s]

{'loss': 0.5965, 'grad_norm': 9.299725532531738, 'learning_rate': 9e-06, 'epoch': 6.0}


                                                
 75%|███████▌  | 90/120 [00:24<00:04,  6.06it/s]

{'eval_loss': 0.49047544598579407, 'eval_accuracy': 0.8461538461538461, 'eval_runtime': 0.045, 'eval_samples_per_second': 288.772, 'eval_steps_per_second': 44.426, 'epoch': 6.0}


 84%|████████▍ | 101/120 [00:27<00:03,  5.60it/s]

{'loss': 0.4914, 'grad_norm': 6.363148212432861, 'learning_rate': 1e-05, 'epoch': 6.67}


                                                 
 88%|████████▊ | 105/120 [00:28<00:02,  6.02it/s]

{'eval_loss': 0.3044179081916809, 'eval_accuracy': 1.0, 'eval_runtime': 0.1071, 'eval_samples_per_second': 121.399, 'eval_steps_per_second': 18.677, 'epoch': 7.0}


 92%|█████████▎| 111/120 [00:30<00:02,  4.08it/s]

{'loss': 0.3872, 'grad_norm': 6.000543117523193, 'learning_rate': 1.1000000000000001e-05, 'epoch': 7.33}


100%|██████████| 120/120 [00:32<00:00,  5.92it/s]

{'loss': 0.2419, 'grad_norm': 5.796082019805908, 'learning_rate': 1.2e-05, 'epoch': 8.0}


                                                 
100%|██████████| 120/120 [00:34<00:00,  5.92it/s]

{'eval_loss': 0.19464987516403198, 'eval_accuracy': 1.0, 'eval_runtime': 0.0361, 'eval_samples_per_second': 360.443, 'eval_steps_per_second': 55.453, 'epoch': 8.0}


100%|██████████| 120/120 [00:36<00:00,  3.30it/s]


{'train_runtime': 36.3365, 'train_samples_per_second': 25.099, 'train_steps_per_second': 3.302, 'train_loss': 0.7758480389912923, 'epoch': 8.0}


(BertForSequenceClassification(
   (bert): BertModel(
     (embeddings): BertEmbeddings(
       (word_embeddings): Embedding(119547, 768, padding_idx=0)
       (position_embeddings): Embedding(512, 768)
       (token_type_embeddings): Embedding(2, 768)
       (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
       (dropout): Dropout(p=0.1, inplace=False)
     )
     (encoder): BertEncoder(
       (layer): ModuleList(
         (0-11): 12 x BertLayer(
           (attention): BertAttention(
             (self): BertSdpaSelfAttention(
               (query): Linear(in_features=768, out_features=768, bias=True)
               (key): Linear(in_features=768, out_features=768, bias=True)
               (value): Linear(in_features=768, out_features=768, bias=True)
               (dropout): Dropout(p=0.1, inplace=False)
             )
             (output): BertSelfOutput(
               (dense): Linear(in_features=768, out_features=768, bias=True)
               (LayerNorm): L

In [24]:
from transformers import BertForSequenceClassification, BertTokenizerFast

def load_finetuned_model(model_path):
    """
    加载微调后的BERT模型和tokenizer。
    
    参数:
    model_path (str): 微调模型保存的路径
    
    返回:
    model (BertForSequenceClassification): 微调后的BERT模型
    tokenizer (BertTokenizerFast): 与模型对应的tokenizer
    """
    # 加载微调后的模型
    model = BertForSequenceClassification.from_pretrained(model_path)
    
    # 加载tokenizer
    tokenizer = BertTokenizerFast.from_pretrained(model_path)
    
    return model, tokenizer
model_path = './fine_tuned_risk_classifier'
model, tokenizer = load_finetuned_model(model_path)


In [None]:
import random
from transformers import BertForSequenceClassification, BertTokenizerFast
import torch

def load_finetuned_model(model_path):
    model = BertForSequenceClassification.from_pretrained(model_path)
    tokenizer = BertTokenizerFast.from_pretrained(model_path)
    return model, tokenizer

def predict(model, tokenizer, text):
    # 准备输入
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    
    # 模型评估模式
    model.eval()

    # 禁用梯度计算
    with torch.no_grad():
        # 获取logits输出
        outputs = model(**inputs)
    
    # 获取类别预测结果
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=-1).item()

    return predicted_class

def evaluate_accuracy(model, tokenizer, texts, true_labels, sample_size):
    # 随机选择 sample_size 个文本
    selected_texts = random.sample(list(zip(texts, true_labels)), sample_size)

    # 预测结果与真实标签比较
    correct = 0
    for text, true_label in selected_texts:
        predicted_label = predict(model, tokenizer, text)
        if predicted_label == true_label:
            correct += 1

    # 计算准确率
    accuracy = correct / sample_size
    return accuracy

# 使用示例
model_path = './fine_tuned_risk_classifier'
model, tokenizer = load_finetuned_model(model_path)


sample_size = len(lebels)
accuracy = evaluate_accuracy(model, tokenizer, texts, true_labels=lebels, sample_size=sample_size)

print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 100.00%
