In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score

In [None]:
# Custom Dataset wrapping tokenizer output and labels
class TextDataset(Dataset):
    """Map-style dataset pairing tokenized encodings with their labels.

    Args:
        encodings: Mapping from field name (e.g. ``'input_ids'``) to an
            indexable batch of per-example values. Values may be tensors
            or plain nested lists.
        labels: Indexable sequence with one label per example.

    Each item is a dict containing one tensor per encoding field plus a
    ``'labels'`` tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    @staticmethod
    def _as_new_tensor(value):
        """Return ``value`` as an independent tensor.

        ``torch.tensor(existing_tensor)`` emits a copy-construct
        UserWarning on every call, so tensors are copied with
        ``clone().detach()`` and only non-tensor data goes through
        ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of tensors."""
        item = {key: self._as_new_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_new_tensor(self.labels[idx])
        return item
# Tokenization helper
def tokenize_texts(texts):
    """Tokenize ``texts`` with the notebook-level ``tokenizer``.

    Args:
        texts: Object exposing ``.tolist()`` that yields the raw strings
            (the notebook passes the text column of the dataset split).

    Returns:
        The tokenizer's output for the whole batch, requested with
        padding and truncation enabled and PyTorch tensors as the
        return format.
    """
    raw_strings = texts.tolist()
    encoded_batch = tokenizer(
        raw_strings,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
    return encoded_batch

In [None]:
# Read the labelled dataset from disk
df = pd.read_csv('/home/kara/classification/dataset/dataset.csv')
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['txt'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Load the tokenizer that matches the 'bert-base-uncased' checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    'bert-base-uncased',
)

In [None]:
# Tokenize the training split first, then the test split
train_encodings, test_encodings = (
    tokenize_texts(split_texts) for split_texts in (train_texts, test_texts)
)

In [None]:
# Wrap each split's encodings and labels in a TextDataset
# (the DataLoaders themselves are built in the next cell)
train_dataset = TextDataset(
    encodings=train_encodings,
    labels=train_labels.tolist(),
)
test_dataset = TextDataset(
    encodings=test_encodings,
    labels=test_labels.tolist(),
)

In [None]:
# Batch both splits with 128 examples per batch.
# Only the training data is shuffled; the test order stays fixed.
train_loader = DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=128,
    shuffle=False,
)

In [None]:
# Seed torch's global RNG so any random initialisation below and the
# shuffling done by the training DataLoader are repeatable between runs.
torch.manual_seed(42)

# Model: BERT with a 2-class sequence-classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Optimizer: use PyTorch's own AdamW instead of the deprecated
# `transformers.AdamW`. eps and weight_decay are given explicitly because
# torch's defaults (1e-8 / 1e-2) differ from the transformers defaults
# (1e-6 / 0.0) this notebook was previously relying on.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Device: prefer the first GPU, but fall back to CPU instead of crashing
# on machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Loss function applied to the classification logits
criterion = torch.nn.CrossEntropyLoss()

In [None]:
# Number of passes over the training data. The loop previously hard-coded 5
# while this variable said 1 (and was never read); 5 keeps the actual run length.
num_epochs = 5
# Average training loss of each finished epoch (plotted after training)
train_losses = []

# Training loop
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")

    for step, batch in enumerate(progress_bar, start=1):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # The loss used for the update is computed explicitly from the logits.
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        # Show the mean over the batches seen so far; dividing by the total
        # number of batches would under-report the loss until the last step.
        progress_bar.set_postfix(loss=epoch_loss / step)

    # Mean loss over every batch of the epoch
    avg_train_loss = epoch_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    print(f'Epoch {epoch + 1}')
    print(f'Training loss: {avg_train_loss}')

In [None]:
model_path = '/home/kara/classification/model/1'
# Save the fine-tuned model into the model_path directory
model.save_pretrained(model_path)

# Plot the per-epoch average training loss
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(train_losses, label='Training Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
ax.set_title('Training Loss')

# Save BEFORE showing: once plt.show() has run, the figure may already be
# closed and a later savefig would write an empty image. The explicit '/'
# puts the file inside model_path; plain concatenation produced
# '.../model/1training_loss.png' next to the directory instead.
fig.savefig(f'{model_path}/training_loss.png')
plt.show()

In [None]:
# Evaluate the model on the test split (a confusion matrix is built further down).
# Put the model in evaluation mode and start with empty accumulators
# for the predicted and the true labels.
model.eval()
all_preds, all_labels = [], []

In [None]:
# Make this cell self-contained: switch to evaluation mode and start from
# empty accumulators, so re-running the cell never appends a second copy
# of the predictions and skews the metrics.
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Labels are only collected for scoring, so they never need to
        # leave the CPU.
        labels = batch['labels']

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit for each example
        preds = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.tolist())

In [None]:
# Score the collected predictions against the true labels.
# 'micro' or 'macro' can be used for the `average` argument instead of 'binary'.
accuracy = accuracy_score(y_true=all_labels, y_pred=all_preds)
f1 = f1_score(
    y_true=all_labels,
    y_pred=all_preds,
    average='binary',
)

In [None]:
# Build and plot the confusion matrix
cm = confusion_matrix(all_labels, all_preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()

# Save BEFORE showing: after plt.show() the current figure may already be
# closed, so a later savefig would write an empty image. The explicit '/'
# puts the file inside model_path; plain concatenation produced
# '.../model/1matrix.png' next to the directory instead.
plt.savefig(f'{model_path}/matrix.png')
plt.show()

# Report the headline metrics and the raw matrix
print(accuracy)
print(f1)
print(cm)