In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForSequenceClassification, AdamW
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns 

In [None]:
# --- Experiment configuration ---
MODEL_NAME = 'bert-base-uncased'  # pretrained checkpoint for tokenizer and classifier
DATA_PATH = 'Potato-Prolific-Dataset-main/dataset/offensiveness/raw_data.csv'
BATCH_SIZE = 16
MAX_LENGTH = 128  # sequences are padded/truncated to this many tokens
EPOCHS = 50  # upper bound; early stopping may end training sooner
LEARNING_RATE = 1e-6
WEIGHT_DECAY = 0.05
# Seed for the data splits and the NumPy / PyTorch RNGs. For the repeated runs,
# change the random seed to 128, 256, 512, 2025, then change the model name
# and train the model again.
RANDOM_STATE = 42
PATIENCE_LIMIT = 4  # epochs without validation macro-F1 improvement before stopping

# --- Device selection ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("Using CPU")

# --- Reproducibility ---
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
if device.type == "cuda":
    torch.cuda.manual_seed_all(RANDOM_STATE)

In [None]:
# Load the raw annotation table. A missing file raises instead of calling
# exit(): exit() is meant for interactive shells, whereas an exception stops
# execution at this cell with a traceback and remains catchable.
try:
    df_raw = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"ERROR: Data file '{DATA_PATH}' not found. Please check that the path is correct."
    ) from err
print(f"Original dataset size: {len(df_raw)}")

# Keep only the rows whose 'race' column equals 'White', and only the two
# columns used downstream.
df_white = df_raw.loc[df_raw['race'] == 'White', ['text', 'offensiveness']].copy()
print(f"The size of the white data set after screening: {len(df_white)}")

# Drop incomplete rows and normalise dtypes without in-place mutation, so
# re-running the cell always starts from df_raw and gives the same result.
df_white = (
    df_white
    .dropna(subset=['text', 'offensiveness'])
    .assign(
        text=lambda frame: frame['text'].astype(str),
        offensiveness=lambda frame: frame['offensiveness'].astype(int),
    )
)
print(f"White data set size after removing missing values: {len(df_white)}")

# Fail before any label statistics are computed on an empty frame.
if df_white.empty:
    raise ValueError(
        "Error: No data left after filtering and processing. Please check your data and filtering conditions."
    )

# Binarise the rating: values greater than 1 become class 1, everything else class 0.
df_white = df_white.assign(labels=(df_white['offensiveness'] > 1).astype(int))
print("\nBinary label distribution (0: non-aggressive, 1: aggressive):")
print(df_white['labels'].value_counts().sort_index())

In [None]:
# Two-stage stratified split: first hold out 20% of the data as the test set,
# then take 25% of the remainder as the validation set.
train_val_df, test_df = train_test_split(
    df_white,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=df_white['labels'],
)
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.25,
    random_state=RANDOM_STATE,
    stratify=train_val_df['labels'],
)

# Report the size of every split.
print(f"\n训练集大小: {len(train_df)}")
print(f"验证集大小: {len(val_df)}")
print(f"测试集大小: {len(test_df)}")

# Report the binary label distribution of every split.
for split_header, split_frame in (
    ("\n训练集二分类标签分布:", train_df),
    ("\n验证集二分类标签分布:", val_df),
    ("\n测试集二分类标签分布:", test_df),
):
    print(split_header)
    print(split_frame['labels'].value_counts().sort_index())

In [None]:
# Balanced class weights computed from the training labels only; they are
# later passed to the loss function as per-class weights.
train_label_values = train_df['labels'].values
class_labels_np = np.unique(train_label_values)  # sorted distinct class ids
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=class_labels_np,
    y=train_label_values,
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float, device=device)
print(f"\n计算得到的二分类类别权重: {class_weights_tensor}")

In [None]:
# Tokenizer loaded from the same pretrained checkpoint name (MODEL_NAME) as the classifier.
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
class OffensivenessDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Sized, position-indexable collection of raw texts.
        labels: Sized, position-indexable collection of class labels, aligned with ``texts``.
        tokenizer: Callable tokenizer; it is invoked as ``tokenizer(text, ...)`` and
            must return a mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every sequence is padded / truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # A silent length mismatch would pair texts with the wrong labels
        # or fail late with an IndexError in the middle of an epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for example ``idx``."""
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # `__call__`, which accepts the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # flatten() drops the leading batch dimension added by return_tensors='pt'
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }
# Wrap every split in a dataset. `.values` hands over plain arrays, so items
# are looked up by position rather than by DataFrame index label.
train_dataset, val_dataset, test_dataset = (
    OffensivenessDataset(
        texts=split_frame['text'].values,
        labels=split_frame['labels'].values,
        tokenizer=tokenizer,
        max_len=MAX_LENGTH,
    )
    for split_frame in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation and test keep their order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader, test_loader = (
    DataLoader(eval_dataset, shuffle=False, batch_size=BATCH_SIZE)
    for eval_dataset in (val_dataset, test_dataset)
)

In [None]:

# Pretrained BERT encoder with a freshly initialised 2-class classification head.
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model.to(device)

# Use PyTorch's AdamW: the implementation shipped with transformers is
# deprecated in favour of torch.optim.AdamW. eps=1e-6 keeps the default
# epsilon of the deprecated implementation (PyTorch's own default is 1e-8).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    eps=1e-6,
)

# Class-weighted cross-entropy, using the weights computed from the training labels.
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)

def train_epoch(model, data_loader, loss_fn, optimizer, device, n_examples):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Model called with ``input_ids`` / ``attention_mask`` keyword
            arguments whose output exposes a ``.logits`` attribute.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss tensor.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a 0-dim double tensor (correct
        predictions divided by ``n_examples``); mean_loss is the average of the
        per-batch losses, or NaN when ``data_loader`` yields no batch.
    """
    model.train()
    losses = []
    # A tensor accumulator keeps `.double()` valid even when the loader is
    # empty (a plain int 0 has no `.double()`).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    for batch in tqdm(data_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        # Labels are deliberately not passed to the model: the loss comes from
        # `loss_fn`, so a loss computed inside the model would be discarded.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        logits = outputs.logits
        loss = loss_fn(logits, labels)
        preds = torch.argmax(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without updating its parameters.

    Args:
        model: Model called with ``input_ids`` / ``attention_mask`` keyword
            arguments whose output exposes a ``.logits`` attribute.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss tensor.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss, all_preds, all_labels)``: accuracy is a 0-dim
        double tensor; mean_loss is the average per-batch loss (NaN when the
        loader yields no batch); all_preds / all_labels are flat lists of the
        predicted and true class ids in iteration order.
    """
    model.eval()
    losses = []
    # A tensor accumulator keeps `.double()` valid even when the loader is
    # empty (a plain int 0 has no `.double()`).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            # Labels are deliberately not passed to the model: the loss comes
            # from `loss_fn`, so a loss computed inside the model would be discarded.
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            logits = outputs.logits
            loss = loss_fn(logits, labels)
            preds = torch.argmax(logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss, all_preds, all_labels

In [None]:
import os

from sklearn.metrics import f1_score

# Start below any achievable macro-F1 so the first epoch always writes a
# checkpoint, even if its score is exactly 0.
best_val_macro_f1 = float("-inf")
best_model_path = "models/bert_seed42.bin"
patience_counter = 0

# torch.save does not create missing parent directories.
os.makedirs(os.path.dirname(best_model_path) or ".", exist_ok=True)

# Per-epoch metrics; a copy of this dict is stored inside every checkpoint.
history = {
    'train_loss': [],
    'train_acc': [],
    'val_loss': [],
    'val_acc': [],
    'val_macro_f1': []
}

print("\nStart training...")
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_loader,
        loss_fn,
        optimizer,
        device,
        len(train_df)
    )
    print(f'Training set Loss: {train_loss:.4f} Accuracy: {train_acc:.4f}')

    val_acc, val_loss, val_preds, val_labels = eval_model(
        model,
        val_loader,
        loss_fn,
        device,
        len(val_df)
    )
    print(f'Validation set Loss: {val_loss:.4f} Accuracy: {val_acc:.4f}')

    # Cast to a built-in float so the checkpoint holds no NumPy scalar objects.
    macro_f1 = float(f1_score(val_labels, val_preds, average='macro'))
    print(f'Validation set Macro F1: {macro_f1:.4f}')

    # Store plain Python floats only (see the cast above).
    history['train_loss'].append(float(train_loss))
    history['train_acc'].append(train_acc.item())
    history['val_loss'].append(float(val_loss))
    history['val_acc'].append(val_acc.item())
    history['val_macro_f1'].append(macro_f1)

    # Model selection and early stopping are both driven by validation macro-F1.
    if macro_f1 > best_val_macro_f1:
        best_val_macro_f1 = macro_f1
        patience_counter = 0
        print(f"New best validation set macro F1: {best_val_macro_f1:.4f}. Save checkpoint to {best_model_path}")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'best_val_macro_f1': best_val_macro_f1,
            'history': history,
            'num_labels': 2,
            'class_weights_tensor': class_weights_tensor,
            'max_length': MAX_LENGTH
        }, best_model_path)
    else:
        patience_counter += 1
        print(f"The validation set macro F1 did not improve. Patience: {patience_counter}/{PATIENCE_LIMIT}")
    if patience_counter >= PATIENCE_LIMIT:
        print("When the early stopping condition is reached, the training stops.")
        break
    print()

print("Training Completed!")

In [None]:
# 11. Plot the training curves
# Only the epochs that actually ran are plotted (early stopping may cut training short).
epochs_trained = len(history['train_loss'])
epoch_axis = range(1, epochs_trained + 1)

fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: loss curves
loss_ax.plot(epoch_axis, history['train_loss'], label='Train Loss')
loss_ax.plot(epoch_axis, history['val_loss'], label='Validation Loss')
loss_ax.set_title('Training and Validation Loss per Epoch')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()
loss_ax.grid(True)

# Right panel: accuracy curves
acc_ax.plot(epoch_axis, history['train_acc'], label='Train Accuracy')
acc_ax.plot(epoch_axis, history['val_acc'], label='Validation Accuracy')
acc_ax.set_title('Training and Validation Accuracy per Epoch')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()
acc_ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Restore the best checkpoint written by the training loop.
# weights_only=False: the checkpoint stores more than tensors (for example the
# training-history dict), which the restricted weights-only unpickler used by
# default in newer PyTorch releases may reject. Full unpickling can execute
# arbitrary code, so only load checkpoint files produced by this notebook.
checkpoint = torch.load('models/bert_seed42.bin', map_location=device, weights_only=False)
model.load_state_dict(checkpoint['model_state_dict'])

# Collect predictions and ground-truth labels over the whole test set.
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Test evaluating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        all_preds.append(preds.cpu().numpy())
        # Labels are only needed for the metrics below, so they never go to the device.
        all_labels.append(batch['labels'].numpy())

all_preds = np.concatenate(all_preds)
all_labels = np.concatenate(all_labels)

# Passing the class ids explicitly keeps target_names aligned with the rows
# and columns even if one class never appears in the labels or predictions.
class_ids = [0, 1]
target_names = ['Non-offensive', 'Offensive']
print("Classification Report on Test Set:")
print(classification_report(all_labels, all_preds, labels=class_ids, target_names=target_names, digits=4))

cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
print("Confusion Matrix:")
print(cm)

plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=target_names, yticklabels=target_names)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix - Test Set')
plt.show()
