#  00 preparation

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, EarlyStoppingCallback
)
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score
import os
import warnings
import shutil
import gc
warnings.filterwarnings('ignore')

# Disable wandb logging
os.environ["WANDB_DISABLED"] = "true"

# Function to clean up disk space
def cleanup_disk_space():
    """Clean up unnecessary files to save disk space"""
    print("Cleaning up disk space...")
    
    # Remove cache directories
    cache_dirs = [
        '/tmp',
        '/kaggle/working/.cache',
        '/root/.cache'
    ]
    
    for cache_dir in cache_dirs:
        if os.path.exists(cache_dir):
            try:
                for item in os.listdir(cache_dir):
                    item_path = os.path.join(cache_dir, item)
                    if os.path.isdir(item_path):
                        shutil.rmtree(item_path, ignore_errors=True)
                    else:
                        os.remove(item_path)
            except:
                pass
    
    # Force garbage collection
    gc.collect()
    torch.cuda.empty_cache() if torch.cuda.is_available() else None
    
    print("Disk cleanup completed!")

def check_disk_space():
    """Check available disk space"""
    statvfs = os.statvfs('/kaggle/working')
    free_space_gb = (statvfs.f_frsize * statvfs.f_bavail) / (1024**3)
    print(f"Available disk space: {free_space_gb:.2f} GB")
    return free_space_gb

# Check GPU availability and disk space
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Initial disk space check
check_disk_space()
cleanup_disk_space()
check_disk_space()

# Load dataset
print("Loading dataset...")
df = pd.read_csv('/kaggle/input/taiwan-political-news-dataset/news_training_with_translations.csv')
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"Label distribution:\n{df['label_encoded'].value_counts()}")

2025-06-02 18:13:52.762934: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748888032.926520      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748888032.975514      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using device: cuda
Available disk space: 19.50 GB
Cleaning up disk space...
Disk cleanup completed!
Available disk space: 19.50 GB
Loading dataset...
Dataset shape: (3166, 8)
Columns: ['id', 'media_name', 'title', 'content', 'label', 'label_encoded', 'title_en', 'content_en']
Label distribution:
label_encoded
1    1493
2     981
0     692
Name: count, dtype: int64


# 01 Fine-tune POLITICS model on English data

In [2]:
class PoliticsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def train_politics_model(texts, labels, model_name, save_path, max_length=512):
    """Fine-tune POLITICS model"""
    print(f"\nTraining POLITICS model: {model_name}")
    print(f"Max length: {max_length}")
    
    # Check disk space before training
    if check_disk_space() < 2.0:  # Less than 2GB
        cleanup_disk_space()
    
    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained('launch/POLITICS')
    model = AutoModelForSequenceClassification.from_pretrained(
        'launch/POLITICS',
        num_labels=len(np.unique(labels))
    )
    
    # Create dataset
    dataset = PoliticsDataset(texts, labels, tokenizer, max_length)
    
    # Training arguments - optimized for space
    training_args = TrainingArguments(
        output_dir=f'./politics_{model_name}',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=f'./logs_{model_name}',
        logging_steps=100,
        save_strategy="no",  # Don't save intermediate checkpoints
        eval_strategy="no",  # No validation set as requested
        load_best_model_at_end=False,
        dataloader_pin_memory=False,
        report_to=None,  # Disable wandb and other logging
        save_total_limit=1,  # Only keep the final model
    )
    
    # Create trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        tokenizer=tokenizer,
    )
    
    # Train model
    print("Starting training...")
    trainer.train()
    
    # Save model (only the final one)
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    
    # Clean up trainer and temporary files
    del trainer
    cleanup_disk_space()
    
    print(f"Model saved to {save_path}")
    return model, tokenizer

# Train POLITICS models
print("\n" + "="*50)
print("PHASE 1: Fine-tuning POLITICS models")
print("="*50)

# Model 1: Title only
politics_title_model, politics_title_tokenizer = train_politics_model(
    texts=df['title_en'].fillna(''),
    labels=df['label_encoded'].values,
    model_name='title',
    save_path='./politics_title_model',
    max_length=128
)

# Model 2: Title + Content
combined_texts = (df['title_en'].fillna('') + ' ' + df['content_en'].fillna('')).str.strip()
politics_combined_model, politics_combined_tokenizer = train_politics_model(
    texts=combined_texts,
    labels=df['label_encoded'].values,
    model_name='combined',
    save_path='./politics_combined_model',
    max_length=512
)


PHASE 1: Fine-tuning POLITICS models

Training POLITICS model: title
Max length: 128
Available disk space: 19.50 GB


tokenizer_config.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/672 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at launch/POLITICS and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Starting training...


Step,Training Loss
100,1.0483
200,0.9901
300,0.942
400,0.8692
500,0.7964
600,0.7834
700,0.7577
800,0.7745
900,0.5146
1000,0.4899


Cleaning up disk space...
Disk cleanup completed!
Model saved to ./politics_title_model

Training POLITICS model: combined
Max length: 512
Available disk space: 19.03 GB


tokenizer_config.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/672 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at launch/POLITICS and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Starting training...


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Step,Training Loss
100,1.0372
200,0.8958
300,0.7793
400,0.7625
500,0.6677
600,0.6087
700,0.5913
800,0.5767
900,0.3543
1000,0.3261


Cleaning up disk space...
Disk cleanup completed!
Model saved to ./politics_combined_model


# 02 Generate pseudo-labels using fine-tuned POLITICS models

In [3]:
def generate_soft_predictions(model, tokenizer, texts, max_length=512, batch_size=16, temperature=3.0):
    """Generate soft predictions (probability distributions) for all texts"""
    model.eval()
    soft_predictions = []
    
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i+batch_size]
            
            # Tokenize batch
            inputs = tokenizer(
                batch_texts,
                truncation=True,
                padding=True,
                max_length=max_length,
                return_tensors='pt'
            )
            
            # Move to device
            inputs = {k: v.to(device) for k, v in inputs.items()}
            
            # Get predictions
            outputs = model(**inputs)
            # Apply temperature scaling and softmax to get soft labels
            soft_labels = torch.softmax(outputs.logits / temperature, dim=-1)
            soft_predictions.extend(soft_labels.cpu().numpy())
    
    return np.array(soft_predictions)

print("\n" + "="*50)
print("PHASE 2: Generating pseudo-labels")
print("="*50)

# Move models to device
politics_title_model.to(device)
politics_combined_model.to(device)

# Generate soft predictions
print("Generating soft predictions with title model...")
title_soft_predictions = generate_soft_predictions(
    politics_title_model, 
    politics_title_tokenizer,
    df['title_en'].fillna('').tolist(),
    max_length=128,
    temperature=3.0  # Temperature for knowledge distillation
)

print("Generating soft predictions with combined model...")
combined_soft_predictions = generate_soft_predictions(
    politics_combined_model,
    politics_combined_tokenizer,
    combined_texts.tolist(),
    max_length=512,
    temperature=3.0  # Temperature for knowledge distillation
)

# Add soft predictions to dataframe (save as strings for CSV compatibility)
print("Saving soft predictions...")
df['politics_title_soft_pred'] = [','.join(map(str, pred)) for pred in title_soft_predictions]
df['politics_combined_soft_pred'] = [','.join(map(str, pred)) for pred in combined_soft_predictions]

print("Soft predictions generated successfully!")


PHASE 2: Generating pseudo-labels
Generating soft predictions with title model...
Generating soft predictions with combined model...
Saving soft predictions...
Soft predictions generated successfully!


# 03. Chinese BERT Dataset Class

In [4]:
class ChineseBertDatasetWithSoftLabels(Dataset):
    def __init__(self, texts, soft_labels, tokenizer, max_length=512):
        self.texts = texts
        self.soft_labels = soft_labels  # Now expects probability distributions
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        soft_label = self.soft_labels[idx]  # This is now a probability distribution
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(soft_label, dtype=torch.float)  # Soft labels as float tensors
        }

def compute_metrics_from_predictions(predictions, ground_truth):
    """Compute comprehensive metrics from predictions and ground truth"""
    
    # If predictions are soft labels (probabilities), convert to hard labels
    if len(predictions.shape) > 1 and predictions.shape[1] > 1:
        pred_labels = np.argmax(predictions, axis=1)
    else:
        pred_labels = predictions
    
    # Convert ground truth to numpy array if needed
    if isinstance(ground_truth, list):
        true_labels = np.array(ground_truth)
    else:
        true_labels = ground_truth
    
    # Compute metrics
    acc = accuracy_score(true_labels, pred_labels)
    f1_macro = f1_score(true_labels, pred_labels, average='macro', zero_division=0)
    f1_weighted = f1_score(true_labels, pred_labels, average='weighted', zero_division=0)
    prec_macro = precision_score(true_labels, pred_labels, average='macro', zero_division=0)
    recall_macro = recall_score(true_labels, pred_labels, average='macro', zero_division=0)
    
    return {
        'accuracy': acc,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'precision_macro': prec_macro,
        'recall_macro': recall_macro
    }
        
def evaluate_model_on_data(model, tokenizer, texts, ground_truth, max_length=512, batch_size=16):
    """Evaluate model on given data and return metrics"""
    model.eval()
    model.to(device)
    predictions = []
    
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i+batch_size]
            
            inputs = tokenizer(
                batch_texts,
                truncation=True,
                padding=True,
                max_length=max_length,
                return_tensors='pt'
            )
            
            inputs = {k: v.to(device) for k, v in inputs.items()}
            
            outputs = model(**inputs)
            # Get probabilities for comprehensive evaluation
            probs = torch.softmax(outputs.logits, dim=-1)
            predictions.extend(probs.cpu().numpy())
    
    predictions = np.array(predictions)
    metrics = compute_metrics_from_predictions(predictions, ground_truth)
    
    return metrics, predictions


# 04. Fine-tune Chinese BERT with 5-fold CV

In [5]:
class KnowledgeDistillationTrainer(Trainer):
    def __init__(self, temperature=3.0, alpha=0.7, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.temperature = temperature
        self.alpha = alpha
        
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Custom loss function for knowledge distillation with soft labels
        """
        labels = inputs.pop("labels")  # These are now soft labels (probability distributions)
        
        # Forward pass
        outputs = model(**inputs)
        logits = outputs.logits
        
        # Apply temperature scaling
        student_probs = torch.softmax(logits / self.temperature, dim=-1)
        
        # Knowledge distillation loss (KL divergence between teacher and student)
        kd_loss = torch.nn.functional.kl_div(
            torch.log_softmax(logits / self.temperature, dim=-1),
            labels,  # Teacher soft labels
            reduction='batchmean'
        ) * (self.temperature ** 2)
        
        loss = kd_loss
        
        return (loss, outputs) if return_outputs else loss

def train_chinese_bert_kfold(texts, soft_labels, model_type, use_content=True, n_splits=5):
    """Train Chinese BERT with 5-fold cross validation using soft labels for knowledge distillation"""
    
    print(f"\n" + "="*60)
    print(f"Training Chinese BERT with Knowledge Distillation - {model_type}")
    print(f"Use content: {use_content}")
    print("="*60)
    
    # Hyperparameters
    max_len = 512 if use_content else 128
    batch_size = 16
    epochs = 5
    lr = 2e-5
    weight_decay = 0.01
    warmup_ratio = 0.1
    dropout = 0.1
    patience = 2
    temperature = 3.0  # Knowledge distillation temperature
    
    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained('ckiplab/bert-base-chinese')
    
    # Convert soft labels from strings back to arrays
    if isinstance(soft_labels[0], str):
        soft_labels_array = np.array([np.fromstring(label, sep=',') for label in soft_labels])
    else:
        soft_labels_array = np.array(soft_labels)
    
    # Create hard labels for stratification (argmax of soft labels)
    hard_labels_for_split = np.argmax(soft_labels_array, axis=1)
    
    # Get ground truth labels for evaluation
    ground_truth = df['label_encoded'].values
    
    # 5-fold cross validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_results = []
    best_fold_model = None
    best_fold_tokenizer = None
    best_score = -1  # Using f1_macro as primary metric
    best_fold_idx = 0
    
    # Store all fold results for CSV
    all_fold_metrics = []
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(texts, hard_labels_for_split)):
        print(f"\nFold {fold + 1}/{n_splits}")
        print("-" * 30)
        
        # Check disk space before each fold
        if check_disk_space() < 1.5:  # Less than 1.5GB
            cleanup_disk_space()
        
        # Split data
        train_texts = [texts[i] for i in train_idx]
        train_soft_labels = soft_labels_array[train_idx]
        val_texts = [texts[i] for i in val_idx]
        val_soft_labels = soft_labels_array[val_idx]
        val_ground_truth = ground_truth[val_idx]
        
        # Create datasets
        train_dataset = ChineseBertDatasetWithSoftLabels(train_texts, train_soft_labels, tokenizer, max_len)
        val_dataset = ChineseBertDatasetWithSoftLabels(val_texts, val_soft_labels, tokenizer, max_len)
        
        # Load model
        model = AutoModelForSequenceClassification.from_pretrained(
            'ckiplab/bert-base-chinese',
            num_labels=soft_labels_array.shape[1],  # Number of classes from soft labels
            hidden_dropout_prob=dropout,
            attention_probs_dropout_prob=dropout
        )
        
        # Training arguments - optimized for space
        training_args = TrainingArguments(
            output_dir=f'./temp_chinese_bert_{model_type}_fold_{fold}',
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            warmup_ratio=warmup_ratio,
            weight_decay=weight_decay,
            learning_rate=lr,
            logging_dir=f'./temp_logs_chinese_{model_type}_fold_{fold}',
            logging_steps=50,
            eval_strategy="epoch",
            save_strategy="epoch",  # 改為 "epoch" 以匹配 eval_strategy
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            save_total_limit=1,  # 只保留最佳檢查點
            dataloader_pin_memory=False,
            report_to=None,  # 禁用 wandb 和其他日誌
        )
        
        # Create trainer with knowledge distillation
        trainer = KnowledgeDistillationTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            temperature=temperature,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=patience)]
        )
        
        # Train model
        print("Training with knowledge distillation...")
        trainer.train()
        
        # Evaluate on validation set using ground truth
        print("Evaluating on validation set...")
        val_metrics, val_predictions = evaluate_model_on_data(
            model, tokenizer, val_texts, val_ground_truth, max_len
        )
        
        # Store fold results
        fold_metrics = {
            'model_type': model_type,
            'fold': fold + 1,
            'accuracy': val_metrics['accuracy'],
            'f1_macro': val_metrics['f1_macro'],
            'f1_weighted': val_metrics['f1_weighted'],
            'precision_macro': val_metrics['precision_macro'],
            'recall_macro': val_metrics['recall_macro']
        }
        all_fold_metrics.append(fold_metrics)
        fold_results.append(val_metrics)
        
        # Check if this is the best model (using f1_macro as primary metric)
        current_score = val_metrics['f1_macro']
        if current_score > best_score:
            best_score = current_score
            best_fold_idx = fold
            # Save best model temporarily
            if best_fold_model is not None:
                del best_fold_model, best_fold_tokenizer
                cleanup_disk_space()
            best_fold_model = model
            best_fold_tokenizer = tokenizer
        
        print(f"Fold {fold + 1} Metrics:")
        print(f"  Accuracy: {val_metrics['accuracy']:.4f}")
        print(f"  F1 Macro: {val_metrics['f1_macro']:.4f}")
        print(f"  F1 Weighted: {val_metrics['f1_weighted']:.4f}")
        print(f"  Precision Macro: {val_metrics['precision_macro']:.4f}")
        print(f"  Recall Macro: {val_metrics['recall_macro']:.4f}")
        
        # Clean up current fold if it's not the best
        if fold != best_fold_idx or current_score <= best_score:
            del trainer, model
            # Remove temporary directories
            temp_dir = f'./temp_chinese_bert_{model_type}_fold_{fold}'
            temp_log_dir = f'./temp_logs_chinese_{model_type}_fold_{fold}'
            if os.path.exists(temp_dir):
                shutil.rmtree(temp_dir, ignore_errors=True)
            if os.path.exists(temp_log_dir):
                shutil.rmtree(temp_log_dir, ignore_errors=True)
            cleanup_disk_space()
    
    # Save fold results to CSV
    fold_results_df = pd.DataFrame(all_fold_metrics)
    csv_filename = f'fold_results_{model_type}.csv'
    fold_results_df.to_csv(csv_filename, index=False)
    print(f"\nFold results saved to: {csv_filename}")
    
    # Save only the best model
    best_model_path = f'./chinese_bert_{model_type}_fold_{best_fold_idx}_best'
    best_fold_model.save_pretrained(best_model_path)
    best_fold_tokenizer.save_pretrained(best_model_path)
    
    # Calculate and print average results
    avg_metrics = {
        'accuracy': np.mean([r['accuracy'] for r in fold_results]),
        'f1_macro': np.mean([r['f1_macro'] for r in fold_results]),
        'f1_weighted': np.mean([r['f1_weighted'] for r in fold_results]),
        'precision_macro': np.mean([r['precision_macro'] for r in fold_results]),
        'recall_macro': np.mean([r['recall_macro'] for r in fold_results])
    }
    
    std_metrics = {
        'accuracy': np.std([r['accuracy'] for r in fold_results]),
        'f1_macro': np.std([r['f1_macro'] for r in fold_results]),
        'f1_weighted': np.std([r['f1_weighted'] for r in fold_results]),
        'precision_macro': np.std([r['precision_macro'] for r in fold_results]),
        'recall_macro': np.std([r['recall_macro'] for r in fold_results])
    }
    
    print(f"\n5-Fold CV Results for {model_type} (Knowledge Distillation):")
    print(f"Accuracy: {avg_metrics['accuracy']:.4f} ± {std_metrics['accuracy']:.4f}")
    print(f"F1 Macro: {avg_metrics['f1_macro']:.4f} ± {std_metrics['f1_macro']:.4f}")
    print(f"F1 Weighted: {avg_metrics['f1_weighted']:.4f} ± {std_metrics['f1_weighted']:.4f}")
    print(f"Precision Macro: {avg_metrics['precision_macro']:.4f} ± {std_metrics['precision_macro']:.4f}")
    print(f"Recall Macro: {avg_metrics['recall_macro']:.4f} ± {std_metrics['recall_macro']:.4f}")
    print(f"Best model from fold {best_fold_idx + 1} with F1 Macro: {best_score:.4f}")
    
    return fold_results, best_fold_idx, avg_metrics

# Train Chinese BERT models with knowledge distillation using soft labels

# Model 1: Title only (using POLITICS title soft predictions as labels)
chinese_title_texts = df['title'].fillna('').tolist()
chinese_title_soft_labels = df['politics_title_soft_pred'].tolist()

title_cv_results, best_title_fold, title_avg_metrics = train_chinese_bert_kfold(
    texts=chinese_title_texts,
    soft_labels=chinese_title_soft_labels,
    model_type='title',
    use_content=False
)

# Clean up before next model
cleanup_disk_space()

# Model 2: Title + Content (using POLITICS combined soft predictions as labels)
chinese_combined_texts = (df['title'].fillna('') + ' ' + df['content'].fillna('')).str.strip().tolist()
chinese_combined_soft_labels = df['politics_combined_soft_pred'].tolist()

combined_cv_results, best_combined_fold, combined_avg_metrics = train_chinese_bert_kfold(
    texts=chinese_combined_texts,
    soft_labels=chinese_combined_soft_labels,
    model_type='combined',
    use_content=True
)


Training Chinese BERT with Knowledge Distillation - title
Use content: False


tokenizer_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]


Fold 1/5
------------------------------
Available disk space: 18.56 GB


pytorch_model.bin:   0%|          | 0.00/409M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ckiplab/bert-base-chinese and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training with knowledge distillation...


model.safetensors:   0%|          | 0.00/409M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss
1,1.1053,0.995015
2,0.7494,0.805822
3,0.5143,0.756765
4,0.2644,0.758165
5,0.2023,0.746664


Evaluating on validation set...
Fold 1 Metrics:
  Accuracy: 0.7429
  F1 Macro: 0.7301
  F1 Weighted: 0.7421
  Precision Macro: 0.7345
  Recall Macro: 0.7264
Cleaning up disk space...
Disk cleanup completed!

Fold 2/5
------------------------------
Available disk space: 18.56 GB


config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/409M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ckiplab/bert-base-chinese and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training with knowledge distillation...


model.safetensors:   0%|          | 0.00/409M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss
1,1.1042,1.0367
2,0.661,0.786768
3,0.4962,0.747763
4,0.2769,0.805831
5,0.1961,0.713224


Evaluating on validation set...
Fold 2 Metrics:
  Accuracy: 0.7425
  F1 Macro: 0.7292
  F1 Weighted: 0.7416
  Precision Macro: 0.7345
  Recall Macro: 0.7249
Cleaning up disk space...
Disk cleanup completed!

Fold 3/5
------------------------------
Available disk space: 18.56 GB


config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/409M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ckiplab/bert-base-chinese and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training with knowledge distillation...


model.safetensors:   0%|          | 0.00/409M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss
1,1.1181,1.047374
2,0.7736,0.81236
3,0.4868,0.720573
4,0.335,0.715227
5,0.2192,0.725695


Evaluating on validation set...
Fold 3 Metrics:
  Accuracy: 0.7141
  F1 Macro: 0.7017
  F1 Weighted: 0.7150
  Precision Macro: 0.6980
  Recall Macro: 0.7064
Cleaning up disk space...
Disk cleanup completed!

Fold 4/5
------------------------------
Available disk space: 18.56 GB


config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/409M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ckiplab/bert-base-chinese and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training with knowledge distillation...


Epoch,Training Loss,Validation Loss
1,1.1224,1.053741
2,0.7582,0.919142
3,0.4355,0.837508
4,0.3156,0.860793
5,0.2249,0.823593


model.safetensors:   0%|          | 0.00/409M [00:00<?, ?B/s]

Evaluating on validation set...
Fold 4 Metrics:
  Accuracy: 0.7125
  F1 Macro: 0.6929
  F1 Weighted: 0.7103
  Precision Macro: 0.7049
  Recall Macro: 0.6852
Cleaning up disk space...
Disk cleanup completed!

Fold 5/5
------------------------------
Available disk space: 18.56 GB


config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/409M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ckiplab/bert-base-chinese and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training with knowledge distillation...


model.safetensors:   0%|          | 0.00/409M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss
1,1.1857,1.071589
2,0.7909,0.971369
3,0.4336,0.851139
4,0.2938,0.835058
5,0.2176,0.840255


Evaluating on validation set...
Fold 5 Metrics:
  Accuracy: 0.7046
  F1 Macro: 0.6902
  F1 Weighted: 0.7033
  Precision Macro: 0.7028
  Recall Macro: 0.6821
Cleaning up disk space...
Disk cleanup completed!

Fold results saved to: fold_results_title.csv

5-Fold CV Results for title (Knowledge Distillation):
Accuracy: 0.7233 ± 0.0162
F1 Macro: 0.7088 ± 0.0174
F1 Weighted: 0.7224 ± 0.0163
Precision Macro: 0.7149 ± 0.0161
Recall Macro: 0.7050 ± 0.0188
Best model from fold 1 with F1 Macro: 0.7301
Cleaning up disk space...
Disk cleanup completed!

Training Chinese BERT with Knowledge Distillation - combined
Use content: True


tokenizer_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]


Fold 1/5
------------------------------
Available disk space: 18.18 GB


pytorch_model.bin:   0%|          | 0.00/409M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ckiplab/bert-base-chinese and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training with knowledge distillation...


model.safetensors:   0%|          | 0.00/409M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss
1,2.25,1.907362
2,1.438,1.540704
3,0.8515,1.414811
4,0.4286,1.694902
5,0.25,1.642087


Evaluating on validation set...
Fold 1 Metrics:
  Accuracy: 0.7729
  F1 Macro: 0.7617
  F1 Weighted: 0.7734
  Precision Macro: 0.7577
  Recall Macro: 0.7667
Cleaning up disk space...
Disk cleanup completed!

Fold 2/5
------------------------------
Available disk space: 18.18 GB


config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/409M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ckiplab/bert-base-chinese and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training with knowledge distillation...


model.safetensors:   0%|          | 0.00/409M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss
1,2.411,2.13983
2,1.6216,1.827537
3,1.0315,1.608924
4,0.5382,1.785138
5,0.3736,1.663911


Evaluating on validation set...
Fold 2 Metrics:
  Accuracy: 0.7567
  F1 Macro: 0.7476
  F1 Weighted: 0.7569
  Precision Macro: 0.7503
  Recall Macro: 0.7475
Cleaning up disk space...
Disk cleanup completed!

Fold 3/5
------------------------------
Available disk space: 18.18 GB


config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/409M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ckiplab/bert-base-chinese and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training with knowledge distillation...


model.safetensors:   0%|          | 0.00/409M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss
1,2.3913,2.029004
2,1.6528,1.521217
3,0.9097,1.541879
4,0.5576,1.595929


Evaluating on validation set...
Fold 3 Metrics:
  Accuracy: 0.7583
  F1 Macro: 0.7433
  F1 Weighted: 0.7586
  Precision Macro: 0.7416
  Recall Macro: 0.7452
Cleaning up disk space...
Disk cleanup completed!

Fold 4/5
------------------------------
Available disk space: 18.18 GB


config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/409M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ckiplab/bert-base-chinese and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training with knowledge distillation...


model.safetensors:   0%|          | 0.00/409M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss
1,2.4187,1.911068
2,1.4645,1.467827
3,0.9226,1.326268
4,0.4574,1.319465
5,0.2432,1.470139


Evaluating on validation set...
Cleaning up disk space...
Disk cleanup completed!
Fold 4 Metrics:
  Accuracy: 0.7962
  F1 Macro: 0.7879
  F1 Weighted: 0.7961
  Precision Macro: 0.7898
  Recall Macro: 0.7862
Cleaning up disk space...
Disk cleanup completed!

Fold 5/5
------------------------------
Available disk space: 18.18 GB


config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/409M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ckiplab/bert-base-chinese and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training with knowledge distillation...


model.safetensors:   0%|          | 0.00/409M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss
1,2.4656,2.036029
2,1.5356,1.539856
3,1.0502,1.458438
4,0.5386,1.469353
5,0.3272,1.613192


Evaluating on validation set...
Fold 5 Metrics:
  Accuracy: 0.7788
  F1 Macro: 0.7639
  F1 Weighted: 0.7788
  Precision Macro: 0.7637
  Recall Macro: 0.7641
Cleaning up disk space...
Disk cleanup completed!

Fold results saved to: fold_results_combined.csv

5-Fold CV Results for combined (Knowledge Distillation):
Accuracy: 0.7726 ± 0.0145
F1 Macro: 0.7609 ± 0.0157
F1 Weighted: 0.7728 ± 0.0144
Precision Macro: 0.7606 ± 0.0164
Recall Macro: 0.7620 ± 0.0149
Best model from fold 4 with F1 Macro: 0.7879


# 05. Generate final predictions and save results

In [6]:
def load_best_fold_model(model_type, best_fold):
    """Load the best performing fold model"""
    model_path = f'./chinese_bert_{model_type}_fold_{best_fold}_best'
    
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    
    return model, tokenizer
    
def generate_final_predictions(model, tokenizer, texts, max_length):
    model.eval()
    model.to(device)
    predictions = []
    confidences = []
    
    with torch.no_grad():
        for i in range(0, len(texts), 16):  # batch_size = 16
            batch_texts = texts[i:i+16]
            
            inputs = tokenizer(
                batch_texts,
                truncation=True,
                padding=True,
                max_length=max_length,
                return_tensors='pt'
            )
            
            inputs = {k: v.to(device) for k, v in inputs.items()}
            
            outputs = model(**inputs)
            probs = torch.softmax(outputs.logits, dim=-1)
            batch_predictions = torch.argmax(outputs.logits, dim=-1)
            batch_confidences = torch.max(probs, dim=-1)[0]
            
            predictions.extend(batch_predictions.cpu().numpy())
            confidences.extend(batch_confidences.cpu().numpy())
    
    return np.array(predictions), np.array(confidences)

print("\n" + "="*50)
print("PHASE 5: Generating final predictions")
print("="*50)

# Load best models
chinese_title_model, chinese_title_tokenizer = load_best_fold_model('title', best_title_fold)
chinese_combined_model, chinese_combined_tokenizer = load_best_fold_model('combined', best_combined_fold)

print(f"Best title model: Fold {best_title_fold}")
print(f"Best combined model: Fold {best_combined_fold}")

# Generate final predictions
print("Generating final predictions with Chinese title model...")
final_title_preds, final_title_conf = generate_final_predictions(
    chinese_title_model,
    chinese_title_tokenizer,
    chinese_title_texts,
    max_length=128
)

print("Generating final predictions with Chinese combined model...")
final_combined_preds, final_combined_conf = generate_final_predictions(
    chinese_combined_model,
    chinese_combined_tokenizer,
    chinese_combined_texts,
    max_length=512
)

# Add final predictions to dataframe
df['chinese_title_pred'] = final_title_preds
df['chinese_title_confidence'] = final_title_conf
df['chinese_combined_pred'] = final_combined_preds
df['chinese_combined_confidence'] = final_combined_conf

# Save final results
output_file = 'political_stance_predictions.csv'
df.to_csv(output_file, index=False)
print(f"\nFinal predictions saved to: {output_file}")


PHASE 5: Generating final predictions
Best title model: Fold 0
Best combined model: Fold 3
Generating final predictions with Chinese title model...
Generating final predictions with Chinese combined model...

Final predictions saved to: political_stance_predictions.csv


# 06. Save all model parameters

In [7]:
print("\n" + "="*50)
print("PHASE 6: Saving model parameters")
print("="*50)

# Save model information
model_info = {
    'politics_title_model': './politics_title_model',
    'politics_combined_model': './politics_combined_model',
    'chinese_title_best_fold': best_title_fold,
    'chinese_combined_best_fold': best_combined_fold,
    'chinese_title_model': f'./chinese_bert_title_fold_{best_title_fold}_best',
    'chinese_combined_model': f'./chinese_bert_combined_fold_{best_combined_fold}_best',
}

# Save model info to file
import json
with open('model_info.json', 'w') as f:
    json.dump(model_info, f, indent=2)

print("Model information saved to: model_info.json")

# Print summary
print("\n" + "="*50)
print("TRAINING SUMMARY")
print("="*50)
print(f"Original dataset size: {len(df)}")
print(f"Number of unique labels: {len(df['label_encoded'].unique())}")
print(f"\nAll models and predictions saved successfully!")
print(f"Final predictions file: {output_file}")

# Display sample predictions
print(f"\nSample predictions:")
print(df[['title', 'label_encoded', 'politics_title_pred', 'chinese_title_pred', 
          'politics_combined_pred', 'chinese_combined_pred']].head())


PHASE 6: Saving model parameters
Model information saved to: model_info.json

TRAINING SUMMARY
Original dataset size: 3166
Number of unique labels: 3

All models and predictions saved successfully!
Final predictions file: political_stance_predictions.csv

Sample predictions:


KeyError: "['politics_title_pred', 'politics_combined_pred'] not in index"