In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
# This runs at notebook top level, so `dirname`, `_`, `filenames` and `filename`
# stay defined in the kernel namespace after the loop finishes.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# Install the `evaluate` package into the running kernel's environment.
# `%pip` is the explicit magic (bare `pip ...` only works through automagic), and the
# version is pinned to the one recorded in this cell's output so re-runs stay reproducible.
# NOTE(review): the recorded output shows this install replacing pyarrow 19.0.1 with
# 22.0.0 and pip reporting a dependency-resolver error - confirm the other preinstalled
# packages still work with the upgraded pyarrow.
%pip install -q evaluate==0.4.6

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting pyarrow>=21.0.0 (from datasets>=2.0.0->evaluate)
  Downloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: pyarrow, evaluate
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 19.0.1
    Uninstalling pyarrow-19.0.1:
      Successfully uninstalled pyarrow-19.0.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency

In [5]:
import logging
import os
import sys

import datasets
import evaluate
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

# PEFT (LoRA adapters)
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from transformers import BertPreTrainedModel, BertModel
from transformers import set_seed
from transformers.modeling_outputs import SequenceClassifierOutput

# === 修复版 SupConLoss (增加数值稳定性) ===
class SupConLoss(nn.Module):
    """Supervised contrastive loss over a batch of (possibly multi-view) features.

    The loss is computed from dot products between feature vectors divided by
    ``temperature``. This module does NOT normalise the features itself; callers
    that want cosine similarities must L2-normalise before calling.

    Args:
        temperature: Divisor applied to the pairwise dot products.
        contrast_mode: ``'all'`` uses every view of every sample as an anchor;
            ``'one'`` uses only the first view of each sample as an anchor.
        base_temperature: The final loss is scaled by
            ``temperature / base_temperature``.
    """

    def __init__(self, temperature=0.07, contrast_mode='all', base_temperature=0.07):
        super().__init__()
        self.temperature = temperature
        self.contrast_mode = contrast_mode
        self.base_temperature = base_temperature

    def forward(self, features, labels=None, mask=None):
        """Compute the contrastive loss for one batch.

        Args:
            features: Tensor of shape ``[bsz, n_views, ...]``; trailing dimensions
                beyond the third are flattened into one feature dimension.
            labels: Optional tensor with ``bsz`` elements. Samples sharing a label
                are treated as positives of each other.
            mask: Optional ``[bsz, bsz]`` tensor where a non-zero entry ``(i, j)``
                marks sample ``j`` as a positive for sample ``i``. Mutually
                exclusive with ``labels``.

            With neither ``labels`` nor ``mask``, only the other views of the same
            sample count as positives (identity mask).

        Returns:
            A scalar tensor: the mean loss over all anchors. Anchors that have no
            positive in the batch contribute ``0`` to that mean.

        Raises:
            ValueError: if ``features`` has fewer than 3 dimensions, if both
                ``labels`` and ``mask`` are given, if the number of labels does not
                match ``bsz``, or if ``contrast_mode`` is unknown.
        """
        # Build every helper tensor on the device that actually holds `features`.
        # Guessing `torch.device('cuda')` resolves to the *current* CUDA device, which
        # is not necessarily the one the features live on in multi-GPU runs.
        device = features.device

        if features.dim() < 3:
            raise ValueError('`features` needs to be [bsz, n_views, ...], at least 3 dimensions are required')
        if features.dim() > 3:
            # reshape (rather than view) also accepts non-contiguous inputs.
            features = features.reshape(features.shape[0], features.shape[1], -1)

        batch_size = features.shape[0]
        if labels is not None and mask is not None:
            raise ValueError('Cannot define both `labels` and `mask`')
        elif labels is None and mask is None:
            mask = torch.eye(batch_size, dtype=torch.float32, device=device)
        elif labels is not None:
            labels = labels.contiguous().view(-1, 1)
            if labels.shape[0] != batch_size:
                raise ValueError('Num of labels does not match num of features')
            # mask[i, j] == 1 exactly when samples i and j share a label.
            mask = torch.eq(labels, labels.T).float().to(device)
        else:
            mask = mask.float().to(device)

        contrast_count = features.shape[1]
        # Stack the views along the batch axis: [bsz * n_views, dim].
        contrast_feature = torch.cat(torch.unbind(features, dim=1), dim=0)
        if self.contrast_mode == 'one':
            anchor_feature = features[:, 0]
            anchor_count = 1
        elif self.contrast_mode == 'all':
            anchor_feature = contrast_feature
            anchor_count = contrast_count
        else:
            raise ValueError('Unknown mode: {}'.format(self.contrast_mode))

        # Pairwise similarity logits between anchors and all contrast features.
        anchor_dot_contrast = torch.div(torch.matmul(anchor_feature, contrast_feature.T), self.temperature)

        # Subtract the per-row maximum before exponentiating for numerical stability.
        # The maximum is detached so it does not take part in the gradient.
        logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True)
        logits = anchor_dot_contrast - logits_max.detach()

        # Tile the sample-level mask so it lines up with the stacked views.
        mask = mask.repeat(anchor_count, contrast_count)

        # Zero the entry where an anchor is contrasted with itself.
        self_index = torch.arange(batch_size * anchor_count, device=device).view(-1, 1)
        logits_mask = torch.scatter(torch.ones_like(mask), 1, self_index, 0)
        mask = mask * logits_mask

        # Log-probability of each contrast sample given the anchor. The epsilon keeps
        # the log finite when an anchor has no other sample to contrast against.
        exp_logits = torch.exp(logits) * logits_mask
        log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True) + 1e-8)

        # Mean log-likelihood over the positives of each anchor. An anchor without
        # positives has a zero numerator; its denominator is replaced by 1 so the
        # division yields 0 instead of NaN.
        mask_pos_pairs = mask.sum(1)
        mask_pos_pairs = torch.where(mask_pos_pairs < 1e-6, torch.ones_like(mask_pos_pairs), mask_pos_pairs)
        mean_log_prob_pos = (mask * log_prob).sum(1) / mask_pos_pairs

        # Scale by the temperature ratio and average over all anchors.
        loss = - (self.temperature / self.base_temperature) * mean_log_prob_pos
        loss = loss.view(anchor_count, batch_size).mean()
        return loss

# === 修复版 BertScratch (增加归一化) ===
class BertScratch(BertPreTrainedModel):
    """BERT sequence classifier whose training loss adds a supervised contrastive term.

    The classification head is dropout followed by a linear layer on the second
    element of the encoder output. In training mode the loss is
    ``cross_entropy + alpha * SupConLoss``; in eval mode it is cross-entropy only.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config
        # Weight of the contrastive term; kept small so it does not dominate cross-entropy.
        self.alpha = 0.1

        self.bert = BertModel(config)

        # Use the classifier-specific dropout when configured, else the hidden dropout.
        dropout_prob = config.classifier_dropout
        if dropout_prob is None:
            dropout_prob = config.hidden_dropout_prob
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.scl_fct = SupConLoss(temperature=0.07)
        self.post_init()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        """Run the encoder and head; return logits and, when labels are given, the loss.

        Extra keyword arguments are accepted but not forwarded to the encoder.
        """
        encoder_out = self.bert(input_ids, attention_mask, token_type_ids)
        pooled = self.dropout(encoder_out[1])
        logits = self.classifier(pooled)

        total_loss = None
        if labels is not None:
            total_loss = nn.CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))

            if self.training:
                # L2-normalise the pooled features before the contrastive loss: it works
                # on dot products, which can blow up and produce NaN without normalisation.
                # unsqueeze(1) adds the single "view" axis SupConLoss requires.
                single_view = F.normalize(pooled, dim=1).unsqueeze(1)
                total_loss = total_loss + self.alpha * self.scl_fct(single_view, labels)

        return SequenceClassifierOutput(
            loss=total_loss,
            logits=logits,
            hidden_states=encoder_out.hidden_states,
            attentions=encoder_out.attentions,
        )

# === 主程序 ===
if __name__ == '__main__':
    # End-to-end run: load labeledTrainData.tsv / testData.tsv, fine-tune BertScratch
    # with LoRA adapters, and write test predictions to ./result/bert_scl_lora.csv.
    logging.basicConfig(level=logging.INFO)

    # Seed the RNGs before the model is built. Trainer only seeds when it is
    # constructed, which is after the newly initialised weights (e.g. the classifier
    # head reported in the load warning) have already been drawn.
    SEED = 42
    set_seed(SEED)

    # Prefer the Kaggle input datasets; otherwise fall back to a local corpus directory.
    if os.path.exists("/kaggle/input/labeledtraindata-tsv/labeledTrainData.tsv"):
        train_path = "/kaggle/input/labeledtraindata-tsv/labeledTrainData.tsv"
        test_path = "/kaggle/input/testdata-tsv/testData.tsv"
    else:
        train_path = "./corpus/imdb/labeledTrainData.tsv"
        test_path = "./corpus/imdb/testData.tsv"

    try:
        # quoting=3 is csv.QUOTE_NONE: quote characters in the reviews are kept as data.
        train_df = pd.read_csv(train_path, header=0, delimiter="\t", quoting=3)
        test_df = pd.read_csv(test_path, header=0, delimiter="\t", quoting=3)
    except FileNotFoundError as err:
        # Report the file that is actually missing (train or test), not always train_path.
        print(f"Error: Data file not found at {err.filename or err}")
        sys.exit(1)

    # Hold out 20% of the labelled data for validation.
    train_df, val_df = train_test_split(train_df, test_size=.2, random_state=SEED)

    train_ds = datasets.Dataset.from_dict({'label': train_df["sentiment"], 'text': train_df['review']})
    val_ds = datasets.Dataset.from_dict({'label': val_df["sentiment"], 'text': val_df['review']})
    test_ds = datasets.Dataset.from_dict({"text": test_df['review']})

    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

    def preprocess_function(examples):
        """Tokenize a batch of texts, truncating to 512 tokens; padding is left to the collator."""
        return tokenizer(examples['text'], truncation=True, padding=False, max_length=512)

    tokenized_train = train_ds.map(preprocess_function, batched=True, remove_columns=["text"])
    tokenized_val = val_ds.map(preprocess_function, batched=True, remove_columns=["text"])
    tokenized_test = test_ds.map(preprocess_function, batched=True, remove_columns=["text"])

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    model = BertScratch.from_pretrained('bert-base-uncased')

    peft_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        inference_mode=False,
        r=8,  # slightly lower rank to reduce compute
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["query", "value"],
        modules_to_save=["classifier"]
    )

    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    metric = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        """Return accuracy of the argmax predictions against the reference labels."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    training_args = TrainingArguments(
        output_dir='./checkpoint_scl_lora',
        num_train_epochs=3,
        per_device_train_batch_size=16,  # if training is still unstable, try lowering to 8
        per_device_eval_batch_size=16,
        # Raising this (e.g. to 2 or 4) enlarges the effective batch per optimizer step if
        # memory is tight. It does not add in-batch positives for the contrastive term,
        # which BertScratch computes inside each forward pass on one micro-batch.
        gradient_accumulation_steps=1,
        warmup_steps=500,
        learning_rate=1e-4,  # slightly lower LR for stability (was 2e-4)
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=50,
        save_strategy="epoch",
        eval_strategy="epoch",
        report_to="none",
        seed=SEED,
        # NOTE(review): presumably needed because the PEFT wrapper hides the model's
        # forward signature from Trainer's column/label detection - confirm.
        remove_unused_columns=False,
        label_names=["labels"]
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        # `tokenizer=` is deprecated in Trainer; `processing_class=` is its replacement.
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    prediction_outputs = trainer.predict(tokenized_test)
    test_pred = np.argmax(prediction_outputs[0], axis=-1).flatten()

    # exist_ok avoids the separate existence check and the race it implies.
    os.makedirs("./result", exist_ok=True)

    result_output = pd.DataFrame(data={"id": test_df["id"], "sentiment": test_pred})
    result_output.to_csv("./result/bert_scl_lora.csv", index=False, quoting=3)
    logging.info('Result saved to ./result/bert_scl_lora.csv')

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Some weights of BertScratch were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 296,450 || all params: 109,780,228 || trainable%: 0.2700


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5179,0.263315,0.8994
2,0.4713,0.226413,0.913
3,0.4589,0.223529,0.9156


