In [1]:
!pip install -U transformers==4.47.0 datasets accelerate


Collecting transformers==4.47.0
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (fr

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EvalPrediction,
    EarlyStoppingCallback
)
from datasets import Dataset, DatasetDict

# ========== 1. Read dataset ==========
def read_dataset(filepath):
    return pd.read_csv(filepath, on_bad_lines='skip')

# ========== 2. One-hot encode labels ==========
def convert_labels_to_one_hot(dataset, label_column, separator=';'):
    dataset[label_column] = dataset[label_column].astype(str).fillna('')
    labels_list = dataset[label_column].apply(lambda x: [label.strip() for label in x.split(separator) if label.strip()])
    mlb = MultiLabelBinarizer()
    one_hot_labels = mlb.fit_transform(labels_list)
    one_hot_df = pd.DataFrame(one_hot_labels, columns=mlb.classes_)
    return one_hot_df, mlb

# ========== 3. Load data ==========
train_df = read_dataset("/kaggle/input/data-cv/train.csv")
val_df = read_dataset("/kaggle/input/data-cv/val.csv")
test_df = read_dataset("/kaggle/input/data-cv/test.csv")

train_one_hot, mlb = convert_labels_to_one_hot(train_df, "label")
val_one_hot, _ = convert_labels_to_one_hot(val_df, "label")
test_one_hot, _ = convert_labels_to_one_hot(test_df, "label")

train_df = pd.concat([train_df[["text"]], train_one_hot], axis=1)
val_df = pd.concat([val_df[["text"]], val_one_hot], axis=1)
test_df = pd.concat([test_df[["text"]], test_one_hot], axis=1)

# ========== 4. Label mapping ==========
labels = mlb.classes_.tolist()
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}

# ========== 5. Convert to HF datasets ==========
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)
dataset = DatasetDict({'train': train_dataset, 'val': val_dataset, 'test': test_dataset})

# ========== 6. Tokenizer ==========
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

def preprocess_data(examples):
    text = examples["text"]
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=512)
    labels_batch = {k: examples[k] for k in examples if k in labels}
    labels_matrix = np.zeros((len(text), len(labels)))
    for idx, label in enumerate(labels):
        labels_matrix[:, idx] = labels_batch[label]
    encoding["labels"] = labels_matrix.tolist()
    return encoding

encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset["train"].column_names)
encoded_dataset.set_format("torch")

# ========== 7. pos_weight ==========
label_counts = train_one_hot.sum(axis=0)
total_counts = len(train_one_hot)
pos_weight_np = (total_counts - label_counts) / (label_counts + 1e-6)
pos_weight = torch.tensor(pos_weight_np.values, dtype=torch.float32)

# ========== 8. Load model ==========
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-base",
    problem_type="multi_label_classification",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

# ========== 9. CustomTrainer ==========
class CustomTrainer(Trainer):
    def __init__(self, *args, pos_weight=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.pos_weight = pos_weight
        self.logs = []  # Store logs for each epoch

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fn = nn.BCEWithLogitsLoss(pos_weight=self.pos_weight.to(logits.device))
        loss = loss_fn(logits, labels.float())  
        return (loss, outputs) if return_outputs else loss

    def evaluate(self, eval_dataset=None, **kwargs):
        results = super().evaluate(eval_dataset=eval_dataset, **kwargs)

        # Extract the specific metrics
        eval_f1 = results.get('eval_f1', None)
        eval_loss = results.get('eval_loss', None)
        eval_roc_auc = results.get('eval_roc_auc', None)
        eval_accuracy = results.get('eval_accuracy', None)

        # Log results for each epoch
        self.logs.append({
            'Epoch': self.state.epoch,
            'Training Loss': self.state.log_history[-1].get('loss', 0),
            'Validation Loss': eval_loss,
            'F1': eval_f1,
            'Roc Auc': eval_roc_auc,
            'Accuracy': eval_accuracy,
        })

        # Save logs after each epoch
        self.save_logs()

        return results

    def save_logs(self, output_file="training_logs.csv"):
        # Save logs to a CSV file
        logs_df = pd.DataFrame(self.logs)
        logs_df.to_csv(output_file, index=False)

# ========== 10. Training arguments ==========
training_args = TrainingArguments(
    output_dir="./deberta-v3-base-model",
    evaluation_strategy="epoch",           # Evaluate at each epoch
    save_strategy="epoch",                # Save the model at each epoch
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,                 
    weight_decay=0.01,
    load_best_model_at_end=True,          # Load the best model at the end of training
    metric_for_best_model="f1",           # Use F1 score to find the best model
    report_to="none"                     # Disable reporting
)

# ========== 11. Metrics ==========
def multi_label_metrics(preds, labels, threshold=0.5):
    # No need for sigmoid here as preds are already probabilities
    y_pred = np.zeros_like(preds)
    y_pred[preds >= threshold] = 1
    return {
        'f1': f1_score(labels, y_pred, average='micro'),
        'roc_auc': roc_auc_score(labels, y_pred, average='micro'),
        'accuracy': accuracy_score(labels, y_pred)
    }

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    return multi_label_metrics(preds, p.label_ids)

# ========== 12. Trainer ==========
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    pos_weight=pos_weight,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Stop early if no improvement
)

# ========== 13. Train & Evaluate ==========
trainer.train()

# Save logs to CSV
trainer.save_logs("training_logs.csv")

# ========== 14. Inference ==========
text = "I have 3 years experience in working with Python and write softwares for customers."
encoding = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
encoding = {k: v.to(model.device) for k, v in encoding.items()}

with torch.no_grad():
    outputs = model(**encoding)
logits = outputs.logits
sigmoid = torch.nn.Sigmoid()  # You need this only during inference, not for training or metrics
probs = sigmoid(logits.squeeze().cpu())
predictions = np.zeros_like(probs)
predictions[probs >= 0.5] = 1
predicted_labels = [id2label[i] for i, val in enumerate(predictions) if val == 1]

print("Predicted labels:", predicted_labels)


2025-05-14 11:26:13.344888: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747221973.539923      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747221973.593213      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/22615 [00:00<?, ? examples/s]

Map:   0%|          | 0/2855 [00:00<?, ? examples/s]

Map:   0%|          | 0/2825 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)
Error during conversion: ChunkedEncodingError(ProtocolError('Response ended prematurely'))


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.3456,0.299033,0.888069,0.932997,0.698074
2,0.2722,0.261012,0.89972,0.94141,0.725744
3,0.2583,0.27496,0.902338,0.944695,0.728897
4,0.2195,0.252136,0.908633,0.946591,0.745359
5,0.2143,0.252054,0.912567,0.948118,0.76042
6,0.1844,0.24945,0.90538,0.949523,0.741856
7,0.177,0.262951,0.914523,0.950536,0.766025
8,0.1533,0.253831,0.904681,0.951355,0.740105
9,0.1491,0.269238,0.911066,0.950345,0.75972


model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Predicted labels: ['Python_Developer', 'Software_Developer']
