In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dataset-1/train.csv


In [2]:
import torch
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, classification_report
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import Dataset, ClassLabel
from torch.utils.data import DataLoader, WeightedRandomSampler
import gc

2025-05-11 11:09:46.591145: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746961786.803884      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746961786.864918      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# Configuration 
class CFG:
    model_name = "bert-base-uncased"
    max_length = 256
    batch_size = 32
    num_epochs = 4
    learning_rate = 2e-5
    seed = 42
    text_col = "text"
    label_col = "labels"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    mixed_precision = True
    gamma = 2.0  # Focal loss parameter
    label_smoothing = 0.1  # Reduces overconfidence

In [31]:
# Data Preparation
df = pd.read_csv("/kaggle/input/dataset-1/train.csv")
df = df.rename(columns={"Category": CFG.label_col, "Text": CFG.text_col})

In [32]:
# Remove duplicates based on text only (optional: keep first or use groupby)
df = df.drop_duplicates(subset=CFG.text_col).reset_index(drop=True)

In [33]:
# Convert to Dataset
dataset = Dataset.from_pandas(df)
class_labels = ClassLabel(
    num_classes=len(df[CFG.label_col].unique()),
    names=list(df[CFG.label_col].unique())
)
dataset = dataset.cast_column(CFG.label_col, class_labels)

Casting the dataset:   0%|          | 0/12085 [00:00<?, ? examples/s]

In [34]:
# Stratified split
splits = dataset.train_test_split(test_size=0.2, stratify_by_column=CFG.label_col, seed=CFG.seed)
train_val = splits["train"].train_test_split(test_size=0.125, stratify_by_column=CFG.label_col, seed=CFG.seed)
dataset = {
    "train": train_val["train"],
    "valid": train_val["test"],
    "test": splits["test"]
}

In [37]:
# Tokenization
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)
tokenized_datasets = {
    k: v.map(
        lambda x: tokenizer(x["text"], truncation=True, max_length=CFG.max_length, padding=False),
        batched=True,
        remove_columns=["text"]
    ) for k, v in dataset.items()
}

Map:   0%|          | 0/8459 [00:00<?, ? examples/s]

Map:   0%|          | 0/1209 [00:00<?, ? examples/s]

Map:   0%|          | 0/2417 [00:00<?, ? examples/s]

In [38]:
# Adaptive Class Balancing
train_labels = np.array(dataset["train"][CFG.label_col])
class_counts = np.bincount(train_labels)
class_weights = 1. / (class_counts + 1e-6)  # Add smoothing
sample_weights = class_weights[train_labels]

In [39]:
# Create balanced sampler
sampler = WeightedRandomSampler(
    sample_weights, 
    num_samples=len(sample_weights),
    replacement=True
)

In [40]:
# Model Initialization
model = AutoModelForSequenceClassification.from_pretrained(
    CFG.model_name,
    num_labels=len(class_labels.names),
    ignore_mismatched_sizes=True
).to(CFG.device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
# Custom Trainer with Focal Loss + Label Smoothing
class AdaptiveTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        
        # Calculate cross entropy loss
        ce_loss = torch.nn.functional.cross_entropy(
            logits.view(-1, model.config.num_labels),
            labels.view(-1),
            reduction='none'
        )
        
        # Focal loss component
        pt = torch.exp(-ce_loss)
        focal_loss = (1 - pt) ** CFG.gamma * ce_loss
        
        # Label smoothing regularization
        smooth_loss = (1 - CFG.label_smoothing) * focal_loss.mean() + \
                     CFG.label_smoothing * (-torch.log_softmax(logits, dim=-1).mean())
        
        return (smooth_loss, outputs) if return_outputs else smooth_loss

In [42]:
# Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=CFG.learning_rate,
    per_device_train_batch_size=CFG.batch_size,
    per_device_eval_batch_size=CFG.batch_size*2,
    num_train_epochs=CFG.num_epochs,
    weight_decay=0.01,
    logging_steps=50,
    fp16=CFG.mixed_precision,
    gradient_accumulation_steps=2,
    warmup_ratio=0.1,
    save_strategy="no",
    load_best_model_at_end=False,
    report_to="none",
    optim="adamw_torch",
    seed=CFG.seed,
    max_grad_norm=1.0  # Gradient clipping
)

In [43]:
# Data Collator
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="longest",
    max_length=CFG.max_length
)

In [44]:
# Initialize Trainer
trainer = AdaptiveTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

  trainer = AdaptiveTrainer(


In [45]:
# Add balanced sampling
trainer.train_dataset = trainer.train_dataset.with_format("torch")
trainer.train_dataloader = DataLoader(
    trainer.train_dataset,
    batch_size=CFG.batch_size,
    sampler=sampler,
    collate_fn=data_collator
)

In [46]:
# Clean memory
gc.collect()
torch.cuda.empty_cache()

In [47]:
# Start training
trainer.train()

Epoch,Training Loss,Validation Loss
1,3.0843,2.112372
2,1.5219,1.197316
3,0.9812,0.947444


TrainOutput(global_step=528, training_loss=1.7470247745513916, metrics={'train_runtime': 784.4483, 'train_samples_per_second': 43.134, 'train_steps_per_second': 0.673, 'total_flos': 4426235859681792.0, 'train_loss': 1.7470247745513916, 'epoch': 3.9735849056603776})

In [48]:
# Evaluation
def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    return {
        "macro_f1": f1_score(p.label_ids, preds, average="macro"),
        "weighted_f1": f1_score(p.label_ids, preds, average="weighted")
    }

trainer.compute_metrics = compute_metrics

In [49]:
# Validation results
valid_results = trainer.evaluate()
print("\nValidation Results:")
print(f"Macro F1: {valid_results['eval_macro_f1']:.4f}")
print(f"Weighted F1: {valid_results['eval_weighted_f1']:.4f}")


Validation Results:
Macro F1: 0.8461
Weighted F1: 0.8608


In [50]:
# Test results
test_results = trainer.predict(tokenized_datasets["test"])
print("\nTest Classification Report:")
print(classification_report(
    test_results.label_ids,
    test_results.predictions.argmax(-1),
    target_names=class_labels.names
))


Test Classification Report:
                           precision    recall  f1-score   support

               Accountant       0.93      0.99      0.96        67
                 Advocate       0.92      0.96      0.94        56
              Agriculture       0.89      0.87      0.88        45
                  Apparel       0.81      0.81      0.81        63
             Architecture       0.86      0.82      0.84        61
                     Arts       0.79      0.84      0.82        50
               Automobile       0.80      0.58      0.67        60
                 Aviation       0.96      0.98      0.97        65
                  Banking       0.95      0.91      0.93        58
               Blockchain       1.00      1.00      1.00         9
                      BPO       0.85      0.56      0.68        39
Building and Construction       0.78      0.87      0.82        67
         Business Analyst       0.89      0.90      0.90        62
           Civil Engineer       