In [1]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from datasets import load_dataset
from evaluate import load

In [4]:
model_path = 'distilbert-base-uncased'
model_location = "/home/kamal/.cache/huggingface/hub/models--roberta-large/snapshots/716877d372b884cad6d419d828bac6c85b3b18d9/"
datasets = "ccdv/patent-classification"
trial_data = "cola"

In [3]:
cola = load_dataset('glue', trial_data)

In [5]:
cola

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

In [6]:
cola['train'].features['label'].names

['unacceptable', 'acceptable']

In [7]:
from transformers import AutoConfig

model_config = AutoConfig.from_pretrained(model_path)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [8]:
model_config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.37.2",
  "vocab_size": 30522
}

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_path)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [9]:
tokenizer.model_max_lengthls 

512

In [10]:
tokened_data = tokenizer(["This is a test",
                          "There is more to the logic than I know."],
                        truncation=True, padding=True,
                        max_length=8)
tokened_data

{'input_ids': [[101, 2023, 2003, 1037, 3231, 102, 0, 0], [101, 2045, 2003, 2062, 2000, 1996, 7961, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1]]}

In [11]:
def process(row):
    return tokenizer(row['sentence'],truncation=True,
                    padding=True, max_length=512)

In [12]:
classes = 2
class2id = {"acceptable":1, "unacceptable":0}
id2class = {1:"acceptable", 0:"unacceptable"}

In [13]:
model_wt = AutoModelForSequenceClassification.from_pretrained(
    model_path, num_labels=2, id2label=id2class, label2id=class2id
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
cola_tokenised = cola.map(process, batched=True)

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

In [15]:
cola_tokenised

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1063
    })
})

In [16]:
from datasets import load_metric
cola_metric = load_metric("glue", "cola")

  cola_metric = load_metric("glue", "cola")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [17]:
def compute_metric(eval_pred):
    pred, refs = eval_pred
    predictions = np.argmax(pred, axis=1)
    return cola_metric.compute(predictions=predictions,
                         references=refs)

In [18]:
targs = TrainingArguments(
    output_dir="/home/kamal/training_files/cola",
    num_train_epochs=2,
    per_device_eval_batch_size=4,
    per_device_train_batch_size=4,
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
    report_to="none"
)

In [19]:
from transformers import DataCollatorWithPadding
datacol = DataCollatorWithPadding(tokenizer=tokenizer)

In [22]:
trainer = Trainer(
    model=model_wt,
    args=targs,
    train_dataset=cola_tokenised['train'],
    eval_dataset=cola_tokenised['test'],
    tokenizer=tokenizer,
)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [21]:
trainer.train()

Step,Training Loss,Validation Loss


../aten/src/ATen/native/cuda/Loss.cu:240: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [0,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:240: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [1,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:240: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [2,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:240: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [3,0,0] Assertion `t >= 0 && t < n_classes` failed.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
