In [2]:
!pip install -q transformers 
!pip install -q peft
!pip install -q evaluate

In [3]:
import pandas as pd
df_train = pd.read_csv("/kaggle/input/ag-news-classification-dataset/train.csv")
df_test= pd.read_csv("/kaggle/input/ag-news-classification-dataset/test.csv")

In [4]:
# Splitting into X-train, X_test, y_train, y_test
X_train = df_train['Title'] + " " + df_train['Description'] 
Y_train = df_train['Class Index']

x_test = df_test['Title'] + " " + df_test['Description'] 
y_test = df_test['Class Index']

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples, 
                     padding="max_length", # Pad shorter examples to the same length
                     truncation=True)      # Truncate longer examples to fit the max length
'''
takes a list of text examples as input. 
This function will be responsible for processing the text examples using the loaded tokenizer.
'''

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [6]:
from datasets import Dataset


In [7]:
tokenized_X_train = X_train.apply(tokenize_function)
tokenized_X_test = x_test.apply(tokenize_function)


In [None]:
[len(item['input_ids']) for item in tokenized_X_train[0:]][0]

In [24]:
# Create Dataset objects
train_dataset = Dataset.from_dict({"input_ids": [item['input_ids'] for item in tokenized_X_train[0:]][:],
                                   "attention_mask": [item['attention_mask'] for item in tokenized_X_train[0:]][:],
                                   "labels": Y_train})

eval_dataset = Dataset.from_dict({"input_ids": [item['input_ids'] for item in tokenized_X_test[0:]][:],
                                  "attention_mask": [item['attention_mask'] for item in tokenized_X_test[0:]][:],
                                  "labels": y_test})
"""
This function creates two Dataset objects: train_dataset and eval_dataset.

Args:
    tokenized_X_train (list): A list containing tokenized representations of the training data.
        Each element in the list should be a dictionary with keys 'input_ids' and 'attention_mask' 
        containing the tokenized input IDs and attention masks for a single training example.
    Y_train (list): A list containing the ground truth labels for the training data.
        The length of this list should match the length of tokenized_X_train.
    tokenized_X_test (list): A list containing tokenized representations of the evaluation data.
        Each element in the list should be a dictionary with keys 'input_ids' and 'attention_mask' 
        containing the tokenized input IDs and attention masks for a single evaluation example.
    y_test (list): A list containing the ground truth labels for the evaluation data.
        The length of this list should match the length of tokenized_X_test.

Returns:
    tuple: A tuple containing two Dataset objects:
        - train_dataset (Dataset): The training dataset containing input IDs, attention masks, and labels.
        - eval_dataset (Dataset): The evaluation dataset containing input IDs, attention masks, and labels.
"""

In [25]:
from peft import LoraConfig, TaskType

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, r=1, lora_alpha=1, lora_dropout=0.1
)
"""
This code snippet creates a LoraConfig object for a Sequence Classification (SEQ_CLS) task using the PEFT library.

Args:
    task_type (TaskType): The type of NLP task the configuration is intended for.
        In this case, it is set to TaskType.SEQ_CLS for Sequence Classification.
    r (int, optional): The depth (number of layers) of the LORa module. Defaults to 1.
    lora_alpha (float, optional): The alpha parameter for the LORa activation function. Defaults to 1.
    lora_dropout (float, optional): The dropout rate for the LORa module. Defaults to 0.1.

Returns:
    LoraConfig: A LoraConfig object containing the specified configuration parameters for the PEFT library.
"""

In [35]:
# Load a pre-trained BERT model for sequence classification with 4 output classes
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    'bert-base-cased', 
    num_labels=4
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
from peft import get_peft_model
model = get_peft_model(model, lora_config)

In [37]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

In [38]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [39]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer",  # Output directory for checkpoints etc.
                                  evaluation_strategy="epoch",# Evaluate model performance after each epoch
                                 num_train_epochs=3)          # Train for 3 epochs

In [40]:
# Create a Trainer object for model training and evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [33]:
import os

# Set environment variable to enable device-side assertions
os.environ["TORCH_USE_CUDA_DSA"] = "1"


In [34]:
trainer.train()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
