# Generate activation for summary-document pairs

In [1]:
import torch
print(torch.cuda.is_available())  # Should return True if CUDA is enabled
print(torch.__version__)  # Check PyTorch version
print(torch.version.cuda)  # Check CUDA version

True
2.6.0+cu124
12.4


## Load the data

In [2]:
from datasets import Dataset

dataset = Dataset.load_from_disk("backup/xsum_factual_combined_with_gold")
print(dataset)

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['id', 'doc', 'summary', 'is_factual'],
    num_rows: 11194
})


## Split the data as train:validation into 4:1

In [3]:
# Split the dataset into 80% test and 20% validation
split_datasets = dataset.train_test_split(test_size=0.2, seed=42)

In [4]:
# Access the test and validation datasets
train_dataset = split_datasets['train']  # 80% (test)
validation_dataset = split_datasets['test']  # 20% (validation)

# Verify the splits
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(validation_dataset)}")

Train dataset size: 8955
Validation dataset size: 2239


### Pick an example from the dataset

In [5]:
example = train_dataset[0]
print(example)

{'id': '34593064', 'doc': "Vincent Hudson has been charged with the historical offence on a boy under the age of 16, Cumbria Police said.\nA spokesman for the force said the attack was reported to have taken place in 2001.\nMr Hudson, 54, of Loweswater Avenue, Whitehaven, is due to appear before North Cumbria Magistrates' Court later, the spokesman added.", 'summary': 'A man from Cumbria has been charged in connection with the rape of a boy 14 years ago.', 'is_factual': 1}


## Finetuning

### Load model and dataset

In [6]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_path = "meta-llama/Llama-2-7b-hf"

# Load model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Fix padding token issue
tokenizer.pad_token = tokenizer.eos_token  # Set pad_token to eos_token
model.config.pad_token_id = tokenizer.pad_token_id  # Ensure the model recognizes pad_token_id

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Prepare dataset

In [7]:
def preprocess_function(examples):
    # Combine document and summary with [SEP]
    inputs = [doc + " [SEP] " + summary for doc, summary in zip(examples['doc'], examples['summary'])]
    
    # Tokenize the inputs
    model_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=512)
    
    # Add labels to the tokenized inputs
    model_inputs["labels"] = examples["is_factual"]
    return model_inputs

# Tokenize datasets
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = validation_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2239 [00:00<?, ? examples/s]

Map: 100%|██████████| 2239/2239 [00:00<00:00, 3748.27 examples/s]


In [8]:
def filter_invalid_labels(dataset):
    # Remove entries where the label is -1
    return dataset.filter(lambda example: example['labels'] != -1)

tokenized_train = filter_invalid_labels(tokenized_train)
tokenized_val = filter_invalid_labels(tokenized_val)


Filter:   0%|          | 0/2239 [00:00<?, ? examples/s]

Filter: 100%|██████████| 2239/2239 [00:00<00:00, 3176.59 examples/s]


### LoRA configuration

In [9]:
from peft import LoraConfig, get_peft_model

# Apply LoRA configuration
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"
)
model = get_peft_model(model, peft_config)

# Print trainable parameters to confirm LoRA setup
model.print_trainable_parameters()

trainable params: 8,396,800 || all params: 6,615,748,608 || trainable%: 0.1269


### Compute accuracy and f1

### Training parameters

In [10]:
from transformers import TrainingArguments, Trainer, TrainerCallback
import numpy as np
import evaluate

# Load metrics
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

# Define a function to compute training metrics
def compute_training_metrics(predictions, labels):
    predictions = np.argmax(predictions, axis=-1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    return accuracy["accuracy"]

# Callback to compute and log training accuracy
class TrainingAccuracyCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, model=None, tokenizer=None, train_dataloader=None, **kwargs):
        if train_dataloader:
            model.eval()
            total_accuracy = 0
            total_samples = 0
            
            for batch in train_dataloader:
                # Move batch to appropriate device
                inputs = {k: v.to(args.device) for k, v in batch.items()}
                with torch.no_grad():
                    outputs = model(**inputs)
                    predictions = outputs.logits
                    labels = inputs["labels"]
                    
                    # Compute batch accuracy
                    batch_accuracy = compute_training_metrics(predictions.cpu().numpy(), labels.cpu().numpy())
                    total_accuracy += batch_accuracy * len(labels)
                    total_samples += len(labels)
            
            avg_training_accuracy = total_accuracy / total_samples
            print(f"Epoch {state.epoch + 1}: Training Accuracy = {avg_training_accuracy:.4f}")

# Define compute_metrics for validation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="macro")
    return {
        "accuracy": accuracy["accuracy"],
        "f1": f1["f1"]
    }

# Training arguments
training_args = TrainingArguments(
    output_dir="./llama_xsum",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    save_strategy="no",
    save_total_limit=1
)

# Trainer with custom callback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[TrainingAccuracyCallback()]  # Add the custom callback
)

# Train the model
trainer.train()


  trainer = Trainer(


OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB. GPU 0 has a total capacity of 47.44 GiB of which 82.25 MiB is free. Including non-PyTorch memory, this process has 47.14 GiB memory in use. Of the allocated memory 46.59 GiB is allocated by PyTorch, and 256.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

### Upload model to huggingface

In [1]:
from transformers import LlamaForSequenceClassification, LlamaTokenizer

model = LlamaForSequenceClassification.from_pretrained("trinadutta/finetuned_llama")
tokenizer = LlamaTokenizer.from_pretrained("trinadutta/finetuned_llama")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
