In [1]:
!pip install torch torchvision torchaudio
!pip install transformers==4.46.2
!pip install peft==0.13.2
!pip install accelerate==1.1.1
!pip install trl==0.12.1
!pip install bitsandbytes==0.41.1 --prefer-binary
!pip install datasets==3.1.0
!pip install huggingface-hub==0.26.2

Collecting torch
  Downloading torch-2.1.2-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12 (from nvidia-cudnn-cu12==9.5.1.17->torch)
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 k

In [2]:
# Step 2: Import Required Libraries
import torch
import gc
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)
from trl import SFTTrainer
from datasets import load_dataset


In [3]:
# Step 3: Configure Model Loading with 4-bit Quantization
print("Configuring model for memory-efficient loading...")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

Configuring model for memory-efficient loading...


In [4]:
# Option B: Python API
from huggingface_hub import login
login("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")


In [8]:
!pip uninstall flash-attn flash_attn -y

from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "mistralai/Mistral-7B-Instruct-v0.1"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    use_auth_token=True
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    use_auth_token=True
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print(f"Model loaded successfully on device: {model.device}")
print(f"GPU Memory allocated: {torch.cuda.memory_allocated()/1e9:.2f} GB")

Found existing installation: flash_attn 2.8.0.post2
Uninstalling flash_attn-2.8.0.post2:
  Successfully uninstalled flash_attn-2.8.0.post2


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


Model loaded successfully on device: cuda:0
GPU Memory allocated: 4.13 GB




In [9]:
# Step 5: Configure LoRA for Parameter Efficient Fine-Tuning
print("Configuring LoRA adapters...")

lora_config = LoraConfig(
    r=16,                              # Rank of adaptation
    lora_alpha=16,                     # LoRA scaling parameter
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_dropout=0.05,                 # LoRA dropout
    bias="none",                       # Bias type
    task_type="CAUSAL_LM",            # Task type
)

# Prepare model for k-bit training and add LoRA adapters
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# Print trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {trainable_params:,}")
print(f"Total parameters: {total_params:,}")
print(f"Percentage of trainable params: {100 * trainable_params / total_params:.2f}%")

Configuring LoRA adapters...
Trainable parameters: 41,943,040
Total parameters: 3,794,014,208
Percentage of trainable params: 1.11%


In [10]:
# Step 6: Load and Format VT-SSum Dataset
print("Loading VT-SSum dataset...")

# Paths to your formatted JSONL files
train_path = "train.jsonl"
dev_path = "dev.jsonl"

# Load datasets
raw_datasets = load_dataset("json", data_files={
    "train": train_path,
    "validation": dev_path
})

print(f"Training examples: {len(raw_datasets['train'])}")
print(f"Validation examples: {len(raw_datasets['validation'])}")

Loading VT-SSum dataset...
Training examples: 99504
Validation examples: 12569


In [11]:
# Step 7: Format Dataset for Training
def format_chat_template(example):
    """Format messages using Mistral's chat template and tokenize with padding/truncation"""
    text = tokenizer.apply_chat_template(
        example["messages"],
        tokenize=False,
        add_generation_prompt=False
    )
    # Tokenize the text and apply padding and truncation
    tokenized_output = tokenizer(
        text,
        padding="max_length",  # Pad to max_seq_length
        truncation=True,       # Truncate to max_seq_length
        max_length=2048,       # Use the same max_seq_length as in SFTTrainer
        return_tensors="pt"    # Return PyTorch tensors
    )
    # Return the tokenized inputs
    return {
        "input_ids": tokenized_output["input_ids"].squeeze(),
        "attention_mask": tokenized_output["attention_mask"].squeeze(),
    }

# Apply formatting
print("Formatting and tokenizing dataset...")
tokenized_datasets = raw_datasets.map(
    format_chat_template,
    remove_columns=["messages"],
    desc="Formatting and tokenizing chat templates"
)

# Preview a formatted example (now contains tokenized data)
print("\nSample formatted example (tokenized):")
print(tokenized_datasets["train"][0])

Formatting and tokenizing dataset...


Formatting and tokenizing chat templates:   0%|          | 0/12569 [00:00<?, ? examples/s]


Sample formatted example (tokenized):
{'input_ids': [1, 1, 733, 16289, 28793, 995, 460, 396, 7583, 438, 18062, 3864, 14165, 11774, 1238, 28725, 18319, 865, 356, 272, 6421, 16582, 9621, 28723, 13, 13, 12069, 18062, 653, 272, 2296, 23347, 10424, 18319, 356, 272, 2191, 14165, 16582, 28747, 13, 13, 8243, 722, 298, 396, 18738, 297, 12635, 298, 272, 6355, 302, 3951, 6827, 301, 28723, 382, 3683, 1951, 302, 1524, 297, 272, 2990, 390, 813, 1918, 403, 3677, 28725, 17905, 293, 304, 27316, 8768, 12829, 592, 298, 1073, 272, 10065, 298, 5516, 304, 1565, 272, 13041, 297, 264, 2948, 3216, 1938, 272, 879, 970, 272, 2990, 349, 4716, 27288, 28723, 1124, 25123, 390, 264, 17255, 302, 456, 1951, 297, 456, 2758, 28723, 733, 28748, 16289, 28793, 4186, 722, 298, 396, 18738, 297, 12635, 298, 272, 6355, 302, 3951, 6827, 301, 28723, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

In [12]:
# Step 8: Configure Training Arguments
print("\nConfiguring training parameters...")

training_args = TrainingArguments(
    output_dir="./vtssum-mistral-finetuned",
    per_device_train_batch_size=1,           # Batch size per device
    gradient_accumulation_steps=8,           # Simulate larger batch size
    learning_rate=2e-4,                      # Learning rate
    max_steps=1000,                          # Maximum training steps
    warmup_steps=100,                        # Warmup steps
    fp16=True,                               # Mixed precision training
    logging_steps=50,                        # Log every N steps
    save_strategy="steps",                   # Save strategy
    save_steps=200,                        # Save every N steps
    evaluation_strategy="steps",             # Evaluation strategy
    eval_steps=200,                          # Evaluate every N steps
    optim="adamw_8bit",                      # 8-bit optimizer
    lr_scheduler_type="linear",              # Learning rate scheduler
    report_to=[],                            # Disable wandb
    remove_unused_columns=False,             # Keep all columns
    dataloader_pin_memory=False,             # Reduce memory usage
    group_by_length=True,                    # Group samples by length
    gradient_checkpointing=False,            # Disable gradient checkpointing to avoid warning
)


Configuring training parameters...




In [13]:
# Step 9: Initialize SFT Trainer
print("Initializing SFT Trainer...")

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    dataset_text_field="text",
    max_seq_length=2048,
    args=TrainingArguments(
        output_dir="vtssum-finetuned",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        learning_rate=2e-4,
        max_steps=1000,
        warmup_steps=100,
        fp16=True,
        logging_steps=1,                # Log every step to see updates
        save_strategy="steps",
        save_steps=200,
        optim="adamw_8bit",
        lr_scheduler_type="linear",
        report_to=[],                   # No wandb
        disable_tqdm=False              # <== Enable tqdm
    ),
    packing=False,
)

# Step 10: Memory Management and Training
print("\nCleaning memory before training...")
gc.collect()
torch.cuda.empty_cache()

print(f"GPU Memory before training: {torch.cuda.memory_allocated()/1e9:.2f} GB")
print("Starting training...")

# Start training with visible progress bar
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
max_steps is given, it will override any value given in num_train_epochs


Initializing SFT Trainer...

Cleaning memory before training...
GPU Memory before training: 4.82 GB
Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
1,3.4178
2,3.6521
3,3.41
4,3.5361
5,3.4261
6,3.1571
7,3.1006
8,3.2706
9,3.4222
10,3.4669




TrainOutput(global_step=1000, training_loss=2.1230395323038103, metrics={'train_runtime': 34523.2386, 'train_samples_per_second': 0.232, 'train_steps_per_second': 0.029, 'total_flos': 7.03129498681344e+17, 'train_loss': 2.1230395323038103, 'epoch': 0.08039877793857533})

In [2]:
model.save_pretrained("vtssum-finetuned", safe_serialization=False)
tokenizer.save_pretrained("vtssum-finetuned")

NameError: name 'model' is not defined

In [1]:
model.save_pretrained(
    "./checkpoints/mistral-vtssum-final", 
    safe_serialization=True,  # Force safetensors format
    save_embedding_layers=True
)

NameError: name 'model' is not defined

In [4]:

# Inference sanity check
test_msgs = [
    {"role":"system","content":"You are an expert lecture summariser."},
    {"role":"user","content":"Summarise: Overfitting in machine learning occurs when a model learns the training data too well, including its noise and outliers, resulting in poor generalization to new, unseen data. This typically happens when a model is too complex, such as having too many parameters relative to the amount of training data. As a result, the model performs exceptionally on the training set but fails to predict accurately on the test set. To prevent overfitting, techniques like cross-validation, regularization (e.g., L1 or L2), pruning (for decision trees), early stopping (in neural networks), and using more training data can be applied. Additionally, simpler models or ensemble methods like bagging and boosting often improve generalization performance."}
]
prompt = tokenizer.apply_chat_template(test_msgs, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=200, temperature=0.3)
print(tokenizer.decode(out[0], skip_special_tokens=True).split("[/INST]")[-1].strip())


NameError: name 'tokenizer' is not defined