# 1. Create a kaggle account https://www.kaggle.com/
# 2. create a wnb account https://wandb.ai/site/
# 3. model load steps is defined below while implemetation

# install libraries
- transformer - huggingface lib provide the pretraind transformer based model
- accelerate - hugging face lib allows for quickly set up models for training on different hardware configurations  without needing to change the code.
- peft -fine-tuning process for large models, helping you fine-tune them with fewer parameters.
- dataset - huggingface lib for the loading and managing the dataset

In [2]:
!pip install -q transformers accelerate peft datasets

# Import libraries

In [3]:
import torch # provides tensors and neural network functionality.
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling
# 1. huggingface lib tool to tokenize the text using the pretrained modelonvert raw text into tokenized input that a model can process. It handles tasks like splitting text into words, converting them to IDs
# 2. load the pretrain language model for text generation 
# 3. create a mask language modeling data make sure each batch has uniform length and structure.
from datasets import load_dataset # load dataset
from peft import get_peft_model, LoraConfig # 1. combine the model and lora config  
from transformers import Trainer, TrainingArguments # 1.train and eval the model in nlp task 2. class to set train config like LR, batch size 
import time

# * Load the Model and Tokenizer*

***Define the path for the pretrained LLaMA model & tokenizer.***

***We are using Llama 3.2-1B for summarization.***

***You can request the model here - https://www.llama.com/llama-downloads/***
- Go to the https://www.llama.com/llama-downloads/ website and fill out the form with your information. Select both lightweight and vision models. 
 <img src = "https://lh7-rt.googleusercontent.com/docsz/AD_4nXcFiUrvxq64Fhs_6llqddOvsWsVdoqCDVXHQ6aD3qJdGa18x--DbqkchRWZKTDxHPi_Q7iKgo70uUaVyVtWf3qc5N4PIganyeOU0Iok9seaZ6-zhQNjI5WZSIfTra4GzmeYij4OOmMWpT8J_apNwxM65lTr?key=BktxNAbDzZ2rY2knU23WEA" height=750 width=750>

 - Go to the Meta | Llama 3.2 model page o kaggle and click the “Submit Form” button. 
 <img src = "https://lh7-rt.googleusercontent.com/docsz/AD_4nXfH27KjswHaCkhkYV_riCqMeU7uyZyXiJlZBZRgPTn9kjeTk4YEBnHvdCD5U5ekS6X7Jpq8El8nCWT5qJfop5xz3jLU_u2zdyi89nss0VrMWXrUgLryyyGSij5qivA9q0GwIuSKm-mGQK3C-4pxxubEBbZh?key=BktxNAbDzZ2rY2knU23WEA" height=750 width=750>

 - Wait a few minutes until you see the option to either download or create the new notebook. Select the Transformers tab and model variation, then click the “+ New Notebook” button.
 <img src = "https://lh7-rt.googleusercontent.com/docsz/AD_4nXe-i6xbHHcVR3CnH009oxWJfDDLUKrdBQnINzaR3342u0KRrOmtz5RCsQXg0q_uWA62OqWHTJKS2jQRDmVcWgVrdh_6OixTUhEf-mnYXFQ7AVRHnPC_VNFlLVHeWY0T2EkvL0lSp52ssERvM-2FoIIHpM9Y?key=BktxNAbDzZ2rY2knU23WEA" height=750 width=750>

 - click on new notebook with transformer option 

***Once you have access to model add it to kaggle input***

In [4]:
# Define the model path
model_path = "/kaggle/input/llama-3.2/transformers/1b/1"
#Loads a pre-trained tokenizer from the specified model path.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Set device (CUDA if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and move it to the device
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True, # allows for the including the custom configuration
    torch_dtype=torch.float16,   # This specifies the data type for the model's weights and computations.torch.float16 refers to half-precision floating point 
).to(device)

# setting up lora configuraton
-<img src = "https://miro.medium.com/v2/resize:fit:720/format:webp/1*rOW5plKBuMlGgpD0SO8nZA.png" height=750 width=750> \n

- The pre-trained parameters of the original model (W) are frozen. During training, these weights will not be modified.
- A new set of parameters is concurrently added to the networks WA and WB. These networks utilize low-rank weight vectors, where the dimensions of these vectors are represented as dxr and rxd. Here, ‘d’ stands for the dimension of the original frozen network parameters vector, while ‘r’ signifies the chosen low-rank or lower dimension
- value of r should be smaller to simplified model training process


In [5]:
# Set up LoRA config for PEFT
lora_config = LoraConfig(
    r=8, # rank of low-rank metrix used
    lora_alpha=32, # scale factor that controls how much the low-rank updates affect the model.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
#     q_proj: Query projection.
#     k_proj: Key projection.
#     v_proj: Value projection.
#      o_proj: Output projection
    lora_dropout=0.1, # 0.1 means that 10% of the low-rank parameters will be dropped during training.
    bias="none" # no bias term
)

# Apply PEFT to the modelx
model = get_peft_model(model, lora_config)

# Freeze parameters that are not LoRA
for name, param in model.named_parameters():
    if "lora" not in name:  # Freeze parameters that are not LoRA
        param.requires_grad = False
    else:
        param.requires_grad = True  # LoRA parameters should be trainable

# Dataset loading
- srource :  https://huggingface.co/datasets/EdinburghNLP/xsum
- Extreme Summarization (XSum) Dataset.
- 
There are three featur
- 

document: Input news artic- le.
summary: One sentence summary of the arti- cle.
id: BBC ID of the ar
- news artical summery dataset
- we are using only 5% dataset for faster training ticle.

In [6]:
# Load the dataset (subsetting for testing)
# function from the datasets library that fetches various NLP datasets
dataset = load_dataset("xsum", split="train[:5%]")
# Set the pad_token to be the same as eos_token
# is pad token is not set than set that using the eos_token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

#Converts text data into tokenized numerical data that can be processed by the model
# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["document"], padding="max_length", truncation=True, max_length=512)

# Apply the tokenization function to the dataset and it process batch wise
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns
# because we have get our tokens
tokenized_dataset = tokenized_dataset.remove_columns(["document", "summary", "id"])

# Check the columns in the tokenized dataset
print(tokenized_dataset.column_names)

# Split the dataset into training and validation (using same for both in this case)
train_dataset = tokenized_dataset
eval_dataset = tokenized_dataset  # For this small example, using the same dataset for eval

# Prepare data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

xsum.py:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

The repository for xsum contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/xsum.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


(…)SUM-EMNLP18-Summary-Data-Original.tar.gz:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.72M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

Map:   0%|          | 0/10202 [00:00<?, ? examples/s]

['input_ids', 'attention_mask']


# setting up the training configuration 

In [7]:
# Setup training arguments
training_args = TrainingArguments(
    output_dir="./results", #  Directory to save training results, checkpoints
    evaluation_strategy="epoch", # run code epoch wise "epoch" means evaluation will occur at the end of each training epoch.
    per_device_train_batch_size=2, # batch size for each training step per device
    per_device_eval_batch_size=2, # for eval
    num_train_epochs=1, # num of epoch]
    save_strategy="epoch", # 
    logging_dir="./logs", # Directory where log files will be stored.
    logging_steps=10, # model logs training metrics every 10 steps.
    learning_rate=5e-5, # LR
    remove_unused_columns=False, # Keeps all columns in the dataset, which may be useful if the model uses multiple input features.
)

# Initialize Trainer
trainer = Trainer(
    model=model, # model
    args=training_args, # training config
    train_dataset=train_dataset, # train dataset
    eval_dataset=eval_dataset, #eval dataset
    data_collator=data_collator,# data collator to mask text padding..
)

# Function to generate a response from the model
def generate_response(model, tokenizer, prompt, device):
    # convert the promt into the token
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=100)  # Limit the number of tokens generated
    response = tokenizer.decode(output[0], skip_special_tokens=True) # special_token = omitting special tokens like <EOS> or <PAD> with skip_special_tokens=True.
    return response

# Sample text for inference
sample_text = "My name is harshil i am "

# Get model response before training
print("Generating response before training...")
response_before = generate_response(model, tokenizer, sample_text, device)
print(f"Response before training:\n{response_before}\n")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generating response before training...


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Response before training:
My name is harshil i am 18 years old and i am from india. i am a student of 12th class and i am very interested in computer science. i am very good in programming and i am very good in maths. i am very good in english and i am very good in hindi. i am very good in science and i am very good in social science. i am very good in sports and i am very good in games. i am very good in studies and i am very good in sports. i am



# 1. goto https://wandb.ai/
# 2. sign up in website
# 3. go to  https://wandb.ai/authorize
# 4. copy the authorization key 
# 5. paste the key and press enter

In [8]:
# Train the model
trainer.train() 

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112862588888422, max=1.0…

Epoch,Training Loss,Validation Loss
1,2.3912,No log


TrainOutput(global_step=5101, training_loss=2.399479469255568, metrics={'train_runtime': 6913.3202, 'train_samples_per_second': 1.476, 'train_steps_per_second': 0.738, 'total_flos': 3.055233082274611e+16, 'train_loss': 2.399479469255568, 'epoch': 1.0})

In [9]:
# Save the trained model
output_dir = "/kaggle/working/FinalLora"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model saved to {output_dir}")

Model saved to /kaggle/working/FinalLora


In [10]:
import shutil

# Path to the model directory
model_dir = '/kaggle/working/FinalLora'

# Path to save the zip file
zip_path = '/kaggle/working/FinalLora.zip'

# Zip the directory
shutil.make_archive(zip_path.replace('.zip', ''), 'zip', model_dir)

print(f"Model zipped at: {zip_path}")

Model zipped at: /kaggle/working/FinalLora.zip


In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset # dataset loading
import torch
import numpy as np
from sklearn.metrics import accuracy_score # to calculate accuracy

# Load the saved model and tokenizer
model_path = "/kaggle/working/FinalLora"  # Path to the saved model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16)

# Set device (CUDA if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the test dataset
test_dataset = load_dataset("xsum", split="test[:5%]")  # Using 5% for testing (adjust as needed)

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["document"], padding="max_length", truncation=True, max_length=512)

# Apply the tokenization function to the test dataset
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["document", "summary", "id"])

# Prepare data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Setup evaluation arguments (not training)
training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=4,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",  # Disable logging to other platforms (e.g., WandB, TensorBoard)
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Evaluate the model on the test dataset
eval_results = trainer.evaluate()

# Print the evaluation results
print("Evaluation Results:", eval_results)

#  Accuracy based on token-level comparison (this is an approximate metric for a language model)
def compute_accuracy(predictions, labels):
    # Get the predicted tokens (argmax over vocab logits)
    pred_ids = np.argmax(predictions, axis=-1)
    # Flatten the token IDs to compare them
    flat_pred_ids = pred_ids.flatten()
    flat_labels = labels.flatten()
    return np.sum(flat_pred_ids == flat_labels) / len(flat_labels)

# Define a function to calculate accuracy per batch
def calculate_accuracy(batch):
    inputs = tokenizer(batch["document"], padding="max_length", truncation=True, max_length=512, return_tensors="pt").to(device)
    labels = inputs.input_ids.clone()
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predictions = logits.argmax(dim=-1)
    return compute_accuracy(predictions.cpu().numpy(), labels.cpu().numpy())

# Compute accuracy on the test dataset (for simplicity, we do a batch-wise calculation)
accuracy = 0.0
num_batches = 0

for batch in test_dataset:
    accuracy += calculate_accuracy(batch)
    num_batches += 1

# Compute the final accuracy
accuracy = accuracy / num_batches
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Map:   0%|          | 0/567 [00:00<?, ? examples/s]

Evaluation Results: {'eval_loss': 2.3998351097106934, 'eval_model_preparation_time': 0.014, 'eval_runtime': 115.5709, 'eval_samples_per_second': 4.906, 'eval_steps_per_second': 1.229}
Test Accuracy: 0.08%
