In [2]:
import torch
import pandas as pd
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# dataset = load_dataset("fotiecodes/jarvis-llama2-dataset")
# dataset.save_to_disk("datasets")

In [4]:
dataset = load_from_disk("datasets/train/")
dataset

Dataset({
    features: ['text'],
    num_rows: 230
})

In [5]:
df = pd.DataFrame(dataset)
# print(df.columns)
# print(df['text'])
# print(type(df['text']))
df['text'][0]

'<s>[INST] Do you need further clarification? [/INST] Negative, I have a clear understanding of the instructions. </s>'

GPU- Cuda

In [6]:
import torch.version


print("Pytorch version: ", torch.__version__)
print("Cuda version:", torch.version.cuda)
print("GPU available: ", torch.cuda.is_available())
print("No of GPUs: ", torch.cuda.device_count())
print("GPU Name: ", torch.cuda.get_device_name())
device = "cuda" if torch.cuda.is_available() else "cpu"

Pytorch version:  2.7.1+cu126
Cuda version: 12.6
GPU available:  True
No of GPUs:  1
GPU Name:  NVIDIA GeForce RTX 2050


#### Quantization configurations

In [7]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16,
)

#### Device & Model configurations

In [8]:
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

#### LoRA configurations

In [9]:
lora_config = LoraConfig(
    r = 8,  # rank
    
    lora_alpha=16,      # scaling factor: after training the ouput of LoRA module will be scaled by 16/8 = 2.
    
    target_modules=['q_proj', 'v_proj'],    # which layers to modify. here- query & value projection layers.
    
    lora_dropout=0.05,      # regularization to prevent overfitting. here- 5% dropout(randomly zeros out parts of the LoRA weights during training)
    
    bias='none',        # here no bias terms will be added.
    
    task_type=TaskType.CAUSAL_LM    # CAUSAL_LM are models that predict the next token based on past tokens
)

In [10]:
model = get_peft_model(model, lora_config)

#### Q&A Training

In [11]:
def tokenize(batch):
    tokens = tokenizer(
        batch['text'],
        truncation=True,    # Ensures that if the tokenized sequence is longer than max_length, it will be cut off
        padding='max_length',
        max_length=256,
        return_tensors = 'pt' # returns a Pytorch Tensor
    )
    tokens['labels'] = tokens['input_ids'].clone()
    return tokens

In [12]:
tokenized_dataset = dataset.map(
    tokenize,
    batched=True,
    remove_columns=dataset.column_names
)

In [13]:
training_args = TrainingArguments(
    output_dir='trained_model/echo-first-trained',
    per_device_train_batch_size=3,
    gradient_accumulation_steps=3,
    learning_rate=1e-3,
    num_train_epochs=15,
    fp16=True,
    logging_steps=20,   # watch the status at every 20th step
    save_strategy='epoch',
    report_to='none',
    remove_unused_columns=False,
    label_names=['labels']
)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    processing_class=tokenizer
)

In [15]:
trainer.train()

Step,Training Loss
20,3.9942
40,0.2742
60,0.1967
80,0.1711
100,0.1366
120,0.1128
140,0.1027
160,0.0861
180,0.0756
200,0.0646


TrainOutput(global_step=390, training_loss=0.2886335821487965, metrics={'train_runtime': 899.8447, 'train_samples_per_second': 3.834, 'train_steps_per_second': 0.433, 'total_flos': 5488057044172800.0, 'train_loss': 0.2886335821487965, 'epoch': 15.0})

In [16]:
model.save_pretrained("trained_model/echo-tinyllama-lora-adapter_finetuned")
tokenizer.save_pretrained("trained_model/echo-tinyllama-lora-adapter_finetuned")

"""What we just saved:
    1. The folder contains only the LoRA adapter weights and configuration,
    2. Tokenizer files.
    
    But: 
    *** The base model itself is not changed; the adapter is loaded and used together with it. ***
    i.e. adapter weights are applied on top of the base model at runtime. 
    
    So now we need to merge the adapters & the base model get a standalone finetuned model.
"""

'What we just saved:\n    1. The folder contains only the LoRA adapter weights and configuration,\n    2. Tokenizer files.\n\n    But: \n    *** The base model itself is not changed; the adapter is loaded and used together with it. ***\n    i.e. adapter weights are applied on top of the base model at runtime. \n\n    So now we need to merge the adapters & the base model get a standalone finetuned model.\n'

In [17]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# model = AutoModelForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
# tokenizer = AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')

In [18]:
input_text = "hello are you my assisstant?"
input_ids = tokenizer.encode(input_text, return_tensors='pt')
attention_mask = input_ids.new_ones(input_ids.shape)
output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=100,
    do_sample=True,
    top_k=5,
    top_p=0.95,
    temperature=0.7,
    repetition_penalty=1.5,
    num_return_sequences=1    
)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)



RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)