In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer,BitsAndBytesConfig,Trainer,TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
torch.cuda.init()

In [3]:
import os
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = "1"

In [4]:
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

In [5]:
model_name = "EleutherAI/gpt-neo-1.3B"

In [6]:
model = AutoModelForCausalLM.from_pretrained(model_name)

In [7]:
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

In [8]:
model = AutoModelForCausalLM.from_pretrained(model_name,quantization_config=quantization_config)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


In [9]:
from accelerate import infer_auto_device_map
device_map = infer_auto_device_map(model, max_memory={0: "6GiB", "cpu": "8GiB"})

In [10]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=quantization_config,
    offload_folder="offload_dir"
)

In [11]:
tokenizer= AutoTokenizer.from_pretrained(model_name)

In [12]:
if tokenizer.pad_token is None:
 tokenizer.pad_token = tokenizer.eos_token

In [17]:
dataset = load_dataset("json",data_files=r"D:\Prabha\NLP\Model\gen_ai_chatbot\tokenized_dataset_512_revizsed.jsonl",split="train")

In [18]:
def format_dataset(example):
    
    prompt = example['text']
    return {
        "input_ids": example['input_ids'],
        "attention_mask": example['attention_mask'],
        "labels": example['input_ids'] 
    }


In [19]:
dataset = dataset.map(format_dataset)

In [20]:
dataset = dataset.train_test_split(test_size=0.1)

In [21]:
lora = LoraConfig(task_type=TaskType.CAUSAL_LM,r=1,lora_alpha=16,lora_dropout=0.1,target_modules=["q_proj","v_proj"])

In [22]:
model = get_peft_model(model,lora)

In [23]:
model.print_trainable_parameters()

trainable params: 196,608 || all params: 1,315,772,416 || trainable%: 0.0149


In [24]:
output_dir =r"D:\Prabha\NLP\Model\gen_ai_chatbot\model\output\model_6"
logs=r"D:\Prabha\NLP\Model\gen_ai_chatbot\model\output\logs"

In [25]:
training_args = TrainingArguments(output_dir=output_dir,per_device_train_batch_size=1,gradient_accumulation_steps=4,learning_rate=2e-4,num_train_epochs=240,fp16=True,logging_dir=logs,save_strategy="epoch",logging_steps=100)

In [26]:
trainer = Trainer(model=model,args=training_args,train_dataset=dataset["train"],eval_dataset=dataset["test"],tokenizer=tokenizer)

  trainer = Trainer(model=model,args=training_args,train_dataset=dataset["train"],eval_dataset=dataset["test"],tokenizer=tokenizer)


In [27]:
trainer.train()



Step,Training Loss
100,2.2972
200,1.5643
300,1.1489
400,0.918
500,0.7712
600,0.6641
700,0.58
800,0.5143
900,0.4597
1000,0.4149




TrainOutput(global_step=2160, training_loss=0.5784547483479535, metrics={'train_runtime': 3369.6567, 'train_samples_per_second': 2.564, 'train_steps_per_second': 0.641, 'total_flos': 3.208013213073408e+16, 'train_loss': 0.5784547483479535, 'epoch': 240.0})

In [28]:
model.save_pretrained(r"D:\Prabha\NLP\Model\gen_ai_chatbot\model\output\model_6\save")



In [29]:
tokenizer.save_pretrained(r"D:\Prabha\NLP\Model\gen_ai_chatbot\model\output\model_6\save\tokenizer")

('D:\\Prabha\\NLP\\Model\\gen_ai_chatbot\\model\\output\\model_6\\save\\tokenizer\\tokenizer_config.json',
 'D:\\Prabha\\NLP\\Model\\gen_ai_chatbot\\model\\output\\model_6\\save\\tokenizer\\special_tokens_map.json',
 'D:\\Prabha\\NLP\\Model\\gen_ai_chatbot\\model\\output\\model_6\\save\\tokenizer\\vocab.json',
 'D:\\Prabha\\NLP\\Model\\gen_ai_chatbot\\model\\output\\model_6\\save\\tokenizer\\merges.txt',
 'D:\\Prabha\\NLP\\Model\\gen_ai_chatbot\\model\\output\\model_6\\save\\tokenizer\\added_tokens.json',
 'D:\\Prabha\\NLP\\Model\\gen_ai_chatbot\\model\\output\\model_6\\save\\tokenizer\\tokenizer.json')