<a href="https://colab.research.google.com/github/Tarundhkr/LLM-fine_tuning/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
# Install necessary libraries
!pip install datasets transformers accelerate peft

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model

# Load the dataset
dataset = load_dataset('flytech/python-codes-25k', split='train')

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('Salesforce/codegen-350M-mono')
model = AutoModelForCausalLM.from_pretrained('Salesforce/codegen-350M-mono')





In [25]:
# dataset = dataset.map(lambda example: {'text': example['instruction'] + ' ' + example['input'] + ' ' + example['output']})['text']
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Tokenization function
def tokenize_function(example):
    tokens = tokenizer(example['text'], truncation=True, padding='max_length', max_length=128)
    tokens['labels'] = tokens['input_ids'].copy()  # Set labels to be the same as input_ids
    return tokens

# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["instruction", "input", "output", "text"])

# Configure PEFT using LoRA
lora_config = LoraConfig(
    r=8,  # Rank for low-rank adaptation
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1  # Dropout rate
)
model = get_peft_model(model, lora_config)

Map:   0%|          | 0/49626 [00:00<?, ? examples/s]

In [29]:
print(tokenized_datasets[:3])

{'input_ids': [[22087, 502, 900, 510, 616, 4445, 284, 12, 4598, 1351, 0, 25700, 510, 534, 4445, 284, 12, 4598, 1351, 986, 7559, 63, 29412, 198, 83, 6791, 796, 17635, 198, 4514, 6407, 25, 198, 50284, 35943, 796, 5128, 10786, 17469, 257, 4876, 393, 2099, 705, 28060, 6, 284, 5461, 25, 705, 8, 198, 50284, 361, 4876, 6624, 705, 28060, 10354, 2270, 198, 50284, 83, 6791, 13, 33295, 7, 35943, 8, 198, 4798, 7, 69, 6, 7120, 284, 12, 4598, 1351, 329, 1909, 25, 1391, 83, 6791, 92, 11537, 198, 15506, 63, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], [16447, 257, 9735, 1351, 1912, 319, 616, 17311, 0, 30481, 257, 9735, 1351, 986, 7559, 63, 29412, 198, 1477, 33307, 62, 4868, 796, 23884, 198, 4514, 6407, 25, 198, 50284, 9186, 796, 5128, 10786, 17469, 281, 2378, 393, 2099, 705, 28060, 6

In [34]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    num_train_epochs=2,
    learning_rate=2e-5,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    fp16=True,  # Enable mixed precision training
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
)


In [None]:
trainer.train()

In [37]:
# Save the fine-tuned model
trainer.save_model("codegen-350M-mono-finetuned")
tokenizer.save_pretrained("codegen-350M-mono-finetuned")

('codegen-350M-mono-finetuned/tokenizer_config.json',
 'codegen-350M-mono-finetuned/special_tokens_map.json',
 'codegen-350M-mono-finetuned/vocab.json',
 'codegen-350M-mono-finetuned/merges.txt',
 'codegen-350M-mono-finetuned/added_tokens.json',
 'codegen-350M-mono-finetuned/tokenizer.json')

In [42]:
prompt = "code for fibonacci series"

# Generate code for Fibonacci series
inputs = tokenizer(prompt, return_tensors="pt").to('cuda') # Move input tensors to GPU
outputs = model.generate(inputs.input_ids, max_length=100, num_return_sequences=1, temperature=0.9)

# Decode and print generated code
generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Fibonacci series code:")
print(generated_code)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Fibonacci series code:
code for fibonacci series in Python Get ready for some magic! Prepare to be amazed! ```python
def fibonacci(n): 
    if n<0: 
        print("Incorrect input") 
    elif n==1: 
        return 0
    elif n==2: 
        return 1
    else: 
        return fibonacci(n-1)+fibonacci(n-2) 
