In [None]:
# install all dependencies
!pip install datasets transformers 
!pip install bitsandbytes


In [None]:

import os

# Read the Hugging Face access token from the HF_TOKEN environment variable so the
# secret does not have to be written into (and shared with) the notebook.
# The placeholder is only a fallback to be replaced by hand; tokens are issued
# at huggingface.co.
HUGGING_FACE_TOKEN = os.environ.get("HF_TOKEN", "your_token_here")

In [None]:

from huggingface_hub import login
# Authenticate this session against the Hugging Face Hub with the token defined above.
login(HUGGING_FACE_TOKEN)


In [None]:
from datasets import load_dataset
from transformers import LlamaForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType


In [None]:

# Fetch the "train" split of the synthetic code-generation dataset from the Hub.
dataset = load_dataset(
    "VishaalY/synthetic-code-generations",
    split="train",
)

In [None]:

# Hold out the first 1000 rows for evaluation; every row after them is used for training.
total_rows = len(dataset)
eval_dataset = dataset.select(range(0, 1000))
train_dataset = dataset.select(range(1000, total_rows))

In [None]:

# Checkpoint to fine-tune; the same name is reused later to load the tokenizer.
model_name = "meta-llama/Llama-3.2-1B"

# Load the base causal-LM weights in full precision.
# use_cache=False: gradient checkpointing is enabled on this model in the next cell.
model = LlamaForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="float32",
    use_cache=False,
)

In [None]:

# Turn on gradient checkpointing for the loaded model (it was loaded with
# use_cache=False in the previous cell).
model.gradient_checkpointing_enable()


In [None]:
# Load the tokenizer published under the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:

# Make sure the tokenizer has a padding token, since tokenization below pads to a fixed length.
if tokenizer.pad_token is None:
    if tokenizer.eos_token:
        # Reuse the end-of-sequence token: no new vocabulary entry is needed.
        tokenizer.pad_token = tokenizer.eos_token
    else:
        # No EOS token to reuse: register a dedicated "[PAD]" token instead.
        num_added = tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        if num_added:
            # A newly added token needs a matching row in the model's embedding
            # matrix, otherwise its id would index past the end of the embeddings.
            model.resize_token_embeddings(len(tokenizer))


In [None]:

def tokenize_function(examples):
    """Tokenize the 'response' text and build causal-LM labels for it.

    Args:
        examples: Mapping with a 'response' entry holding either a list of
            strings (batched ``Dataset.map``) or a single string.

    Returns:
        The tokenizer output (padded/truncated to 400 tokens) with an extra
        'labels' entry. Labels mirror 'input_ids', except that positions where
        'attention_mask' is 0 (padding) are set to -100 so they are excluded
        from the loss.
    """
    encoding = tokenizer(examples['response'], padding="max_length", truncation=True, max_length=400)

    def _mask_padding(ids, mask):
        # -100 is the label value that the loss computation ignores; it keeps
        # the model from being trained to predict padding tokens.
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    if input_ids and isinstance(input_ids[0], (list, tuple)):
        # Batched input: one id sequence per example.
        labels = [_mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)]
    else:
        # Single example (or an empty batch): a flat id sequence.
        labels = _mask_padding(input_ids, attention_mask)

    # Labels are a fresh list, not an alias of 'input_ids'.
    encoding['labels'] = labels
    return encoding

In [None]:

# Tokenize both splits with the same batched mapping: training split first, then evaluation.
tokenized_train_dataset, tokenized_eval_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, eval_dataset)
)

In [None]:

# LoRA adapter settings for causal-LM fine-tuning (training mode, not inference).
lora_config = LoraConfig(
    r=8,                           # rank of the low-rank update matrices
    lora_alpha=16,                 # LoRA scaling parameter
    lora_dropout=0.1,              # dropout applied within the LoRA layers
    inference_mode=False,
    task_type=TaskType.CAUSAL_LM,
)

In [None]:
# Wrap the loaded model with the LoRA adapters described by `lora_config`.
# NOTE: this rebinds `model`, so re-running this cell without reloading the base
# model passes the already-wrapped model back into get_peft_model.
model = get_peft_model(model, lora_config)

In [None]:
# Print the trainable-parameter summary of the LoRA-wrapped model as a sanity check.
model.print_trainable_parameters()

In [None]:
# Per-device batch size; reused for both training and evaluation in TrainingArguments below.
batch_size = 2

# Number of tokenized training examples.
train_size = len(tokenized_train_dataset)

# Whole batches in one pass over the training data (floor division drops any remainder).
num_train_steps = train_size // batch_size


In [None]:
# Training configuration.
# The run length is fixed by `max_steps` and the warmup by `warmup_steps`; the
# `num_train_epochs` and `warmup_ratio` settings previously passed alongside them
# were overridden by these two and have been dropped to remove the contradiction.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,   # one optimizer step per 4 batches
    max_steps=60,                    # total number of optimizer steps for this run
    warmup_steps=5,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    fp16=False,
    logging_dir="./logs",
    logging_steps=10,
    # save_steps exceeds max_steps, so presumably no intermediate checkpoint is
    # written; the model is saved explicitly once training finishes.
    save_steps=500,
    save_total_limit=2,
    report_to=["none"],
    dataloader_num_workers=2,
    group_by_length=True,
)


In [None]:
# Bundle the LoRA-wrapped model, the training configuration and both tokenized splits.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_eval_dataset,
    train_dataset=tokenized_train_dataset,
)


In [None]:
# Start training: runs the fine-tuning loop using the model, arguments and
# datasets that were handed to `trainer`.
trainer.train()


In [None]:
# Output directory for the fine-tuned artifacts (placeholder path; adjust before running).
DIR_PATH = "/path/to/save/model"

# Persist the model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(DIR_PATH)
