In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  
import json
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
from safetensors.torch import save_file

In [None]:
def load_and_preprocess_data(train_file, validation_file, tokenizer, max_length=32):
    """Load JSON train/validation files and tokenize them for causal-LM fine-tuning.

    Every record must provide an ``input`` and an ``output`` field. The text fed
    to the model is the input, a newline, then ``str(output)``. Labels are a copy
    of ``input_ids`` in which every position that is not part of the answer is
    replaced by -100, so the loss is computed on the answer tokens only. Padding
    positions are masked as well, even when the pad token is the EOS token.

    Args:
        train_file: Path to the JSON / JSON Lines file used for the 'train' split.
        validation_file: Path to the JSON / JSON Lines file used for the
            'validation' split.
        tokenizer: Callable tokenizer with a pad token configured; it must return
            ``input_ids`` and ``attention_mask`` as plain Python lists.
        max_length: Fixed sequence length every example is truncated/padded to.

    Returns:
        The tokenized dataset dict (splits 'train' and 'validation') with the raw
        'input', 'output' and 'instruction' columns removed where present.
    """
    data_files = {
        'train': train_file,
        'validation': validation_file
    }
    dataset = load_dataset('json', data_files=data_files)

    def preprocess_function(examples):
        """Tokenize one batch and build answer-only labels."""
        prompts = [f"{inp}\n" for inp in examples['input']]
        full_texts = [prompt + str(out) for prompt, out in zip(prompts, examples['output'])]

        tokenized_full = tokenizer(
            full_texts,
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_attention_mask=True,
        )

        # Unpadded tokenization of the prompts alone; only the lengths are used,
        # to know how many leading tokens of each example belong to the prompt.
        prompt_input_ids = tokenizer(prompts, truncation=True, max_length=max_length)['input_ids']

        labels = []
        for input_ids, attention_mask, prompt_ids in zip(
            tokenized_full['input_ids'], tokenized_full['attention_mask'], prompt_input_ids
        ):
            # Real tokens start at the first attended position: index 0 with
            # right padding, right after the pad run with left padding.
            first_real = attention_mask.index(1) if 1 in attention_mask else len(attention_mask)
            answer_start = first_real + len(prompt_ids)

            # Keep the token id only for attended answer positions; prompt and
            # padding positions get -100 so they do not contribute to the loss.
            labels.append([
                token_id if attended and position >= answer_start else -100
                for position, (token_id, attended) in enumerate(zip(input_ids, attention_mask))
            ])

        tokenized_full['labels'] = labels

        return tokenized_full

    tokenized_datasets = dataset.map(preprocess_function, batched=True)

    # Drop the raw text columns split by split, and only those that exist, so a
    # file without an 'instruction' field does not make remove_columns raise.
    raw_columns = ('input', 'output', 'instruction')
    for split_name in list(tokenized_datasets.keys()):
        split = tokenized_datasets[split_name]
        present = [column for column in raw_columns if column in split.column_names]
        if present:
            tokenized_datasets[split_name] = split.remove_columns(present)

    return tokenized_datasets


# Fine-tuning data; the 'validation' split is read from the test file.
train_file = '/2_arithmetic_operations_100/finetune_pythia_100/finetune_data/train_100.jsonl'
validation_file = '/2_arithmetic_operations_100/finetune_pythia_100/finetune_data/test_100.jsonl'

# Checkpoint whose tokenizer is used for preprocessing.
model_name = 'EleutherAI/pythia-1.4b-deduped'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Padding to a fixed length needs a pad token; fall back to EOS when none is set.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

tokenized_datasets = load_and_preprocess_data(train_file, validation_file, tokenizer)

# Sanity check: show the first five tokenized rows of each split.
print(
    tokenized_datasets['train'][:5],
    tokenized_datasets['validation'][:5],
    sep='\n',
)

# Number of examples per split.
train_size, validation_size = (
    len(tokenized_datasets[name]) for name in ('train', 'validation')
)

In [None]:
from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, IA3Config, TaskType

# Load the base model from the same checkpoint id as the tokenizer (model_name)
# so the two cannot drift apart when the checkpoint is changed in one place only.
model = AutoModelForCausalLM.from_pretrained(model_name)

# IA3 adapter configuration for causal language modelling, in training mode.
ia3_config = IA3Config(
    peft_type="IA3",
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    # Modules that receive IA3 scaling vectors.
    target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
    # Subset of target_modules that IA3Config should treat as feed-forward layers.
    feedforward_modules=["dense_h_to_4h", "dense_4h_to_h"],
    init_ia3_weights=True,
)

# Wrap the base model with the adapter and report how many parameters will train.
model = get_peft_model(model, ia3_config)
model.print_trainable_parameters()

In [None]:
# Hyperparameters for the IA3 fine-tuning run.
training_args = TrainingArguments(
    # Checkpointing: write to ./ia3_results every 25 steps, keep at most 10.
    output_dir='./ia3_results',
    save_strategy="steps",
    save_steps=25,
    save_total_limit=10,
    # Optimisation: 8 examples per device, accumulated over 4 steps per update.
    num_train_epochs=2,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=8e-3,
    fp16=True,
    # Logging and reproducibility: log every 10 steps, no external reporters.
    logging_steps=10,
    report_to="none",
    seed=42,
    # Keep all dataset columns instead of letting the Trainer prune them.
    remove_unused_columns=False,
)

# Trainer over the IA3-wrapped model and the tokenized splits.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)



In [None]:
# Start the fine-tuning run with the trainer built in the previous cell.
trainer.train()