In [1]:

import json
import os
import sys
sys.path.append(os.path.abspath('/home/dslabra5/EAP-IG/Circuit_LoRa'))
from collections import defaultdict
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer
)

import random
import numpy as np
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU, plus all CUDA devices when CUDA is available), then switches
    cuDNN to deterministic mode with autotuning disabled.

    Args:
        seed: Integer seed applied to every generator. Defaults to 42.
    """
    # Same seed, same order: Python -> NumPy -> PyTorch (CPU).
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Ask cuDNN for deterministic kernels and turn off its benchmark autotuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Pin the visible GPU *before* anything queries CUDA. set_seed() calls
# torch.cuda.is_available(), and the CUDA runtime reads CUDA_VISIBLE_DEVICES
# when it is first initialised, so assigning the variable afterwards may be
# silently ignored.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
set_seed(42)

## 数据集

In [None]:
def load_and_preprocess_data(train_file, validation_file, tokenizer, max_length=32):
    """Load JSON train/validation files and tokenize them for causal-LM fine-tuning.

    Every record must provide an ``input`` and an ``output`` field. The text fed
    to the model is the input, a newline, then ``str(output)``; it is truncated
    and padded to ``max_length`` tokens.

    ``labels`` mirror ``input_ids`` except that the following positions are set
    to ``-100`` so they do not contribute to the loss:

    * the prompt tokens (the input plus the trailing newline), and
    * every padding position, identified through the attention mask. This
      keeps the loss from being dominated by pad tokens and makes the masking
      independent of the tokenizer's padding side.

    Note that no EOS token is appended to the target, so with padding masked
    the model is only trained on the answer tokens themselves.

    Args:
        train_file: Path to the training JSON/JSONL file.
        validation_file: Path to the validation JSON/JSONL file.
        tokenizer: Tokenizer callable on a list of strings; it must have a pad
            token configured because ``padding='max_length'`` is used.
        max_length: Fixed sequence length used for truncation and padding.
            Defaults to 32, the value that was previously hard-coded.

    Returns:
        The result of ``dataset.map(...)``: the loaded splits with
        ``input_ids``, ``attention_mask`` and ``labels`` columns added.
    """
    data_files = {
        'train': train_file,
        'validation': validation_file,
    }
    dataset = load_dataset('json', data_files=data_files)

    def preprocess_function(examples):
        """Tokenize one batch and build prompt- and padding-masked labels."""
        prompts = [f"{inp}\n" for inp in examples['input']]
        full_texts = [
            prompt + str(out)
            for prompt, out in zip(prompts, examples['output'])
        ]

        tokenized_full = tokenizer(
            full_texts,
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_attention_mask=True,
        )

        # Unpadded prompt tokenization, used only to count prompt tokens.
        prompt_ids = tokenizer(
            prompts, truncation=True, max_length=max_length
        )['input_ids']

        labels = []
        for ids, mask, p_ids in zip(
            tokenized_full['input_ids'],
            tokenized_full['attention_mask'],
            prompt_ids,
        ):
            prompt_len = len(p_ids)
            real_seen = 0  # number of non-padding tokens visited so far
            label = []
            for token_id, is_real in zip(ids, mask):
                if not is_real:
                    # Padding never contributes to the loss.
                    label.append(-100)
                    continue
                # The first `prompt_len` real tokens are the prompt.
                label.append(token_id if real_seen >= prompt_len else -100)
                real_seen += 1
            labels.append(label)

        tokenized_full['labels'] = labels
        return tokenized_full

    return dataset.map(preprocess_function, batched=True)

# Fine-tuning data: JSONL files whose records carry 'input' and 'output' fields.
train_file = '/home/dslabra5/EAP-IG/2_arithmetic_operations_100/finetune_pythia_100/finetune_data/train_100.jsonl'
validation_file = '/home/dslabra5/EAP-IG/2_arithmetic_operations_100/finetune_pythia_100/finetune_data/test_100.jsonl'

# Use the same Hub id as the model-loading cell so the tokenizer and the model
# are guaranteed to come from the same checkpoint. The bare name
# 'pythia-1.4b-deduped' only resolves if a local directory with that name
# happens to exist in the working directory.
model_name = 'EleutherAI/pythia-1.4b-deduped'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# padding='max_length' in the preprocessing step needs a pad token; fall back
# to EOS when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

tokenized_datasets = load_and_preprocess_data(train_file, validation_file, tokenizer)

# Sanity check: show the first five tokenized rows of each split.
print(tokenized_datasets['train'][:5])
print(tokenized_datasets['validation'][:5])

train_size = len(tokenized_datasets['train'])
validation_size = len(tokenized_datasets['validation'])

## 加载模型

In [3]:
# Load the pretrained causal language model that will be fine-tuned.
# Note: this reassigns `model_name`, which the tokenizer cell also sets.
model_name = 'EleutherAI/pythia-1.4b-deduped'
model = AutoModelForCausalLM.from_pretrained(model_name)

## 训练参数设置

In [4]:
# Hyper-parameters for the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # --- optimisation ---
    learning_rate=8e-6,
    weight_decay=0.01,
    warmup_steps=50,
    num_train_epochs=2,
    # 8 samples per device x 4 accumulation steps = 32 samples per optimizer step.
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    fp16=True,
    # --- evaluation (disabled while training) ---
    eval_strategy='no',
    load_best_model_at_end=False,
    # --- checkpointing: every 250 steps, keep only the newest checkpoint ---
    output_dir='./results',
    save_strategy="steps",
    save_steps=250,
    save_total_limit=1,
    # --- logging: every 10 steps, no external tracker ---
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",
)

In [5]:
# Wire the model, hyper-parameters, tokenizer and both dataset splits into a
# Trainer. The validation split is registered here, but `training_args` sets
# eval_strategy='no', so it is not evaluated automatically during training.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=tokenized_datasets['validation'],
    train_dataset=tokenized_datasets['train'],
)

In [None]:

# Run the fine-tuning loop configured above; as the last expression of the
# cell, the value returned by train() is displayed as the cell output.
trainer.train()