In [None]:
import json
import pandas as pd
import torch
from datasets import Dataset, load_dataset
#from huggingface_hub import notebook_login
from peft import LoraConfig, PeftModel
from peft import LoraConfig, get_peft_model

device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda:1


In [3]:
lora_config = LoraConfig(
    r=64,
    lora_alpha=32,
    target_modules=['o_proj', 'q_proj', 'up_proj', 'v_proj', 'k_proj', 'down_proj', 'gate_proj'],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
) 
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)

In [11]:
import pandas as pd
import pyarrow as pa
from datasets import Dataset, DatasetDict, Value, Features

# 1. CSV 파일 불러오기
df = pd.read_csv("extracted_documents.csv")
df = df.fillna("")

# 2. 필요한 컬럼만 추출하고 이름 변경
df['input_ids'] = df['original_text']
df['labels']  = df['summary_text']
df = df[['input_ids', 'labels']]

# 3. 문자열 길이 확인 (디버깅용)
print(f"최대 input_ids 길이: {df['input_ids'].str.len().max()}")
print(f"최대 labels 길이: {df['labels'].str.len().max()}")

# 4. PyArrow Table로 변환 (large_string 적용)
schema = pa.schema([
    ('input_ids', pa.large_string()),
    ('labels', pa.large_string())
])
table = pa.Table.from_pandas(df, schema=schema)



# 5. Dataset으로 변환
dataset = Dataset.from_dict({col: table[col].to_pandas() for col in table.schema.names})

# 6. train/validation/test 데이터셋 분할 (8:1:1 비율 예시)
#   - train_test_split() 함수를 두 번 호출해서
#     먼저 train: 80%, test: 20% 분리하고,
#     그 뒤 test 부분을 다시 50%씩 나눠 validation: 10%, test: 10% 로 분할
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
temp_dataset  = split_dataset['test'].train_test_split(test_size=0.5, seed=42)

final_dataset = DatasetDict({
    'train': split_dataset['train'],
    'validation': temp_dataset['train'],
    'test': temp_dataset['test']
})

final_dataset = final_dataset.cast_column("input_ids", Value("large_string"))
final_dataset = final_dataset.cast_column("labels", Value("large_string"))

# # 스키마와 첫 번째 예시 확인
# for split in final_dataset.keys():
#     print(f"[{split} 데이터셋] 스키마:")
#     print(final_dataset[split].features)
#     print(f"[{split} 데이터셋] 첫 번째 예시:")
#     print(final_dataset[split][0])
# 7. 결과 확인
print(final_dataset)

최대 input_ids 길이: 3993
최대 labels 길이: 1000


Casting the dataset: 100%|██████████| 19463/19463 [00:00<00:00, 42270.07 examples/s]
Casting the dataset: 100%|██████████| 2433/2433 [00:00<00:00, 40381.55 examples/s]
Casting the dataset: 100%|██████████| 2433/2433 [00:00<00:00, 41424.43 examples/s]
Casting the dataset: 100%|██████████| 19463/19463 [00:00<00:00, 667645.96 examples/s]
Casting the dataset: 100%|██████████| 2433/2433 [00:00<00:00, 867455.09 examples/s]
Casting the dataset: 100%|██████████| 2433/2433 [00:00<00:00, 863564.49 examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 19463
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 2433
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 2433
    })
})





In [10]:
from trl import SFTTrainer

#notebook_login()

#model_id = "google/gemma-7b-it"
# model_id = "google/gemma-7b"
model_id = "google/gemma-2b-it"
#model_id = "google/gemma-2b"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

#model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=lora_config, device_map={"":0})
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":1})
#model = AutoModelForCausalLM.from_pretrained(model_id, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)
{
    "instruction": "Create a function to calculate the sum of a sequence of integers.",
    "input":"[1, 2, 3, 4, 5]",
    "output": "# Python code def sum_sequence(sequence): sum = 0 for num in sequence: sum += num return sum"
}

dataset = read_csv("extracted_documents.csv", split="train")

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.34it/s]


NameError: name 'read_csv' is not defined

In [12]:
def generate_prompt(data_point):
    """Gen. input text based on a prompt, task instruction, (context info.), and answer

    :param data_point: dict: Data point
    :return: dict: tokenzed prompt
    """
    prefix_text = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\\n\\n'
    # Samples with additional context into.
    if data_point['input']:
        text = f"""<start_of_turn>user {prefix_text} {data_point["instruction"]} here are the inputs {data_point["input"]} <end_of_turn>\\n<start_of_turn>model{data_point["output"]} <end_of_turn>"""
    # Without
    else:
        text = f"""<start_of_turn>user {prefix_text} {data_point["instruction"]} <end_of_turn>\\n<start_of_turn>model{data_point["output"]} <end_of_turn>"""
    return text


In [None]:
# add the "prompt" column in the dataset
text_column = [generate_prompt(data_point) for data_point in dataset]
dataset = dataset.add_column("prompt", text_column)
dataset = dataset.shuffle(seed=1234)  # Shuffle dataset here
dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

dataset = dataset.train_test_split(test_size=0.2)
train_data = dataset["train"]
test_data = dataset["test"]

In [None]:

model = get_peft_model(model, lora_config)
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

import transformers

from trl import SFTTrainer
train_data=final_dataset['train']
val_data=final_dataset['validation']

tokenizer.pad_token = tokenizer.eos_token
torch.cuda.empty_cache()
trainer = SFTTrainer(
    model=model.to(device),
    train_dataset=train_data,
	test_dataset=val_data,
    dataset_text_field="prompt",
    peft_config=lora_config,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=0,
        max_steps=100,
        learning_rate=2e-4,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        save_strategy="epoch",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)


Trainable: 78446592 | total: 2584619008 | Percentage: 3.0351%


TypeError: SFTTrainer.__init__() got an unexpected keyword argument 'dataset_text_field'

: 

In [None]:

# Start the training process
trainer.train()

new_model = "gemma-2b-it-finetune" #Name of the model you will be pushing to huggingface model hub
# Save the fine-tuned model
trainer.model.save_pretrained(new_model)

# Merge the model with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 1},
)
merged_model= PeftModel.from_pretrained(base_model, new_model)
merged_model= merged_model.merge_and_unload()

# Save the merged model
merged_model.save_pretrained("merged_model",safe_serialization=True)
tokenizer.save_pretrained("merged_model")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


def get_completion(query: str, model, tokenizer) -> str:
  device = "cuda:1"
  prompt_template = """
  <start_of_turn>user
  Below is an instruction that describes a task. Write a response that appropriately completes the request.
  {query}
  <end_of_turn>\\n<start_of_turn>model
  
  """
  prompt = prompt_template.format(query=query)
  encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
  model_inputs = encodeds.to(device)
  generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True, pad_token_id=tokenizer.eos_token_id)
  # decoded = tokenizer.batch_decode(generated_ids)
  decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
  return (decoded)

result = get_completion(query="code the fibonacci series in python using reccursion", model=merged_model, tokenizer=tokenizer)
print(result)


# Push the model and tokenizer to the Hugging Face Model Hub
# merged_model.push_to_hub(new_model, use_temp_dir=False)
# tokenizer.push_to_hub(new_model, use_temp_dir=False)