In [1]:
# !pip install transformers
# !pip install peft
# !pip install dataclasses
# !pip install --upgrade wandb

In [2]:
# pip uninstall -y wandb

In [3]:
# pip install wandb

In [4]:
# import wandb
# print(wandb.__version__)

In [5]:
import wandb
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33m19010247shim[0m ([33m19010247shim-sejong-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [6]:
import transformers
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForCausalLM
from transformers.trainer_callback import EarlyStoppingCallback
import torch
from torch.utils.data import Dataset
from peft import LoraConfig, get_peft_model
from dataclasses import dataclass
import json, os, random, logging, math, copy
import numpy as np

IGNORE_INDEX = -100 # 학습 loss 계산에 무시되는 index
os.environ['WANDB_PROJECT'] = 'TEST' # wandb project 이름 설정

In [7]:
# random seed 설정 함수
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if use multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)

In [8]:
def load_dataset(directory_path):
    filenames = os.listdir(directory_path)
    datas = []
    for filename in filenames:
        with open(os.path.join(directory_path, filename), 'r', encoding='utf8') as f:
            file_data = json.load(f)
            if isinstance(file_data, list):
                datas.extend(file_data)
            else:
                datas.append(file_data)

    print(f"loading finished : {len(datas)} datas")
    return datas

def data_transform(datas):
    prompt_template = (
        "당신은 대한민국 경상북도 경산시 부동산 전문가입니다. "
        "다음 입력정보에 따라 적절한 공실 순위와, 분석을 생성하세요.\n"
        "### 입력 정보:\n{features}\n\n### 분석 결과:\n"
    )

    dataset = []
    for data in datas:
        try:
            reasoning_result = json.loads(data.get('reasoning_result', '{}'), strict=False)
        except json.JSONDecodeError as e:
            print(f"Warning: Skipping data point due to JSONDecodeError: {e}")
            print(f"Problematic data: {data.get('reasoning_result', '{}')}")
            continue

        # 'columns'만 가져오기
        columns = reasoning_result.get('features', {}).get('columns', {})
        if not columns:
            print("Warning: Skipping data point due to missing 'columns'")
            continue

        features = json.dumps(columns, ensure_ascii=False, indent=2)
        analysis = reasoning_result.get('analysis', '')

        source = prompt_template.format(features=features)
        target = analysis

        dataset.append(dict(
            source=source,
            target=target
        ))

    print(f"Total data samples: {len(dataset)}")

    return dataset


In [9]:
def preprocess(sources, targets, tokenizer):
    examples = [s + t for s, t in zip(sources, targets)]

    input_ids = tokenizer(text=examples, padding=False, return_attention_mask=False, return_length=False,
                          max_length=tokenizer.model_max_length, truncation=True, verbose=False)["input_ids"]
    labels = copy.deepcopy(input_ids)

    for pieces in input_ids:
        assert not any([math.isnan(piece) or math.isinf(piece) for piece in pieces])

    source_lens = tokenizer(text=sources, padding=False, return_attention_mask=False, return_length=True,
                            max_length=tokenizer.model_max_length, truncation=True, verbose=False)["length"]

    for example_index in range(len(examples)):
        for index in range(source_lens[example_index]):
            labels[example_index][index] = IGNORE_INDEX

    return dict(input_ids=input_ids, labels=labels)


class CustomDataset(Dataset):
    def __init__(self, examples, tokenizer):
        self.tokenizer = tokenizer

        sources = [example['source'] for example in examples]
        targets = [f"{example['target']}{tokenizer.eos_token}" for example in examples]

        logging.warning(msg="tokenizing...")
        data_dict = preprocess(sources=sources, targets=targets, tokenizer=tokenizer)
        logging.warning(msg="tokenizing finished")

        self.input_ids = data_dict["input_ids"]
        self.labels = data_dict["labels"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return dict(input_ids=self.input_ids[idx], labels=self.labels[idx])

@dataclass
class CustomCollator(object):
    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances):
        input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
        input_ids = [torch.tensor(piece) for piece in input_ids]
        labels = [torch.tensor(piece) for piece in labels]

        input_ids = torch.nn.utils.rnn.pad_sequence([i.flip(dims=[-1]) for i in input_ids], batch_first=True, padding_value=self.tokenizer.pad_token_id).flip(dims=[1])
        labels = torch.nn.utils.rnn.pad_sequence([i.flip(dims=[-1]) for i in labels], batch_first=True, padding_value=IGNORE_INDEX).flip(dims=[1])

        return dict(input_ids=input_ids, labels=labels, attention_mask=input_ids.ne(self.tokenizer.pad_token_id))


In [10]:
def get_training_args(args):
    training_args = TrainingArguments(
        output_dir=args['output_dir'],
        evaluation_strategy="steps",
        load_best_model_at_end=False,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        learning_rate=args["learning_rate"],
        weight_decay=args["weight_decay"],
        push_to_hub=False,
        do_train=True,
        num_train_epochs=args['num_epochs'],
        per_device_train_batch_size=args["batch_size"],
        logging_steps=args["logging_steps"],
        gradient_accumulation_steps=args["accumulation_steps"],
        save_strategy="steps",
        save_steps=args["save_steps"],
        warmup_ratio=0.03,
        lr_scheduler_type='constant', #learning rate 고정.
        max_grad_norm=1.0,
        fp16=True,  #이전에는 false 였음. a100 쓸꺼니깐 문제없을듯.
        report_to=args["report_to"],
        run_name=args["run_name"],
    )

    return training_args

def get_lora_args(args):
    peft_config = LoraConfig(
        lora_alpha=128,  # 설정 명시적으로 분리
        lora_dropout=args['lora_dropout'],
        r=args['lora_r'],
        bias=args['bias'],
        task_type="CAUSAL_LM"
    )

    return peft_config

In [11]:
def training(config):
    # model and tokenizer load
    model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=config['pretrained_model_name_or_path'],
                                                 trust_remote_code=config['trust_remote_code'],
                                                 cache_dir=config['cache_dir'],
                                                 local_files_only=config['local_files_only'])

    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=config['pretrained_model_name_or_path'],
                                              trust_remote_code=config['trust_remote_code'],
                                              cache_dir=config['cache_dir'],
                                              local_files_only=config['local_files_only'],
                                              padding_side=config['padding_side'])

    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.model_max_length = config['max_token_length']

    # LoRA 적용
    lora_config = get_lora_args(config['lora_args'])
    model = get_peft_model(model, lora_config)

    # org dataset load
    train_dataset = load_dataset(config['train_data_path'])
    train_dataset = data_transform(train_dataset)

    # Ensure the directory exists
    os.makedirs(config['train_data_path'], exist_ok=True)
    train_dataset = load_dataset(config['train_data_path'])
    train_dataset = data_transform(train_dataset)


    # prepare train dataset
    train_dataset = CustomDataset(examples=train_dataset, tokenizer=tokenizer)
    data_collator = CustomCollator(tokenizer=tokenizer)

    # prepare training model
    training_args = get_training_args(config['training_args'])

    train_size = int(0.8 * len(train_dataset))
    eval_size = len(train_dataset) - train_size
    train_dataset, eval_dataset = torch.utils.data.random_split(train_dataset, [train_size, eval_size])

    trainer = Trainer(model=model,
                      tokenizer=tokenizer,
                      args=training_args,
                      train_dataset=train_dataset,
                      eval_dataset=eval_dataset,
                      data_collator=data_collator,
                      #callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.1)])
                      )

    # 학습 수행
    trainer.train()

    # 맨 마지막 - 학습 종료 이후 저장하는 부분
    trainer.save_state()
    trainer.save_model(output_dir=os.path.join(config['output_dir'],"final"))


In [12]:
# workspace/LoRA1/inputs1

#root_dir = os.path.abspath("LoRA1")
root_dir = "/workspace/LoRA1"
input_dir = os.path.join(root_dir, "inputs1")

# 반드시 경로 알잘딱 바꿔주기
model_name = "EleutherAI/polyglot-ko-1.3b" # beomi/llama-2-ko-7b , EleutherAI/polyglot-ko-1.3b
output_dir = os.path.join(root_dir, "outputs", model_name.split("/")[1], "test")
os.makedirs(output_dir, exist_ok=True)

cache_dir = os.path.join(root_dir, 'cache')

set_seed(seed=42)

config = {
    "training_args":{
        "output_dir": output_dir,
        "learning_rate": 2e-5,
        "weight_decay": 0.001,
        "batch_size": 6,
        "accumulation_steps": 32,
        "logging_steps": 1,
        "save_steps": 50,
        "num_epochs": 25,
        "report_to": "wandb",
        "run_name": "session_1227"
    },
    "lora_args": {
        "lora_r": 128,
        "lora_dropout": 0.05,
        "bias": "none"
    },


    "pretrained_model_name_or_path": model_name,
    "trust_remote_code": True,
    "cache_dir": cache_dir,
    "local_files_only": False,
    "padding_side": "left",
    "max_token_length": 4096,

    "train_data_path": input_dir,
    "output_dir": output_dir

}

training(config)

config.json:   0%|          | 0.00/640 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/31.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/748M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/164 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

loading finished : 4600 datas
Problematic data: {
    "features": {
        "group_id": 124,
        "columns": {
            "num_of_company": [139, 142, 149],
            "num_of_large": [2, 2, 2],
            "num_of_bus_stop": [11, 12, 13],
            "num_of_hospital": [0, 0, 0],
            "num_of_theather": [0, 0, 0],
            "num_of_camp": [0, 0, 0],
            "num_of_school": [3, 3, 3],
            "nearest_subway_name": ['영남대', '영남대', '영남대'],
            "nearest_subway_distance": [10467.092625317817, 10481.343809099148, 10452.50914080669],
            "num_of_subway": [0, 0, 0],
            "num_of_gvn_office": [0, 0, 1],
            "parks_within_500m": [0, 0, 0],
            "parking_lots_within_500m": [0, 0, 0],
            "industry_category": ['소매업', '소매업', '소매업'],
            "avg_sales_level": [6.0, 5.0, 3.5]
        }
    },
    "analysis": "경산시의 3개의 공실 중에서 경쟁 업체 수와 인프라, 교통 접근성, 대학교 접근성, 매출 등급 등을 종합적으로 고려해야 합니다. 첫 번째 공실은 경쟁 업체 수가 많고 매출 등급이 높은 편이지만, 대학교와 지하철 접



Problematic data: {
    "features": {
        "group_id": 4003,
        "columns": {
            "num_of_company": [127, 128, 128],
            "num_of_large": [1, 1, 1],
            "num_of_bus_stop": [14, 13, 13],
            "num_of_hospital": [6, 9, 9],
            "num_of_theather": [0, 0, 0],
            "num_of_camp": [0, 0, 0],
            "num_of_school": [2, 3, 4],
            "nearest_subway_name": ['정평', '정평', '정평'],
            "nearest_subway_distance": [1238.4178088494273, 1224.868828666845, 1236.7192466366105],
            "num_of_subway": [0, 0, 0],
            "num_of_gvn_office": [1, 1, 1],
            "parks_within_500m": [1, 2, 2],
            "parking_lots_within_500m": [5, 5, 5],
            "industry_category": ['의료', '의료', '의료'],
            "avg_sales_level": [4.17, 3.56, 3.43]
        }
    },
    "analysis": "가장 알맞은 공실을 선정하기 위해 다양한 요소를 고려했습니다. 먼저, 평균 매출 등급을 살펴보면 4.17, 3.56, 3.43으로 높은 수치를 가진 공실이 성공할 가능성이 크다고 볼 수 있습니다. 인근 인프라와 교통 접근성 측면에서는 공진 1호실이 가장 우수한 조건을 갖

  trainer = Trainer(model=model,


Step,Training Loss,Validation Loss
1,81.9173,2.539487
2,82.7282,2.520751
3,80.0806,2.503021
4,83.1529,2.48559
5,79.0429,2.468714
6,79.3241,2.451651
7,77.7463,2.434202
8,79.0985,2.417995
9,78.5396,2.401903
10,76.748,2.386069


config.json:   0%|          | 0.00/640 [00:00<?, ?B/s]

In [13]:
# 1. batch_size:8, accumulation_step:32, fp16 = False
# 2. batch_size:4, accumlation_step:64, fp16 = True
# 3. batich_size:6, accumulation_step:32, fp16 = True

In [14]:
# !pwd

In [15]:
# !ls

In [16]:
# cd ..

In [17]:
# !ls

In [18]:
# !pwd

In [19]:
# !rm -rf workspace/

In [20]:
# !ls

In [21]:
# rm -rf no_wandb

In [22]:
# !ls

In [23]:
# cd LoRA1

In [24]:
# !ls

In [25]:
# rm -rf no_use

In [26]:
# !ls

In [27]:
# rm -rf cache

In [28]:
# rm -rf outputs

In [29]:
# !ls