In [1]:
# !pip install transformers
# !pip install peft
# !pip install dataclasses
# !pip install --upgrade wandb

Collecting transformers
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.24.0 (from transformers)
  Downloading huggingface_hub-0.27.0-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting tqdm>=4.27 (from transformers)
  Downloading tqdm-

In [7]:
# import wandb
# print(wandb.__version__)

In [3]:
import wandb
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [2]:
import transformers
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForCausalLM
from transformers.trainer_callback import EarlyStoppingCallback
import torch
from torch.utils.data import Dataset
from peft import LoraConfig, get_peft_model
from dataclasses import dataclass
import json, os, random, logging, math, copy
import numpy as np

IGNORE_INDEX = -100 # 학습 loss 계산에 무시되는 index
os.environ['WANDB_PROJECT'] = 'TEST2.' # wandb project 이름 설정

In [3]:
# random seed 설정 함수
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if use multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)

In [4]:
def load_dataset(directory_path):
    filenames = os.listdir(directory_path)
    datas = []
    for filename in filenames:
        file_path = os.path.join(directory_path, filename)

        # 파일인지 확인
        if os.path.isdir(file_path):  # 디렉토리인 경우 건너뜀
            print(f"Skipping directory: {file_path}")
            continue

        try:
            with open(file_path, 'r', encoding='utf8') as f:
                file_data = json.load(f)
                if isinstance(file_data, list):
                    datas.extend(file_data)
                else:
                    datas.append(file_data)
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
            continue

    print(f"Loading finished: {len(datas)} data points")
    return datas

def data_transform(datas):
    prompt_template = (
        "당신은 대한민국 경상북도 경산시에서 15년 이상 경력을 쌓은 창업 전문 컨설턴트입니다."
        "다음 입력정보는 입점하고자 하는 공실 주변 최근접 3개의 점포들 데이터입니다. "
        "이 데이터를 기반으로 해당 위치에서 성공 가능성이 높은 업종을 평가하고, 공실의 장점과 경쟁력을 분석합니다. "
        "이를 바탕으로 합리적인 근거를 논리적으로 제시하며, 필요한 경우 관련 수치를 명확히 명시합니다. "
        "다음 입력정보에 따라 적절한 분석 결과를 생성하세요.\n"
        "### 입력 정보:\n{features}\n\n### 분석 결과:\n"
    )

    dataset = []
    for data in datas:
        try:
            reasoning_result = json.loads(data.get('reasoning_result', '{}'), strict=False)
        except json.JSONDecodeError as e:
            print(f"Warning: Skipping data point due to JSONDecodeError: {e}")
            print(f"Problematic data: {data.get('reasoning_result', '{}')}")
            continue

        # 'columns'만 가져오기
        columns = reasoning_result.get('features', {}).get('columns', {})
        if not columns:
            print("Warning: Skipping data point due to missing 'columns'")
            continue

        features = json.dumps(columns, ensure_ascii=False, indent=2)
        analysis = reasoning_result.get('analysis', '')

        source = prompt_template.format(features=features)
        target = analysis

        dataset.append(dict(
            source=source,
            target=target
        ))

    print(f"Total data samples: {len(dataset)}")

    return dataset


In [5]:
def preprocess(sources, targets, tokenizer):
    examples = [s + t for s, t in zip(sources, targets)]

    input_ids = tokenizer(text=examples, padding=False, return_attention_mask=False, return_length=False,
                          max_length=tokenizer.model_max_length, truncation=True, verbose=False)["input_ids"]
    labels = copy.deepcopy(input_ids)

    for pieces in input_ids:
        assert not any([math.isnan(piece) or math.isinf(piece) for piece in pieces])

    source_lens = tokenizer(text=sources, padding=False, return_attention_mask=False, return_length=True,
                            max_length=tokenizer.model_max_length, truncation=True, verbose=False)["length"]

    for example_index in range(len(examples)):
        for index in range(source_lens[example_index]):
            labels[example_index][index] = IGNORE_INDEX

    return dict(input_ids=input_ids, labels=labels)


class CustomDataset(Dataset):
    def __init__(self, examples, tokenizer):
        self.tokenizer = tokenizer

        sources = [example['source'] for example in examples]
        targets = [f"{example['target']}{tokenizer.eos_token}" for example in examples]

        logging.warning(msg="tokenizing...")
        data_dict = preprocess(sources=sources, targets=targets, tokenizer=tokenizer)
        logging.warning(msg="tokenizing finished")

        self.input_ids = data_dict["input_ids"]
        self.labels = data_dict["labels"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return dict(input_ids=self.input_ids[idx], labels=self.labels[idx])

@dataclass
class CustomCollator(object):
    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances):
        input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
        input_ids = [torch.tensor(piece) for piece in input_ids]
        labels = [torch.tensor(piece) for piece in labels]

        input_ids = torch.nn.utils.rnn.pad_sequence([i.flip(dims=[-1]) for i in input_ids], batch_first=True, padding_value=self.tokenizer.pad_token_id).flip(dims=[1])
        labels = torch.nn.utils.rnn.pad_sequence([i.flip(dims=[-1]) for i in labels], batch_first=True, padding_value=IGNORE_INDEX).flip(dims=[1])

        return dict(input_ids=input_ids, labels=labels, attention_mask=input_ids.ne(self.tokenizer.pad_token_id))


In [6]:
def get_training_args(args):
    training_args = TrainingArguments(
        output_dir=args['output_dir'],
        evaluation_strategy="steps",
        load_best_model_at_end=False,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        learning_rate=args["learning_rate"],
        weight_decay=args["weight_decay"],
        push_to_hub=False,
        do_train=True,
        num_train_epochs=args['num_epochs'],
        per_device_train_batch_size=args["batch_size"],
        logging_steps=args["logging_steps"],
        gradient_accumulation_steps=args["accumulation_steps"],
        #gradient_checkpointing=True,
        save_strategy="steps",
        save_steps=args["save_steps"],
        warmup_ratio=0.03,
        lr_scheduler_type='constant', #learning rate 고정.
        max_grad_norm=1.0,
        fp16=True,  #이전에는 false 였음. a100 쓸꺼니깐 문제없을듯.
        report_to=args["report_to"],
        run_name=args["run_name"],
    )

    return training_args

def get_lora_args(args):
    peft_config = LoraConfig(
        lora_alpha=128,  # 설정 명시적으로 분리
        lora_dropout=args['lora_dropout'],
        r=args['lora_r'],
        bias=args['bias'],
        task_type="CAUSAL_LM"
    )

    return peft_config

In [7]:
def training(config):
    # model and tokenizer load
    model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=config['pretrained_model_name_or_path'],
                                                 trust_remote_code=config['trust_remote_code'],
                                                 cache_dir=config['cache_dir'],
                                                 local_files_only=config['local_files_only'])

    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=config['pretrained_model_name_or_path'],
                                              trust_remote_code=config['trust_remote_code'],
                                              cache_dir=config['cache_dir'],
                                              local_files_only=config['local_files_only'],
                                              padding_side=config['padding_side'])

    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.model_max_length = config['max_token_length']

    # LoRA 적용
    lora_config = get_lora_args(config['lora_args'])
    model = get_peft_model(model, lora_config)

    # org dataset load
    train_dataset = load_dataset(config['train_data_path'])
    train_dataset = data_transform(train_dataset)

    # Ensure the directory exists
    os.makedirs(config['train_data_path'], exist_ok=True)
    train_dataset = load_dataset(config['train_data_path'])
    train_dataset = data_transform(train_dataset)


    # prepare train dataset
    train_dataset = CustomDataset(examples=train_dataset, tokenizer=tokenizer)
    data_collator = CustomCollator(tokenizer=tokenizer)

    # prepare training model
    training_args = get_training_args(config['training_args'])

    train_size = int(0.8 * len(train_dataset))
    eval_size = len(train_dataset) - train_size
    train_dataset, eval_dataset = torch.utils.data.random_split(train_dataset, [train_size, eval_size])

    trainer = Trainer(model=model,
                      tokenizer=tokenizer,
                      args=training_args,
                      train_dataset=train_dataset,
                      eval_dataset=eval_dataset,
                      data_collator=data_collator,
                      #callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.1)])
                      )

    # 학습 수행
    trainer.train()

    # 맨 마지막 - 학습 종료 이후 저장하는 부분
    trainer.save_state()
    trainer.save_model(output_dir=os.path.join(config['output_dir'],"final"))


In [None]:
# workspace/LoRA2/inputs2

#root_dir = os.path.abspath("LoRA2")
root_dir = "/workspace/LoRA2"
input_dir = os.path.join(root_dir, "inputs2")

# 반드시 경로 알잘딱 바꿔주기
model_name = "EleutherAI/polyglot-ko-1.3b" # beomi/llama-2-ko-7b , EleutherAI/polyglot-ko-1.3b
output_dir = os.path.join(root_dir, "outputs2", model_name.split("/")[1], "test")
os.makedirs(output_dir, exist_ok=True)

cache_dir = os.path.join(root_dir, 'cache2')

set_seed(seed=42)

config = {
    "training_args":{
        "output_dir": output_dir,
        "learning_rate": 2e-5,
        "weight_decay": 0.001,
        "batch_size": 8,
        "accumulation_steps": 30,
        "logging_steps": 1,
        "save_steps": 25,
        "num_epochs": 25,
        "report_to": "wandb",
        "run_name": "session_1228"
    },
    "lora_args": {
        "lora_r": 128,
        "lora_dropout": 0.05,
        "bias": "none"
    },


    "pretrained_model_name_or_path": model_name,
    "trust_remote_code": True,
    "cache_dir": cache_dir,
    "local_files_only": False,
    "padding_side": "left",
    "max_token_length": 4096,

    "train_data_path": input_dir,
    "output_dir": output_dir

}

training(config)

config.json:   0%|          | 0.00/640 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/31.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/748M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/164 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Loading finished: 931 data points
Problematic data: {
    "features": {
        "group_id": 457,
        "columns": {
            "매출등급": [6, 1, 2],
            "대분류업종": ['도매업', '소매업', '기타'],
            "대분류업종코드": [7, 6, 0],
            "num_of_company(near 3km)": [129, 129, 128],
            "num_of_large(near 1km)": [2, 2, 2],
            "num_of_bus_stop(near 500m)": [7, 6, 6],
            "num_of_hospital(near 1km)": [8, 9, 8],
            "num_of_theather(near 1km)": [6, 6, 6],
            "num_of_camp(near 3km)": [1, 1, 1],
            "num_of_school(near 500m)": [1, 1, 1],
            "nearest_subway_name": ['정평', '정평', '정평'],
            "nearest_subway_distance": [511.1381476240846, 487.01563044870994, 541.3505049957142],
            "num_of_subway(near 500m)": [0, 1, 0],
            "num_of_gvn_office(near 500m)": [3, 3, 3],
            "parks_within_500m": [2, 2, 2],
            "parking_lots_within_500m": [7, 7, 6],
            "university_within_0m_500m": [0, 0, 0],
     



Problematic data: {
    "features": {
        "group_id": 895,
        "columns": {
            "매출등급": [2, 9, 8],
            "대분류업종": ["음식점", "소매업", "소매업"],
            "대분류업종코드": [1, 6, 6],
            "num_of_company(near 3km)": [192, 192, 192],
            "num_of_large(near 1km)": [1, 1, 1],
            "num_of_bus_stop(near 500m)": [14, 14, 15],
            "num_of_hospital(near 1km)": [0, 0, 0],
            "num_of_theather(near 1km)": [5, 5, 5],
            "num_of_camp(near 3km)": [0, 0, 0],
            "num_of_school(near 500m)": [1, 1, 1],
            "nearest_subway_name": ["영남대", "영남대", "영남대"],
            "nearest_subway_distance": [10500.355582262662, 10501.005421851083, 10467.33538656536],
            "num_of_subway(near 500m)": [0, 0, 0],
            "num_of_gvn_office(near 500m)": [1, 1, 1],
            "parks_within_500m": [0, 0, 0],
            "parking_lots_within_500m": [0, 0, 0],
            "university_within_0m_500m": [0, 0, 0],
            "university_within_

  trainer = Trainer(model=model,
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33m19010247shim[0m ([33m19010247shim-sejong-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# !pwd

In [None]:
# !ls

In [None]:
# cd ..

In [None]:
# !ls

In [None]:
# !pwd

In [None]:
# !rm -rf workspace/

In [None]:
# !ls

In [None]:
# rm -rf no_wandb

In [None]:
# !ls

In [None]:
# cd LoRA1

In [None]:
# !ls

In [None]:
# rm -rf no_use

In [None]:
# !ls

In [None]:
# rm -rf cache

In [None]:
# rm -rf outputs

In [None]:
# 28일까지 베스트가 5*24=120이니깐 그 배수인 8*30=240 으로 해보자, 안되면 딱 깔끔하게 포기!