In [1]:
# !pip install transformers
!pip install peft

Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-1.2.1-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub>=0.25.0 (from peft)
  Downloading huggingface_hub-0.27.0-py3-none-any.whl.metadata (13 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub>=0.25.0->peft)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting sympy==1.13.1 (from torch>=1.13.0->peft)
  Using cached sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Downloading peft-0.14.0-py3-none-any.whl (374 kB)
Downloading accelerate-1.2.1-py3-none-any.whl (336 kB)
Downloading huggingface_hub-0.27.0-py3-none-any.whl (450 kB)
Using cached sympy-1.13.1-py3-none-any.whl (6.2 MB)
Downloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
Installing collected packages: sympy, fsspec, huggingface-hub, accelerate, peft
  Attempting uninstall: sympy
    Found existing installation: sympy 1.13.3
    Uninstalling sympy-1.1

  You can safely remove it manually.


In [22]:
import transformers
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForCausalLM
import torch
from torch.utils.data import Dataset
from peft import LoraConfig, get_peft_model, PeftModel
from dataclasses import dataclass
import json, os, random, logging, math, copy
import numpy as np

In [23]:
def load_dataset(directory_path):
    filenames = os.listdir(directory_path)
    datas = []
    for filename in filenames:
        with open(os.path.join(directory_path, filename), 'r', encoding='utf8') as f:
            datas.append(json.loads(f.read()))

    print(f"loading finished : {len(datas)} datas")
    return datas

In [24]:
def load_dataset(directory_path):
    filenames = os.listdir(directory_path)
    datas = []
    for filename in filenames:
        with open(os.path.join(directory_path, filename), 'r', encoding='utf8') as f:
            file_data = json.load(f)
            if isinstance(file_data, list):
                datas.extend(file_data)
            else:
                datas.append(file_data)

    print(f"loading finished : {len(datas)} datas")
    return datas

def data_transform(datas):
    prompt_template = (
        "당신은 대한민국 경상북도 경산시 부동산 전문가입니다. "
        "다음 입력정보를 보고 판단하여 공실 우선순위에 대한 분석 결과를 생성하세요.\n"
        "### 입력 정보:\n{features}\n\n### 분석 결과:\n"
    )

    dataset = []
    for data in datas:
        try:
            reasoning_result = json.loads(data.get('reasoning_result', '{}'), strict=False)
        except json.JSONDecodeError as e:
            print(f"Warning: Skipping data point due to JSONDecodeError: {e}")
            print(f"Problematic data: {data.get('reasoning_result', '{}')}")
            continue

        columns = reasoning_result.get('features', {}).get('columns', {})
        features = json.dumps(columns, ensure_ascii=False, indent=2)
        analysis = reasoning_result.get('analysis', '')

        source = prompt_template.format(features=features)
        target = analysis

        dataset.append(dict(
            source=source,
            target=target
        ))

    print(f"Total data samples: {len(dataset)}")

    return dataset

In [25]:
def make_model_input(features):
    prompt = (
        "당신은 대한민국 경상북도 경산시 부동산 전문가입니다. "
        "다음 입력정보를 보고 판단하여 공실 우선순위에 대한 분석 결과를 생성하세요.\n"
        "### 입력 정보:\n{features}\n\n### 분석 결과:\n"
    )

    return prompt.format_map(dict(
        features=features
    ))

In [26]:
def inference_multi_source(model, tokenizer, device, sources):
    inputs = tokenizer(sources, return_tensors="pt", return_attention_mask=True, padding="longest")
    inputs.to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,
            num_beams=5,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            repetition_penalty=1.2
        )
        outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return outputs

In [27]:
def inference_single_source(model, tokenizer, device, source):
    inputs = tokenizer([source], return_tensors="pt", return_token_type_ids=False).to(device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=300,
            num_beams=5,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            repetition_penalty=1.2
        )
        decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
        response_start = len(source)
        response = decoded_output[response_start:].strip()

    return response

In [28]:
#workspace/LoRA1/outputs
#workspace/LoRA1/outputs/polyglot-ko-1.3b/test/final/adapter_config.json

def main(config):
    root_dir = os.path.abspath("/workspace/LoRA1")
    cache_dir = os.path.join(root_dir, 'cache')
    adapter_dir = os.path.join("/workspace/LoRA1", "outputs")


    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}") # 디버깅
    
    lora_adapter_name = "polyglot-ko-1.3b/test"

    lora_adapter_path = os.path.join("/workspace/LoRA1/outputs/polyglot-ko-1.3b/test/final")
    print(f"LoRA Adapter Path: {lora_adapter_path}")

    model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=config['pretrained_model_name_or_path'],
                                                 trust_remote_code=config['trust_remote_code'],
                                                 cache_dir=cache_dir,
                                                 local_files_only=config['local_files_only'])

    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=config['pretrained_model_name_or_path'],
                                              trust_remote_code=config['trust_remote_code'],
                                              cache_dir=cache_dir,
                                              local_files_only=config['local_files_only'],
                                              padding_side=config['padding_side'])

    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.model_max_length = config['max_token_length']

    model = PeftModel.from_pretrained(model, lora_adapter_path)
    model = model.merge_and_unload()

    model.eval()
    model.to(device)

    # workspace/LoRA1/session_data/developer
    dataset = load_dataset("/workspace/LoRA1/session_data/developer")
    dataset = data_transform(dataset)

    for item in dataset[:5]:
        source = item['source']
        target = item['target']
        result = inference_single_source(model, tokenizer, device, source)

        print(f"[Source]\n{source}\n\n[Target]\n{target}\n\n[Response]\n{result}\n")
        print("#" * 100)
        print("\n\n")

config = {
    "pretrained_model_name_or_path": "EleutherAI/polyglot-ko-1.3b",
    "trust_remote_code": True,
    "local_files_only": False,
    "padding_side": "left",
    "max_token_length": 4096,
}
main(config)

Device: cuda
LoRA Adapter Path: /workspace/LoRA1/outputs/polyglot-ko-1.3b/test/final


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

loading finished : 4600 datas
Problematic data: {
    "features": {
        "group_id": 124,
        "columns": {
            "num_of_company": [139, 142, 149],
            "num_of_large": [2, 2, 2],
            "num_of_bus_stop": [11, 12, 13],
            "num_of_hospital": [0, 0, 0],
            "num_of_theather": [0, 0, 0],
            "num_of_camp": [0, 0, 0],
            "num_of_school": [3, 3, 3],
            "nearest_subway_name": ['영남대', '영남대', '영남대'],
            "nearest_subway_distance": [10467.092625317817, 10481.343809099148, 10452.50914080669],
            "num_of_subway": [0, 0, 0],
            "num_of_gvn_office": [0, 0, 1],
            "parks_within_500m": [0, 0, 0],
            "parking_lots_within_500m": [0, 0, 0],
            "industry_category": ['소매업', '소매업', '소매업'],
            "avg_sales_level": [6.0, 5.0, 3.5]
        }
    },
    "analysis": "경산시의 3개의 공실 중에서 경쟁 업체 수와 인프라, 교통 접근성, 대학교 접근성, 매출 등급 등을 종합적으로 고려해야 합니다. 첫 번째 공실은 경쟁 업체 수가 많고 매출 등급이 높은 편이지만, 대학교와 지하철 접

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Problematic data: {
    "features": {
        "group_id": 1830,
        "columns": {
            "num_of_company": [199, 182, 202],
            "num_of_large": [1, 1, 1],
            "num_of_bus_stop": [15, 16, 17],
            "num_of_hospital": [0, 0, 0],
            "num_of_theather": [1, 1, 1],
            "num_of_camp": [0, 0, 0],
            "num_of_school": [1, 1, 2],
            "nearest_subway_name": ['영남대', '영남대', '영남대'],
            "nearest_subway_distance": [10382.046980649531, 10358.80742765456, 10459.980463169948],
            "num_of_subway": [0, 0, 0],
            "num_of_gvn_office": [1, 2, 1],
            "parks_within_500m": [0, 0, 0],
            "parking_lots_within_500m": [0, 0, 0],
            "industry_category": ['교육', '교육', '교육'],
            "avg_sales_level": [1.25, 1.25, 1.2]
        }
    },
    "analysis": "주어진 공실 정보를 종합적으로 분석해보면, 세 공실 모두 교육 업종에 속하고 있으며 주변 인프라 및 교통 접근성 측면에서 큰 차이가 나타나지 않습니다. 하지만 평균 매출 등급과 주변 기업체 수를 비교해보면, 공실 3이 다소 낮은 평균 매출 등급을 보이고 있습니다. 이

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Source]
당신은 대한민국 경상북도 경산시 부동산 전문가입니다. 다음 입력정보를 보고 판단하여 공실 우선순위에 대한 분석 결과를 생성하세요.
### 입력 정보:
{
  "num_of_company": [
    221,
    220,
    217
  ],
  "num_of_large": [
    1,
    1,
    1
  ],
  "num_of_bus_stop": [
    3,
    6,
    5
  ],
  "num_of_hospital": [
    0,
    0,
    0
  ],
  "num_of_theather": [
    1,
    1,
    1
  ],
  "num_of_camp": [
    0,
    0,
    0
  ],
  "num_of_school": [
    1,
    1,
    1
  ],
  "nearest_subway_name": [
    "영남대",
    "영남대",
    "영남대"
  ],
  "nearest_subway_distance": [
    10693.98845059832,
    10684.94092642059,
    10633.144466650036
  ],
  "num_of_subway": [
    0,
    0,
    0
  ],
  "num_of_gvn_office": [
    2,
    2,
    2
  ],
  "parks_within_500m": [
    0,
    0,
    0
  ],
  "parking_lots_within_500m": [
    0,
    0,
    0
  ],
  "industry_category": [
    "의료",
    "의료",
    "의료"
  ],
  "avg_sales_level": [
    5.0,
    4.25,
    4.33
  ]
}

### 분석 결과:


[Target]


[Response]
### 입력 정보:션### 입력 정보:션### 입력 정보:션### 입력 정보:션### 입

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Source]
당신은 대한민국 경상북도 경산시 부동산 전문가입니다. 다음 입력정보를 보고 판단하여 공실 우선순위에 대한 분석 결과를 생성하세요.
### 입력 정보:
{
  "num_of_company": [
    213,
    235,
    235
  ],
  "num_of_large": [
    0,
    1,
    1
  ],
  "num_of_bus_stop": [
    4,
    6,
    7
  ],
  "num_of_hospital": [
    0,
    0,
    0
  ],
  "num_of_theather": [
    1,
    1,
    1
  ],
  "num_of_camp": [
    0,
    0,
    0
  ],
  "num_of_school": [
    0,
    0,
    0
  ],
  "nearest_subway_name": [
    "영남대",
    "영남대",
    "영남대"
  ],
  "nearest_subway_distance": [
    11636.458741876531,
    11472.60454661412,
    11402.545758615828
  ],
  "num_of_subway": [
    0,
    0,
    0
  ],
  "num_of_gvn_office": [
    0,
    0,
    2
  ],
  "parks_within_500m": [
    0,
    0,
    0
  ],
  "parking_lots_within_500m": [
    0,
    0,
    0
  ],
  "industry_category": [
    "음식점",
    "음식점",
    "음식점"
  ],
  "avg_sales_level": [
    3.0,
    2.82,
    2.8
  ]
}

### 분석 결과:


[Target]
분석 결과, 3개 공실 중 최적의 창업 위치는 group_id가 102인 2번째 공실입니다. 이유는 다음과

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Source]
당신은 대한민국 경상북도 경산시 부동산 전문가입니다. 다음 입력정보를 보고 판단하여 공실 우선순위에 대한 분석 결과를 생성하세요.
### 입력 정보:
{
  "num_of_company": [
    213,
    235,
    235
  ],
  "num_of_large": [
    0,
    1,
    1
  ],
  "num_of_bus_stop": [
    4,
    6,
    7
  ],
  "num_of_hospital": [
    0,
    0,
    0
  ],
  "num_of_theather": [
    1,
    1,
    1
  ],
  "num_of_camp": [
    0,
    0,
    0
  ],
  "num_of_school": [
    0,
    0,
    0
  ],
  "nearest_subway_name": [
    "영남대",
    "영남대",
    "영남대"
  ],
  "nearest_subway_distance": [
    11636.458741876531,
    11472.60454661412,
    11402.545758615828
  ],
  "num_of_subway": [
    0,
    0,
    0
  ],
  "num_of_gvn_office": [
    0,
    0,
    2
  ],
  "parks_within_500m": [
    0,
    0,
    0
  ],
  "parking_lots_within_500m": [
    0,
    0,
    0
  ],
  "industry_category": [
    "소매업",
    "소매업",
    "소매업"
  ],
  "avg_sales_level": [
    4.6,
    4.75,
    4.76
  ]
}

### 분석 결과:


[Target]


[Response]
### 입력 정보:션### 입력 정보:션### 입력 정보:션### 입력 정보:션#

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[Source]
당신은 대한민국 경상북도 경산시 부동산 전문가입니다. 다음 입력정보를 보고 판단하여 공실 우선순위에 대한 분석 결과를 생성하세요.
### 입력 정보:
{
  "num_of_company": [
    213,
    235,
    235
  ],
  "num_of_large": [
    0,
    1,
    1
  ],
  "num_of_bus_stop": [
    4,
    6,
    7
  ],
  "num_of_hospital": [
    0,
    0,
    0
  ],
  "num_of_theather": [
    1,
    1,
    1
  ],
  "num_of_camp": [
    0,
    0,
    0
  ],
  "num_of_school": [
    0,
    0,
    0
  ],
  "nearest_subway_name": [
    "영남대",
    "영남대",
    "영남대"
  ],
  "nearest_subway_distance": [
    11636.458741876531,
    11472.60454661412,
    11402.545758615828
  ],
  "num_of_subway": [
    0,
    0,
    0
  ],
  "num_of_gvn_office": [
    0,
    0,
    2
  ],
  "parks_within_500m": [
    0,
    0,
    0
  ],
  "parking_lots_within_500m": [
    0,
    0,
    0
  ],
  "industry_category": [
    "도매업",
    "도매업",
    "도매업"
  ],
  "avg_sales_level": [
    4.0,
    1.5,
    3.25
  ]
}

### 분석 결과:


[Target]
주어진 정보를 종합적으로 분석해보면, 성공적인 창업을 위해서는 다음 요인을 고려해야 합니다. 첫째로, 평균

In [29]:
#workspace/LoRA1/outputs/polyglot-ko-1.3b/test/final/adapter_config.json

In [38]:
!nvidia-smi

Wed Dec 25 09:09:12 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          On  |   00000000:81:00.0 Off |                    0 |
| N/A   35C    P0             61W /  300W |   81139MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                