## 라이브러리 임포트 및 모델 선정

In [2]:
import os
# Data paths, built with os.path.join.
# NOTE(review): BASE_DIR is derived from the current working directory, so it
# is only correct when the notebook is launched from the expected folder.
BASE_DIR = os.path.dirname(os.getcwd())  # parent of the cwd (original note: go up from the ml_code folder)
DATA_DIR = os.path.join(BASE_DIR, 'data', 'ML')
os.makedirs(DATA_DIR, exist_ok=True)  # create <BASE_DIR>/data/ML if it does not exist yet
print(BASE_DIR)

c:\final_git\SKN12-FINAL-5TEAM


In [5]:
from sentence_transformers import SentenceTransformer
import torch
import os
import numpy as np
import json
import random
from tqdm import tqdm

# Data paths, built with os.path.join.
# NOTE(review): BASE_DIR is derived from the current working directory, so it
# is only correct when the notebook is launched from the expected folder.
BASE_DIR = os.path.dirname(os.getcwd())  # parent of the cwd (original note: go up from the ml_code folder)
DATA_DIR = os.path.join(BASE_DIR, 'data', 'ML')
os.makedirs(DATA_DIR, exist_ok=True)  # create <BASE_DIR>/data/ML if it does not exist yet
print(BASE_DIR)
print(DATA_DIR)

# Pick the compute device: use the GPU when CUDA is available, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the bi-encoder model and move it to the selected device.
bi_model = SentenceTransformer("BM-K/KoSimCSE-roberta")
bi_model.to(device)

# (Optional) The maximum sequence length can be adjusted as shown below.
bi_model.max_seq_length = 512  # original note says the default is 384 -- TODO confirm for this model

print(f"✅ BiEncoder loaded on: {device}, max_seq_length: {bi_model.max_seq_length}")


c:\final_git\SKN12-FINAL-5TEAM
c:\final_git\SKN12-FINAL-5TEAM\data\ML


No sentence-transformers model found with name BM-K/KoSimCSE-roberta. Creating a new one with mean pooling.


✅ BiEncoder loaded on: cpu, max_seq_length: 512


## json 데이터 불러오기

In [None]:

# Split ratios (train / validation / test) passed to split_data.
TRAIN_RATIO = 0.8
VAL_RATIO = 0.1
TEST_RATIO = 0.1

def split_data(data, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1, *, seed=None):
    """Shuffle ``data`` and split it into train, validation and test lists.

    The samples are copied into a new list before shuffling, so the object
    passed in by the caller is never reordered.

    Args:
        data: Iterable of samples to split.
        train_ratio: Fraction of the samples assigned to the training split.
        val_ratio: Fraction of the samples assigned to the validation split.
        test_ratio: Accepted for interface compatibility only; it is not used
            in the computation. The test split always receives every sample
            left over after the train and validation splits are cut, so no
            sample is dropped by integer truncation.
        seed: Optional seed for a reproducible split. When given, a private
            ``random.Random(seed)`` performs the shuffle and the global
            random state is left untouched. When ``None`` (default), the
            module-level ``random`` generator is used.

    Returns:
        A ``(train_data, val_data, test_data)`` tuple of lists.
    """
    # Work on a copy: shuffling in place would silently reorder the caller's data.
    shuffled = list(data)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)

    total_size = len(shuffled)
    train_size = int(total_size * train_ratio)
    val_size = int(total_size * val_ratio)
    val_end = train_size + val_size

    train_data = shuffled[:train_size]
    val_data = shuffled[train_size:val_end]
    # The remainder goes to the test split (see the note on ``test_ratio``).
    test_data = shuffled[val_end:]

    return train_data, val_data, test_data

def load_qa_data(data, split_name=""):
    """Extract Q&A pairs and scores from one split and dump the split to JSON.

    Args:
        data: Collection of records; each record is read through its
            'question', 'answer' and 'score' keys.
        split_name: Label used in the progress bar, the log line and the
            name of the JSON file that is written.

    Returns:
        A ``(qas, scores)`` tuple: ``qas`` is a list of
        ``(question, answer)`` tuples and ``scores`` is the list of the
        matching 'score' values, both in the order of ``data``.

    Side effects:
        Writes ``data`` unchanged to ``<DATA_DIR>/<split_name>_data.json``
        (UTF-8, non-ASCII characters kept, 4-space indent).
    """
    progress = tqdm(data, desc=f"📦 Loading {split_name}", ncols=100)
    # One pass over the records, keeping each pair next to its score.
    records = [((item['question'], item['answer']), item['score']) for item in progress]
    qas = [pair for pair, _ in records]
    scores = [score for _, score in records]
    print(f"✅ Loaded {len(qas)} Q&A pairs from {split_name} data.")

    out_path = os.path.join(DATA_DIR, f"{split_name}_data.json")
    with open(out_path, 'w', encoding='utf-8') as out_file:
        json.dump(data, out_file, ensure_ascii=False, indent=4)

    return qas, scores

def embed_qas(qas, model, split_name=""):
    """Encode questions and answers separately and join each pair into one row.

    Args:
        qas: Sequence of ``(question, answer)`` 2-tuples.
        model: Object with an ``encode`` method accepting a list of texts and
            the ``convert_to_numpy``, ``batch_size`` and
            ``show_progress_bar`` keyword arguments.
        split_name: Label used in the log line and the progress bar.

    Returns:
        A list with one entry per pair. Each entry is a list holding the
        elements of the question vector followed by the elements of the
        answer vector.
    """
    print(f"🚀 Embedding {split_name}...")

    question_texts = [question for question, _ in qas]
    answer_texts = [answer for _, answer in qas]

    # Both encode calls share the same settings; questions are encoded first.
    encode_options = {
        "convert_to_numpy": True,
        "batch_size": 32,
        "show_progress_bar": True,
    }
    q_vecs = model.encode(question_texts, **encode_options)
    a_vecs = model.encode(answer_texts, **encode_options)

    # Concatenate each question vector with its answer vector.
    paired = tqdm(zip(q_vecs, a_vecs), total=len(qas), desc=f"🔗 Merging {split_name}", ncols=100)
    features = []
    for q_vec, a_vec in paired:
        features.append([*q_vec, *a_vec])

    return features


## question과 answer 각각 임베딩

In [5]:
# Load preprocessed_data.json.
# NOTE(review): INPUT_FILE_PATH is not defined in any cell visible in this
# notebook export -- confirm it is set before this cell runs.
with open(INPUT_FILE_PATH, 'r', encoding='utf-8') as f:
    processed_data = json.load(f)

# Split the data into train / validation / test.
# NOTE(review): split_data shuffles with the random module and no seed is set
# in the visible cells, so the split presumably differs between runs.
train_data, val_data, test_data = split_data(processed_data, TRAIN_RATIO, VAL_RATIO, TEST_RATIO)

print(f"✅ Data split complete - Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")

# Pass the split name explicitly on each call; load_qa_data also writes the
# split to "<split_name>_data.json" under DATA_DIR.
train_qa, y_train = load_qa_data(train_data, split_name="train")
val_qa, y_val = load_qa_data(val_data, split_name="val")
test_qa, y_test = load_qa_data(test_data, split_name="test")

# Embed each split; every feature row is a question vector followed by an answer vector.
X_train = embed_qas(train_qa, bi_model, split_name="train")
X_val = embed_qas(val_qa, bi_model, split_name="val")
X_test = embed_qas(test_qa, bi_model, split_name="test")


✅ Data split complete - Train: 5257, Val: 657, Test: 658


📦 Loading train: 100%|█████████████████████████████████████████████████| 5257/5257 [00:00<?, ?it/s]



✅ Loaded 5257 Q&A pairs from train data.


📦 Loading val: 100%|█████████████████████████████████████████████████████| 657/657 [00:00<?, ?it/s]



✅ Loaded 657 Q&A pairs from val data.


📦 Loading test: 100%|█████████████████████████████████████████| 658/658 [00:00<00:00, 89324.27it/s]

✅ Loaded 658 Q&A pairs from test data.
🚀 Embedding train...



Batches: 100%|██████████| 165/165 [02:08<00:00,  1.28it/s]

Batches: 100%|██████████| 165/165 [23:45<00:00,  8.64s/it]
🔗 Merging train:   0%|                                                    | 0/5257 [00:00<?, ?it/s]
🔗 Merging train: 100%|██████████████████████████████████████| 5257/5257 [00:00<00:00, 10059.59it/s]
🔗 Merging train: 100%|██████████████████████████████████████| 5257/5257 [00:00<00:00, 10059.59it/s]


🚀 Embedding val...


Batches: 100%|██████████| 21/21 [00:17<00:00,  1.19it/s]
Batches:   0%|          | 0/21 [00:00<?, ?it/s]
Batches: 100%|██████████| 21/21 [02:25<00:00,  6.92s/it]
Batches: 100%|██████████| 21/21 [02:25<00:00,  6.92s/it]                    | 0/657 [00:00<?, ?it/s]
🔗 Merging val: 100%|██████████████████████████████████████████| 657/657 [00:00<00:00, 10558.04it/s]
🔗 Merging val: 100%|██████████████████████████████████████████| 657/657 [00:00<00:00, 10558.04it/s]


🚀 Embedding test...


Batches: 100%|██████████| 21/21 [00:18<00:00,  1.14it/s]
Batches:   0%|          | 0/21 [00:00<?, ?it/s]
Batches: 100%|██████████| 21/21 [02:25<00:00,  6.93s/it]
Batches: 100%|██████████| 21/21 [02:25<00:00,  6.93s/it]
🔗 Merging test: 100%|██████████████████████████████████████████| 658/658 [00:00<00:00, 5393.76it/s]
🔗 Merging test: 100%|██████████████████████████████████████████| 658/658 [00:00<00:00, 5393.76it/s]


## array로 변환 후 저장

In [8]:

# Convert the feature lists to numpy arrays.
X_train = np.array(X_train)
X_val = np.array(X_val)
X_test = np.array(X_test)

# Convert the score lists to numpy arrays.
y_train = np.array(y_train)
y_val = np.array(y_val)
y_test = np.array(y_test)

# Save each split as an .npz archive holding the features under "X" and the scores under "y".
np.savez(os.path.join(DATA_DIR, "train_set.npz"), X=X_train, y=y_train)
np.savez(os.path.join(DATA_DIR, "val_set.npz"), X=X_val, y=y_val)
np.savez(os.path.join(DATA_DIR, "test_set.npz"), X=X_test, y=y_test)

## 저장한 array 불러오기

In [9]:
# Reload the saved feature ("X") and score ("y") arrays for every split.
# np.load on an .npz archive returns an NpzFile that keeps the underlying file
# open until it is closed. Reading inside a `with` block closes the archive as
# soon as the arrays are in memory, so no file handle is leaked.
# Note: after each block the name (train / val / test) refers to a closed
# archive; use the X_* / y_* arrays from here on.
with np.load(os.path.join(DATA_DIR, "train_set.npz")) as train:
    X_train = train["X"]
    y_train = train["y"]

with np.load(os.path.join(DATA_DIR, "val_set.npz")) as val:
    X_val = val["X"]
    y_val = val["y"]

with np.load(os.path.join(DATA_DIR, "test_set.npz")) as test:
    X_test = test["X"]
    y_test = test["y"]
