In [None]:
# ─────────────────────────────────────────────────────────
# 0) Install / Imports
# ─────────────────────────────────────────────────────────
!pip install -q --upgrade transformers peft accelerate datasets torch tqdm

import os, gc, numpy as np, pandas as pd, torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel
from torch.nn.functional import softmax

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.3/506.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 21.0.0 which is incompatible.
pylibcudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 21.0.0 which is incompatible.[0m[31m
[0m

In [None]:
# Mount Google Drive into the Colab filesystem so files under it can be read
# and written with ordinary paths.
from google.colab import drive

mount_point = "/content/drive"
drive.mount(mount_point)

Mounted at /content/drive


In [None]:
# ─────────────────────────────────────────────────────────
# 1) Config (must match the settings used at training time)
# ─────────────────────────────────────────────────────────

# --- Model and file locations ---
# Hugging Face id of the base checkpoint the tokenizer and classifier are loaded from.
BASE_MODEL = "klue/bert-base"
# Directory prefix used by the later cells for the LoRA adapter sub-folder,
# the input CSV and the written submission file (keep the trailing slash:
# paths are built by plain string concatenation).
OUTPUT_DIR = "/content/drive/MyDrive/LikeLion_NLP2/Small_Challenge/"
INPUT_CSV = "test.csv"
OUTPUT_CSV = "submission_LoRA_klue_bert-base.csv"

# --- Sliding-window inference ---
# Maximum window length, in tokens.
MAX_LEN = 512
# Sliding-window stride: number of tokens shared by consecutive windows.
STRIDE = 128
# Number of windows per forward pass (adjust to fit the GPU).
BATCH_SIZE = 64

# --- Post-processing ---
# How per-window probabilities are combined: "mean" | "max".
AGG_METHOD = "mean"
# 0.0 = disabled; e.g. 0.3 blends paragraph score : title mean = 0.7 : 0.3.
TITLE_BLEND = 0.0



In [None]:
# ─────────────────────────────────────────────────────────
# 2) Load tokenizer & model (+ LoRA)
# ─────────────────────────────────────────────────────────
# Tokenizer comes from the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Base classifier with a 2-class head, moved to the selected device.
base = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=2)
base = base.to(device)

# Attach the LoRA adapter stored under OUTPUT_DIR on top of the base model.
adapter_dir = OUTPUT_DIR + "skt_kobert_lora_out/"
model = PeftModel.from_pretrained(base, adapter_dir)
model = model.to(device)

# Inference only: switch to eval mode, then release cached GPU memory.
model.eval()
_ = torch.cuda.empty_cache()
print("Model ready on", device)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model ready on cuda


In [None]:
# ─────────────────────────────────────────────────────────
# 3) Read test.csv (expected columns: ID, title, paragraph_index, paragraph_text)
# ─────────────────────────────────────────────────────────
# Load the test set from the project directory.
test_df = pd.read_csv(OUTPUT_DIR+INPUT_CSV)

# Fail early if the file does not have the columns the later cells rely on.
required_cols = {"ID","title","paragraph_index","paragraph_text"}
missing = required_cols - set(test_df.columns)
if missing:
    raise ValueError(f"test.csv missing columns: {missing}")

# Guard against NaN paragraphs. The fill must happen BEFORE the cast:
# astype(str) turns NaN into the literal text "nan", after which fillna("")
# has nothing left to fill and the model would score the word "nan".
test_df["paragraph_text"] = test_df["paragraph_text"].fillna("").astype(str)
print(test_df.head(3))



          ID           title  paragraph_index  \
0  TEST_0000  공중 도덕의 의의와 필요성                0   
1  TEST_0001  공중 도덕의 의의와 필요성                1   
2  TEST_0002  공중 도덕의 의의와 필요성                2   

                                      paragraph_text  
0  도덕이란 원래 개인의 자각에서 출발해 자기 의지로써 행동하는 일이다. 그러므로 도덕...  
1  도덕은 단순히 개인의 문제나 사회의 문제로 한정될 수 없다. 개인적인 측면과 사회적...  
2  여기에 이른바 공중도덕은 실천적, 사회적 도덕의 한 부문이다. 즉, 공중 도덕이라 ...  


In [None]:
# ─────────────────────────────────────────────────────────
# 4) Sliding-window inference for a single paragraph
# ─────────────────────────────────────────────────────────
def _split_into_windows(ids, window, stride):
    """Split ``ids`` into overlapping slices of at most ``window`` items.

    Consecutive slices share ``stride`` items. An input that already fits in
    one window (including an empty one) yields exactly one slice.

    Raises:
        ValueError: if ``window`` is not positive, or ``stride`` is negative or
            not smaller than ``window`` (the start index would never advance).
    """
    if window <= 0:
        raise ValueError(f"window must be positive, got {window}")
    if not 0 <= stride < window:
        raise ValueError(f"stride must satisfy 0 <= stride < {window}, got {stride}")

    if len(ids) <= window:
        return [list(ids)]

    step = window - stride  # how far the window start advances each time
    windows = []
    start = 0
    while True:
        end = min(start + window, len(ids))
        windows.append(list(ids[start:end]))
        if end == len(ids):
            break
        start += step
    return windows


@torch.no_grad()
def predict_paragraph(text: str, max_len=MAX_LEN, stride=STRIDE, agg=AGG_METHOD):
    """Return the class-1 probability for one paragraph using sliding windows.

    The text is tokenized once without special tokens and split into windows,
    and each window is wrapped in its own CLS/SEP pair so that every window is
    a well-formed single-sequence input of at most ``max_len`` tokens. A short
    paragraph simply produces one window.

    Args:
        text: Paragraph to score.
        max_len: Maximum number of tokens per window, special tokens included.
        stride: Number of content tokens shared by consecutive windows.
        agg: "max" takes the maximum window probability; any other value
            takes the mean.

    Returns:
        The aggregated softmax probability of class index 1 as a Python float.

    Raises:
        ValueError: if ``max_len`` leaves no room for content tokens or
            ``stride`` is not in ``[0, max_len - 2)``.
    """
    # Tokenize WITHOUT special tokens: slicing ids that already contain them
    # would leave every window after the first without a leading CLS token,
    # which is the position a BERT-style classification head reads from.
    content_ids = tokenizer(text, add_special_tokens=False)["input_ids"]

    # Reserve two positions per window for the CLS/SEP pair added below.
    cls_id, sep_id = tokenizer.cls_token_id, tokenizer.sep_token_id
    windows = [
        [cls_id] + chunk + [sep_id]
        for chunk in _split_into_windows(content_ids, max_len - 2, stride)
    ]

    # Batched inference over the windows.
    probs = []
    for i in range(0, len(windows), BATCH_SIZE):
        batch = windows[i:i+BATCH_SIZE]
        inputs = {"input_ids": batch, "attention_mask": [[1]*len(x) for x in batch]}
        # Pad to the longest window in the batch and move tensors to the device.
        inputs = tokenizer.pad(inputs, return_tensors="pt").to(device)
        logits = model(**inputs).logits
        probs.extend(softmax(logits, dim=1)[:, 1].cpu().tolist())

    return float(np.max(probs) if agg == "max" else np.mean(probs))



In [None]:
# ─────────────────────────────────────────────────────────
# 5) Predict all rows
# ─────────────────────────────────────────────────────────
# Score each paragraph in file order; tqdm shows progress over the rows.
probs = [
    predict_paragraph(paragraph)
    for paragraph in tqdm(test_df["paragraph_text"].tolist(), desc="Predicting")
]

# Store the per-row probability (0 to 1) alongside the inputs.
test_df["prob_ai"] = probs



Predicting:   0%|          | 0/1962 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (538 > 512). Running this sequence through the model will result in indexing errors
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# ─────────────────────────────────────────────────────────
# 6) (Optional) title-level blending (mix each row with the mean of its title)
# ─────────────────────────────────────────────────────────
if TITLE_BLEND > 0.0:
    # Mean probability of all rows sharing a title, broadcast back to each row.
    per_title_avg = test_df.groupby("title")["prob_ai"].transform("mean")
    own_weight = 1.0 - TITLE_BLEND
    # NOTE: overwrites prob_ai in place, so re-running this cell blends again.
    test_df["prob_ai"] = own_weight * test_df["prob_ai"] + TITLE_BLEND * per_title_avg
    print(f"title-level blending applied with weight={TITLE_BLEND}")



In [None]:
# ─────────────────────────────────────────────────────────
# 7) Save submission.csv (id, generated)
# ─────────────────────────────────────────────────────────
# Keep only the id and the score, exposing the score under the name "generated".
column_names = {"prob_ai": "generated"}
submission = test_df[["ID", "prob_ai"]].rename(columns=column_names)

# Write next to the inputs (the file lands in OUTPUT_DIR), without the index column.
submission_path = OUTPUT_DIR + OUTPUT_CSV
submission.to_csv(submission_path, index=False)
print("Saved:", submission.shape, "->", OUTPUT_CSV)

# Show the first rows as the cell's output.
submission.head()

Saved: (1962, 2) -> submission_LoRA_klue_bert-base.csv


Unnamed: 0,ID,generated
0,TEST_0000,0.068374
1,TEST_0001,0.163658
2,TEST_0002,0.030766
3,TEST_0003,0.163549
4,TEST_0004,0.294997
