# Soft Labeling + Ridge 모델 (Colab 전용)
한국어 LLM 판별 대회용

In [None]:
# 📌 1. Install libraries
# transformers supplies the tokenizer/encoder, scikit-learn supplies Ridge,
# and tqdm supplies the progress bars used by the later cells.
!pip install transformers scikit-learn tqdm

In [None]:
# 📌 2. Mount Google Drive
# The CSV paths used later in this notebook live under /content/drive/MyDrive,
# so Drive has to be mounted at /content/drive before any data is read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# 📌 3. 라이브러리 불러오기
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.linear_model import Ridge
from transformers import AutoTokenizer, AutoModel
import torch

In [None]:
# 📌 4. Data paths (edit these to match your own Drive layout)
train_path = "/content/drive/MyDrive/Colab Notebooks/train.csv"
test_path = "/content/drive/MyDrive/Colab Notebooks/test.csv"
sample_path = "/content/drive/MyDrive/Colab Notebooks/sample_submission.csv"

# Load the training and test tables, in that order.
train_df, test_df = map(pd.read_csv, (train_path, test_path))

In [None]:
# 📌 5. Paragraph splitting + soft-label generation
def split_into_paragraphs(text, min_chars=10):
    """Split a document on newlines and keep only the non-trivial paragraphs.

    Args:
        text: Document text. Any non-string value (for example ``None`` or a
            float NaN standing in for a missing CSV value) is treated as a
            document with no paragraphs instead of raising.
        min_chars: A paragraph is kept only when its whitespace-stripped
            length is strictly greater than this value.

    Returns:
        The whitespace-stripped paragraphs, in document order. Empty when
        nothing passes the length filter.
    """
    if not isinstance(text, str):
        return []
    # Strip each line once, then filter on the stripped length.
    stripped_lines = (line.strip() for line in text.split('\n'))
    return [paragraph for paragraph in stripped_lines if len(paragraph) > min_chars]

# Expand every training document into paragraph-level samples.
train_paragraphs = []
soft_labels = []
for _, doc in tqdm(train_df.iterrows(), total=len(train_df)):
    doc_paragraphs = split_into_paragraphs(doc['full_text'])
    doc_label = doc['generated']
    if not doc_paragraphs:
        # No paragraph passed the splitter's filter: the document adds no samples.
        continue
    # Each paragraph receives the document label divided by the paragraph count.
    per_paragraph_label = doc_label / len(doc_paragraphs)
    train_paragraphs.extend(doc_paragraphs)
    soft_labels.extend([per_paragraph_label] * len(doc_paragraphs))

In [None]:
# 📌 6. Load the encoder (klue/roberta-base) + average embeddings
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")
model = AutoModel.from_pretrained("klue/roberta-base")
model.to(device)  # move the weights onto the selected device
model.eval()  # evaluation mode: the model is only used for inference here

def get_avg_embedding(text_list, batch_size=16):
    """Embed each text as the mean of its final-layer token vectors.

    Texts are tokenized and passed through the module-level ``model`` in
    batches of ``batch_size``. The mean is weighted by the tokenizer's
    attention mask, so padding positions (added to bring every sequence in a
    batch to the same length) do not dilute the result. A text's embedding
    therefore no longer depends on which other texts share its batch.

    Args:
        text_list: List of strings to embed. Inputs are truncated to 512
            tokens by the tokenizer call.
        batch_size: Number of texts per forward pass.

    Returns:
        A list with one 1-D NumPy array per input text, in input order.
        Empty when ``text_list`` is empty.
    """
    embeddings = []
    for start in tqdm(range(0, len(text_list), batch_size)):
        batch = text_list[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # 1 for real tokens, 0 for padding; the trailing axis lets it broadcast
        # across the hidden dimension.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_sum = (hidden * mask).sum(dim=1)
        # clamp guards against division by zero if a row had no real tokens.
        token_count = mask.sum(dim=1).clamp(min=1)
        avg_pool = (token_sum / token_count).cpu().numpy()
        embeddings.extend(avg_pool)
    return embeddings

In [None]:
# 📌 7. Training and prediction
# Paragraph embeddings are the features; the per-paragraph soft labels are
# the regression targets.
X = np.asarray(get_avg_embedding(train_paragraphs))
y = np.asarray(soft_labels)

clf = Ridge(alpha=1.0).fit(X, y)

test_texts = test_df['paragraph_text'].tolist()
X_test = np.asarray(get_avg_embedding(test_texts))
# Regression output is unbounded, so clamp the predictions into [0, 1].
probs = clf.predict(X_test).clip(0, 1)

In [None]:
# 📌 8. Save the submission file
# Start from the sample submission and set its 'generated' column to the
# predicted scores.
submission = pd.read_csv(sample_path).assign(generated=probs)
submission.to_csv("/content/submission.csv", index=False)
print("✅ 제출 파일 저장 완료: /content/submission.csv")