KLUE-RoBERTa-base + 데이터 증강 + 교차검증

In [None]:
import pandas as pd
import numpy as np
import torch
import os
import re
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    AdamW
)
from torch.utils.data import Dataset, DataLoader
import random

# Hyperparameters
MAX_LEN = 128  # max_length handed to the tokenizer (inputs are padded/truncated to it)
BATCH_SIZE = 64
LEARNING_RATE = 1e-5
WEIGHT_DECAY = 1e-6
EPOCHS = 5  # training epochs per fold
MODEL_NAME = "klue/roberta-base"  # checkpoint used for both the tokenizer and the classifier
N_FOLDS = 5  # number of StratifiedKFold splits
RANDOM_SEED = 42
AUGMENT_K = 1  # default `k` of augment_data; any value > 0 enables augmentation

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch RNGs (plus every CUDA device, if any)."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
set_seed(RANDOM_SEED)

# Load the data
print("데이터 로드 중...")
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
# Bring the test columns in line with the names used below; rename() ignores
# mapping keys that are not present in the frame.
test_df = test_df.rename(columns={'text': 'conversation', 'file_name': 'idx'})

# 텍스트 전처리
def clean_text(text):
    """Return ``str(text)`` stripped, with each whitespace run collapsed to one space."""
    stripped = str(text).strip()
    return re.sub(r'\s+', ' ', stripped)
train_df['conversation'] = train_df['conversation'].map(clean_text)
test_df['conversation'] = test_df['conversation'].map(clean_text)

# Class normalisation and mapping: position in custom_classes is the integer
# label; class2idx accepts both the canonical name and the raw dataset label.
custom_classes = ['협박', '갈취', '직장내괴롭힘', '기타괴롭힘', '일반']
class2idx = {
    alias: label_id
    for label_id, aliases in enumerate([
        ('협박', '협박 대화'),
        ('갈취', '갈취 대화'),
        ('직장내괴롭힘', '직장 내 괴롭힘 대화'),
        ('기타괴롭힘', '기타 괴롭힘 대화'),
        ('일반', '일반대화'),
    ])
    for alias in aliases
}
def extract_class(x):
    """Reduce a raw class label to its canonical name by keyword lookup.

    Keywords are checked in a fixed order and the first one contained in *x*
    decides the result; a label matching none of them is returned unchanged.
    """
    keyword_to_class = (
        ('협박', '협박'),
        ('갈취', '갈취'),
        ('직장', '직장내괴롭힘'),
        ('기타', '기타괴롭힘'),
        ('일반', '일반'),
    )
    return next((name for keyword, name in keyword_to_class if keyword in x), x)
train_df['class'] = train_df['class'].map(extract_class)
train_df['target'] = train_df['class'].map(class2idx)
# A NaN target means some label was not covered by class2idx
assert not train_df['target'].isna().any(), "클래스 매핑 오류 발생!"

idx2class = dict(enumerate(custom_classes))

# Tokenizer
print(f"토크나이저 로드 중: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# 데이터 증강 함수
def augment_data(texts, labels, k=AUGMENT_K):
    """Return ``(texts, labels)`` extended with simple rule-based variants.

    Every original sample is kept. When ``k > 0`` each sample may add:
      * its '. '-separated sentences in reversed order (2+ sentences);
      * a copy with one randomly chosen sentence removed (3+ sentences);
      * a copy with the first occurrence of one Korean particle character
        swapped, tried ``min(k - 2, 1)`` times (texts of more than 5 words).

    NOTE(review): ``min(k - 2, 1)`` is <= 0 for k < 3, so with the default k the
    particle swap never runs -- confirm whether that is intended.
    """
    # (character to find, replacement); only the first pair that matches is used
    particle_swaps = (('은', '는'), ('는', '은'), ('이', '가'), ('가', '이'))
    out_texts, out_labels = [], []

    def emit(sample, target):
        out_texts.append(sample)
        out_labels.append(target)

    for text, label in zip(texts, labels):
        emit(text, label)
        if not k > 0:
            continue

        parts = text.split('. ')
        n_parts = len(parts)
        if n_parts > 1:
            emit('. '.join(reversed(parts)), label)
        if n_parts > 2:
            drop = random.randint(0, n_parts - 1)
            emit('. '.join(parts[:drop] + parts[drop + 1:]), label)

        if len(text.split()) <= 5:
            continue
        for _ in range(min(k - 2, 1)):
            swapped = text
            for old, new in particle_swaps:
                if old in text:
                    swapped = text.replace(old, new, 1)
                    break
            if swapped != text:
                emit(swapped, label)
    return out_texts, out_labels

# 일반대화 합성 데이터 예시
def create_normal_conversations(n=1000):
    """Return the built-in "normal" seed conversations repeated ``n // len(seeds)`` times."""
    print("일반 대화 데이터 생성 중...")
    seed_conversations = [
        "이거 들어봐 와 이 노래 진짜 좋다 그치 요즘 이 것만 들어 진짜 너무 좋다 내가 요즘 듣는 것도 들어봐 옴 난 좀 별로네 아님 넌 취향은 아닌 듯 배고프다 밥이나 먹으러 가자 그래"
    ]
    repeats = n // len(seed_conversations)
    return [conv for _ in range(repeats) for conv in seed_conversations]

normal_convs = create_normal_conversations()
# Scalars are broadcast to the length of the conversation list
normal_df = pd.DataFrame({
    'class': '일반',
    'conversation': normal_convs,
    'target': 4,
})
# NOTE(review): the synthetic rows are copies of one conversation and are added
# before the fold split, so identical texts can land in both the training and
# validation part of a fold -- verify the validation F1 is not inflated by this.
train_df = pd.concat([train_df, normal_df], ignore_index=True)

# PyTorch Dataset
class ConversationDataset(Dataset):
    """Tokenises one conversation per index, padded/truncated to ``max_len``.

    ``labels`` may be None (inference); items then carry no ``'labels'`` key.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        # Drop the leading dimension of each returned tensor
        sample = {}
        for name, tensor in encoded.items():
            sample[name] = tensor.squeeze(0)
        if self.labels is None:
            return sample
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Cross-validation
print(f"{N_FOLDS}개 폴드 교차 검증 시작...")
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_SEED)
test_predictions_folds = []
fold_metrics = []


def _predict_labels(model, loader):
    """Return the argmax class id of every example in *loader* (eval mode, no grad)."""
    model.eval()
    predicted = []
    with torch.no_grad():
        for batch in loader:
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits
            predicted.extend(torch.argmax(logits, dim=1).cpu().numpy())
    return predicted


# The test set is the same for every fold, so its loader is built once.
test_texts = test_df['conversation'].tolist()
test_dataset = ConversationDataset(test_texts, labels=None, tokenizer=tokenizer, max_len=MAX_LEN)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

for fold, (train_idx, val_idx) in enumerate(skf.split(train_df['conversation'], train_df['target'])):
    print(f"\n===== 폴드 {fold+1}/{N_FOLDS} 학습 시작 =====")
    X_train = train_df['conversation'].iloc[train_idx].tolist()
    y_train = train_df['target'].iloc[train_idx].values
    X_val = train_df['conversation'].iloc[val_idx].tolist()
    y_val = train_df['target'].iloc[val_idx].values

    # Augment the training split only; the validation split stays untouched
    print("데이터 증강 적용 중...")
    augmented_texts, augmented_labels = augment_data(X_train, y_train)
    print(f"증강 전: {len(X_train)}개, 증강 후: {len(augmented_texts)}개")
    train_dataset = ConversationDataset(augmented_texts, augmented_labels, tokenizer, MAX_LEN)
    val_dataset = ConversationDataset(X_val, y_val, tokenizer, MAX_LEN)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # Fresh pretrained model for every fold
    print(f"모델 로드 중: {MODEL_NAME}")
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(custom_classes)
    ).to(device)

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

    # Training: the model computes the loss itself because `labels` is passed
    print("모델 학습 중...")
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=batch['labels'].to(device),
            )
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss/len(train_loader):.4f}")

    # Validation-set predictions
    print("검증 세트 평가 중...")
    val_preds = _predict_labels(model, val_loader)
    val_f1 = f1_score(y_val, val_preds, average='macro')
    print(f"검증 F1 점수: {val_f1:.4f}")

    # Test-set predictions for this fold
    print("테스트 세트 예측 중...")
    test_preds = _predict_labels(model, test_loader)
    test_predictions_folds.append(test_preds)
    fold_metrics.append({'fold': fold + 1, 'val_f1': val_f1})

    # The optimizer also references the parameters, so it has to be dropped
    # together with the model before the cached GPU memory can be released.
    del model, optimizer
    torch.cuda.empty_cache()

# Ensemble (hard voting): every test row gets the class predicted by most folds.
# argmax returns the first maximum, i.e. the smallest class id on a tie.
test_pred_array = np.array(test_predictions_folds)
test_predictions = np.array(
    [np.bincount(fold_votes).argmax() for fold_votes in test_pred_array.T],
    dtype=int,
)

# Distribution of the predicted classes
pred_dist = {
    idx2class[class_id]: (test_predictions == class_id).sum()
    for class_id, _ in enumerate(custom_classes)
}
print("\n예측 클래스 분포:")
for cls, count in pred_dist.items():
    print(f"{cls}: {count} ({count/len(test_predictions)*100:.2f}%)")

# Save the submission file
submission = pd.DataFrame({'idx': test_df['idx'], 'target': test_predictions})
submission.to_csv('submission.csv', index=False)
print("\n제출 파일 생성 완료: submission.csv")

fold_scores = [metrics['val_f1'] for metrics in fold_metrics]
mean_f1 = np.mean(fold_scores)
print(f"\n교차검증 평균 F1 점수: {mean_f1:.4f}")


In [None]:
BERT-GPT 하이브리드 파이프라인

v.1

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel, GPT2Model
from sklearn.metrics import f1_score
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# 1. 데이터셋 클래스
class TextClassificationDataset(Dataset):
    """Pairs each text with its label and tokenises on access.

    Items are dicts with 1-D ``input_ids`` / ``attention_mask`` tensors and a
    scalar long ``label`` tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        encoded = self.tokenizer(
            raw_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # Flatten the returned tensors to 1-D
        sample = {name: encoded[name].flatten() for name in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

# 2. 하이브리드 모델
class BertGptClassifier(nn.Module):
    """Classifier over concatenated BERT first-token and GPT-2 last-token features.

    Args:
        num_classes: number of output classes.
        bert_model: checkpoint name passed to ``BertModel.from_pretrained``.
        gpt_model: checkpoint name passed to ``GPT2Model.from_pretrained``.

    NOTE(review): ``forward`` feeds the same ``input_ids`` to both encoders.
    Ids produced by a BERT tokenizer do not index the GPT-2 vocabulary
    meaningfully -- confirm this is intended or tokenise per encoder.
    """

    def __init__(self, num_classes, bert_model='bert-base-uncased', gpt_model='gpt2'):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model)
        self.gpt = GPT2Model.from_pretrained(gpt_model)

        # Fusion layer; its input width follows the loaded checkpoints instead of
        # a hard-coded 768 + 768 so differently sized checkpoints also work.
        fused_width = self.bert.config.hidden_size + self.gpt.config.hidden_size
        self.fusion = nn.Sequential(
            nn.Linear(fused_width, 512),
            nn.ReLU(),
            nn.Dropout(0.3)
        )

        # Classification head
        self.classifier = nn.Linear(512, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class logits of shape ``(batch, num_classes)``."""
        # BERT feature: hidden state of the first ([CLS]) position
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_features = bert_output.last_hidden_state[:, 0, :]

        # GPT feature: hidden state of the last *attended* token. Position -1 is
        # padding whenever a sequence is shorter than the padded length.
        gpt_hidden = self.gpt(
            input_ids=input_ids,
            attention_mask=attention_mask
        ).last_hidden_state
        positions = torch.arange(1, attention_mask.size(1) + 1, device=attention_mask.device)
        # max(mask * (pos + 1)) - 1 == index of the last 1 in the mask; an all-zero
        # mask yields -1, i.e. the final position (the previous behaviour).
        last_idx = (attention_mask.long() * positions).max(dim=1).values - 1
        batch_idx = torch.arange(gpt_hidden.size(0), device=gpt_hidden.device)
        gpt_features = gpt_hidden[batch_idx, last_idx.to(gpt_hidden.device)]

        # Fuse and classify
        combined = torch.cat([bert_features, gpt_features], dim=1)
        fused = self.fusion(combined)
        return self.classifier(fused)

# 3. 학습 함수 (F1 Score 계산 포함)
def train_model(model, dataloader, optimizer, device, scheduler=None):
    """Train for one pass over *dataloader*.

    Returns:
        (mean batch loss, weighted F1 of the predictions made during the pass).
    """
    model.train()
    loss_fn = nn.CrossEntropyLoss()
    running_loss = 0
    predicted, expected = [], []

    for batch in dataloader:
        optimizer.zero_grad()

        ids, mask, targets = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
        )
        logits = model(ids, mask)
        loss = loss_fn(logits, targets)

        loss.backward()
        optimizer.step()
        if scheduler:
            scheduler.step()

        running_loss += loss.item()

        # Keep predictions and targets for the epoch-level F1
        predicted += torch.max(logits, dim=1).indices.cpu().tolist()
        expected += targets.cpu().tolist()

    f1 = f1_score(expected, predicted, average='weighted')
    return running_loss / len(dataloader), f1

# 4. 평가 함수
def eval_model(model, dataloader, device):
    """Evaluate *model* on *dataloader* without gradient tracking.

    Returns:
        (mean batch loss, weighted F1).
    """
    model.eval()
    loss_fn = nn.CrossEntropyLoss()
    running_loss = 0
    predicted, expected = [], []

    with torch.no_grad():
        for batch in dataloader:
            ids, mask, targets = (
                batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
            )
            logits = model(ids, mask)
            running_loss += loss_fn(logits, targets).item()

            predicted += torch.max(logits, dim=1).indices.cpu().tolist()
            expected += targets.cpu().tolist()

    f1 = f1_score(expected, predicted, average='weighted')
    return running_loss / len(dataloader), f1

# 5. 메인 실행 코드
# 5. Main script
if __name__ == "__main__":
    # Hyperparameters
    BATCH_SIZE = 16
    EPOCHS = 5
    LEARNING_RATE = 2e-5
    MAX_LEN = 128

    # Example data -- replace with the real texts and labels. The placeholder
    # lists used to end in a literal `...` (Ellipsis), which torch.tensor()
    # cannot turn into a label, so the script crashed on the first batch.
    train_texts = ["text1", "text2"]
    train_labels = [0, 1]
    val_texts = ["text3", "text4"]
    val_labels = [1, 0]

    # Tokenizer. NOTE(review): these BERT token ids are also passed to the
    # GPT-2 branch of the model -- confirm that sharing them is intended.
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Datasets
    train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, MAX_LEN)
    val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, MAX_LEN)

    # Data loaders
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # Device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Model (adjust num_classes to the task)
    model = BertGptClassifier(num_classes=5)
    model = model.to(device)

    # Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

    # Training loop. Start below any reachable F1 so that the first epoch is
    # always checkpointed, even when its validation F1 is exactly 0.
    best_f1 = -1.0
    for epoch in range(EPOCHS):
        train_loss, train_f1 = train_model(model, train_loader, optimizer, device)
        val_loss, val_f1 = eval_model(model, val_loader, device)

        print(f'Epoch {epoch+1}/{EPOCHS}')
        print(f'Train Loss: {train_loss:.4f} | Train F1: {train_f1:.4f}')
        print(f'Val Loss: {val_loss:.4f} | Val F1: {val_f1:.4f}')

        # Keep the weights of the best epoch so far
        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), 'best_model.pth')

    print(f'Best Validation F1: {best_f1:.4f}')


v.2

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel, GPT2Model
from sklearn.metrics import f1_score
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# 1. 데이터셋 클래스
class TextClassificationDataset(Dataset):
    """Pairs each text with its label and tokenises on access.

    Items are dicts with 1-D ``input_ids`` / ``attention_mask`` tensors and a
    scalar long ``label`` tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        encoded = self.tokenizer(
            raw_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # Flatten the returned tensors to 1-D
        sample = {name: encoded[name].flatten() for name in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

# 2. 하이브리드 모델
class BertGptClassifier(nn.Module):
    """Classifier over concatenated BERT first-token and GPT-2 last-token features.

    Args:
        num_classes: number of output classes.
        bert_model: checkpoint name passed to ``BertModel.from_pretrained``.
        gpt_model: checkpoint name passed to ``GPT2Model.from_pretrained``.

    NOTE(review): ``forward`` feeds the same ``input_ids`` to both encoders.
    Ids produced by a BERT tokenizer do not index the GPT-2 vocabulary
    meaningfully -- confirm this is intended or tokenise per encoder.
    """

    def __init__(self, num_classes, bert_model='bert-base-uncased', gpt_model='gpt2'):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model)
        self.gpt = GPT2Model.from_pretrained(gpt_model)

        # Fusion layer; its input width follows the loaded checkpoints instead of
        # a hard-coded 768 + 768 so differently sized checkpoints also work.
        fused_width = self.bert.config.hidden_size + self.gpt.config.hidden_size
        self.fusion = nn.Sequential(
            nn.Linear(fused_width, 512),
            nn.ReLU(),
            nn.Dropout(0.3)
        )

        # Classification head
        self.classifier = nn.Linear(512, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class logits of shape ``(batch, num_classes)``."""
        # BERT feature: hidden state of the first ([CLS]) position
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_features = bert_output.last_hidden_state[:, 0, :]

        # GPT feature: hidden state of the last *attended* token. Position -1 is
        # padding whenever a sequence is shorter than the padded length.
        gpt_hidden = self.gpt(
            input_ids=input_ids,
            attention_mask=attention_mask
        ).last_hidden_state
        positions = torch.arange(1, attention_mask.size(1) + 1, device=attention_mask.device)
        # max(mask * (pos + 1)) - 1 == index of the last 1 in the mask; an all-zero
        # mask yields -1, i.e. the final position (the previous behaviour).
        last_idx = (attention_mask.long() * positions).max(dim=1).values - 1
        batch_idx = torch.arange(gpt_hidden.size(0), device=gpt_hidden.device)
        gpt_features = gpt_hidden[batch_idx, last_idx.to(gpt_hidden.device)]

        # Fuse and classify
        combined = torch.cat([bert_features, gpt_features], dim=1)
        fused = self.fusion(combined)
        return self.classifier(fused)

# 3. 학습 함수 (F1 Score 계산 포함)
def train_model(model, dataloader, optimizer, device, scheduler=None):
    """Train for one pass over *dataloader*.

    Returns:
        (mean batch loss, weighted F1 of the predictions made during the pass).
    """
    model.train()
    loss_fn = nn.CrossEntropyLoss()
    running_loss = 0
    predicted, expected = [], []

    for batch in dataloader:
        optimizer.zero_grad()

        ids, mask, targets = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
        )
        logits = model(ids, mask)
        loss = loss_fn(logits, targets)

        loss.backward()
        optimizer.step()
        if scheduler:
            scheduler.step()

        running_loss += loss.item()

        # Keep predictions and targets for the epoch-level F1
        predicted += torch.max(logits, dim=1).indices.cpu().tolist()
        expected += targets.cpu().tolist()

    f1 = f1_score(expected, predicted, average='weighted')
    return running_loss / len(dataloader), f1

# 4. 평가 함수
def eval_model(model, dataloader, device):
    """Evaluate *model* on *dataloader* without gradient tracking.

    Returns:
        (mean batch loss, weighted F1).
    """
    model.eval()
    loss_fn = nn.CrossEntropyLoss()
    running_loss = 0
    predicted, expected = [], []

    with torch.no_grad():
        for batch in dataloader:
            ids, mask, targets = (
                batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
            )
            logits = model(ids, mask)
            running_loss += loss_fn(logits, targets).item()

            predicted += torch.max(logits, dim=1).indices.cpu().tolist()
            expected += targets.cpu().tolist()

    f1 = f1_score(expected, predicted, average='weighted')
    return running_loss / len(dataloader), f1

# 5. 메인 실행 코드
if __name__ == "__main__":
    # Hyperparameters
    BATCH_SIZE = 16
    EPOCHS = 5
    LEARNING_RATE = 2e-5
    MAX_LEN = 128

    # Example data -- replace with the real texts and labels. The placeholder
    # lists used to end in a literal `...` (Ellipsis), which torch.tensor()
    # cannot turn into a label, so the script crashed on the first batch.
    train_texts = ["text1", "text2"]
    train_labels = [0, 1]
    val_texts = ["text3", "text4"]
    val_labels = [1, 0]

    # Tokenizer. NOTE(review): these BERT token ids are also passed to the
    # GPT-2 branch of the model -- confirm that sharing them is intended.
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Datasets
    train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, MAX_LEN)
    val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, MAX_LEN)

    # Data loaders
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # Device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Model (adjust num_classes to the task)
    model = BertGptClassifier(num_classes=5)
    model = model.to(device)

    # Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

    # Training loop. Start below any reachable F1 so that the first epoch is
    # always checkpointed, even when its validation F1 is exactly 0.
    best_f1 = -1.0
    for epoch in range(EPOCHS):
        train_loss, train_f1 = train_model(model, train_loader, optimizer, device)
        val_loss, val_f1 = eval_model(model, val_loader, device)

        print(f'Epoch {epoch+1}/{EPOCHS}')
        print(f'Train Loss: {train_loss:.4f} | Train F1: {train_f1:.4f}')
        print(f'Val Loss: {val_loss:.4f} | Val F1: {val_f1:.4f}')

        # Keep the weights of the best epoch so far
        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), 'best_model.pth')

    print(f'Best Validation F1: {best_f1:.4f}')


colab 경량화 버전(T4)

In [None]:
# 1. 필수 라이브러리 설치 및 환경설정
!pip install -U transformers==4.41.0 sentence-transformers
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertConfig, BertModel
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import os

# 2. Upload the data files in Colab (train.csv, test.csv)
# `uploaded` is not referenced again in this cell; the CSVs are read by name below.
from google.colab import files
uploaded = files.upload()

# 3. 데이터 로드 및 전처리
def clean_text(text):
    """Normalise one conversation string.

    Drops every character that is not a word character, whitespace or one of
    ``. , ! ?``, collapses whitespace runs to single spaces, trims the ends and
    keeps at most the first 512 characters.

    Trimming happens *after* symbol removal: stripping first left a stray
    leading/trailing space whenever the text started or ended with a removed
    symbol (e.g. ``"hi @#$"`` became ``"hi "``).
    """
    # `\w` is Unicode-aware for str patterns, so Hangul is kept by it as well
    text = re.sub(r'[^\w\s.,!?가-힣]', '', str(text))
    text = re.sub(r'\s+', ' ', text).strip()
    # The cut may land right after a space, so trim the right edge again
    return text[:512].rstrip()

train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
# Align the test columns with the names used below; rename() skips absent keys
test_df = test_df.rename(columns={'text': 'conversation', 'file_name': 'idx'})

train_df['conversation'] = train_df['conversation'].map(clean_text)
test_df['conversation'] = test_df['conversation'].map(clean_text)

# Position in custom_classes is the integer label; class2idx accepts both the
# canonical class name and the raw dataset label.
custom_classes = ['협박', '갈취', '직장내괴롭힘', '기타괴롭힘', '일반']
class2idx = {
    alias: label_id
    for label_id, aliases in enumerate([
        ('협박', '협박 대화'),
        ('갈취', '갈취 대화'),
        ('직장내괴롭힘', '직장 내 괴롭힘 대화'),
        ('기타괴롭힘', '기타 괴롭힘 대화'),
        ('일반', '일반대화'),
    ])
    for alias in aliases
}
def extract_class(x):
    """Return the canonical class for a raw label, or *x* itself if no keyword matches.

    The keywords are tried in order, so the earliest listed one wins when a
    label contains several.
    """
    rules = [
        ('협박', '협박'),
        ('갈취', '갈취'),
        ('직장', '직장내괴롭힘'),
        ('기타', '기타괴롭힘'),
        ('일반', '일반'),
    ]
    for keyword, canonical in rules:
        if keyword in x:
            return canonical
    return x
train_df['class'] = train_df['class'].apply(extract_class)
train_df['target'] = train_df['class'].map(class2idx)
# A NaN target means some label was not covered by class2idx
assert train_df['target'].isna().sum() == 0, "클래스 매핑 오류 발생!"

idx2class = {i: c for i, c in enumerate(custom_classes)}

# Plot the class distribution of the training data
plt.figure(figsize=(8,4))
train_df['class'].value_counts().plot(kind='bar')
plt.title('Train Class Distribution')
plt.show()

# 4. Tokenizer. The conversations are Korean, so use the Korean KLUE vocabulary
# rather than the English-only 'bert-base-uncased' one. The model below is built
# from tokenizer.vocab_size, so it follows this choice automatically.
MODEL_NAME = 'klue/bert-base'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# 5. 데이터셋 정의
class HybridDataset(Dataset):
    """Pre-tokenised dataset yielding MLM-masked inputs plus MLM / CLM / class labels.

    Args:
        texts: the input strings; a pandas Series, NumPy array or plain list.
        labels: matching integer class ids in any of those containers, or None
            for unlabelled data (items then carry no ``'label'`` key).
        tokenizer: tokenizer used both for encoding and for the [MASK] id.
        max_len: length every sequence is padded / truncated to.
        mlm_prob: probability of replacing a non-special token with [MASK];
            pass 0.0 to leave the inputs untouched (e.g. for evaluation).
    """

    def __init__(self, texts, labels, tokenizer, max_len=128, mlm_prob=0.15):
        self.encodings = tokenizer(
            self._as_list(texts), padding='max_length', truncation=True, max_length=max_len
        )
        self.labels = None if labels is None else self._as_list(labels)
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.mlm_prob = mlm_prob

    @staticmethod
    def _as_list(values):
        """Convert a Series / array / iterable to a plain list."""
        # Plain lists have no .tolist(); calling it unconditionally used to crash
        return values.tolist() if hasattr(values, 'tolist') else list(values)

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.encodings['input_ids'][idx])
        attention_mask = torch.tensor(self.encodings['attention_mask'][idx])

        # MLM masking: special tokens are never selected
        mlm_labels = input_ids.clone()
        probability_matrix = torch.full(input_ids.shape, self.mlm_prob)
        special_tokens_mask = self.tokenizer.get_special_tokens_mask(
            input_ids.tolist(), already_has_special_tokens=True
        )
        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
        masked_indices = torch.bernoulli(probability_matrix).bool()
        input_ids_masked = input_ids.clone()
        input_ids_masked[masked_indices] = self.tokenizer.mask_token_id
        mlm_labels[~masked_indices] = -100  # only masked positions are scored

        # CLM labels (the loss applies the shift); the first token has no
        # predecessor and padding positions must not contribute to the loss.
        clm_labels = input_ids.clone()
        clm_labels[0] = -100
        clm_labels[attention_mask == 0] = -100

        item = {
            'input_ids': input_ids_masked,
            'attention_mask': attention_mask,
            'mlm_labels': mlm_labels,
            'clm_labels': clm_labels
        }
        if self.labels is not None:
            item['label'] = torch.tensor(self.labels[idx])
        return item

# 6. Train / validation split (stratified on the target)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df['conversation'], train_df['target'], test_size=0.2, stratify=train_df['target'], random_state=42
)
train_dataset = HybridDataset(train_texts, train_labels, tokenizer)
# Validation inputs must not be randomly [MASK]ed: masking corrupts the text
# being classified and makes the reported F1 change from run to run.
val_dataset = HybridDataset(val_texts, val_labels, tokenizer, mlm_prob=0.0)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# 7. 경량화 하이브리드 트랜스포머 모델 정의
class HybridTransformerForMNTP(nn.Module):
    """Small randomly initialised BERT-style decoder with an LM head and a class head.

    Args:
        vocab_size: size of the token vocabulary (embedding and LM-head width).
        hidden_size: transformer hidden width.
        num_layers: number of transformer layers.
        num_heads: attention heads per layer.
        num_classes: number of classification targets.
    """

    def __init__(self, vocab_size, hidden_size=512, num_layers=6, num_heads=8, num_classes=5):
        super().__init__()
        config = BertConfig(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            num_attention_heads=num_heads,
            num_hidden_layers=num_layers,
            is_decoder=True,
            add_cross_attention=False
        )
        self.transformer = BertModel(config)
        self.lm_head = nn.Linear(hidden_size, vocab_size)
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return ``(token_logits, class_logits)``."""
        hidden = self.transformer(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        logits = self.lm_head(hidden)

        # Classification feature: the last attended token. The model is built
        # with is_decoder=True (causal attention), so position 0 can only see
        # itself and its hidden state carries no information about the rest of
        # the sequence; the last real token is the one that has seen all of it.
        positions = torch.arange(1, attention_mask.size(1) + 1, device=attention_mask.device)
        # max(mask * (pos + 1)) - 1 == index of the last 1 in the mask
        # (-1, i.e. the final position, when the mask is all zeros)
        last_idx = (attention_mask.long() * positions).max(dim=1).values - 1
        batch_idx = torch.arange(hidden.size(0), device=hidden.device)
        cls_logits = self.classifier(hidden[batch_idx, last_idx.to(hidden.device)])
        return logits, cls_logits

# 8. 손실 함수
class HybridLoss(nn.Module):
    """Weighted sum of three cross-entropies: 0.5 * MLM + 0.2 * CLM + 0.3 * classification."""

    def __init__(self):
        super().__init__()
        # The two token-level losses skip positions labelled -100
        self.mlm_loss = nn.CrossEntropyLoss(ignore_index=-100)
        self.clm_loss = nn.CrossEntropyLoss(ignore_index=-100)
        self.ce_loss = nn.CrossEntropyLoss()

    def forward(self, logits, mlm_labels, clm_labels, cls_logits, labels):
        vocab = logits.size(-1)
        masked_term = self.mlm_loss(logits.reshape(-1, vocab), mlm_labels.reshape(-1))
        # Next-token objective: logits at position t are scored against label t + 1
        next_token_term = self.clm_loss(
            logits[:, :-1, :].reshape(-1, vocab),
            clm_labels[:, 1:].reshape(-1),
        )
        class_term = self.ce_loss(cls_logits, labels)
        return 0.5 * masked_term + 0.2 * next_token_term + 0.3 * class_term

# 9. Training setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The model's vocabulary size is taken from the tokenizer
model = HybridTransformerForMNTP(vocab_size=tokenizer.vocab_size).to(device)
criterion = HybridLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
# Gradient scaler used together with the float16 autocast in the training loop
scaler = torch.cuda.amp.GradScaler()

# 10. 평가 및 분석 함수
def evaluate_and_analyze(model, loader, device, tokenizer, epoch):
    """Evaluate the classification head on *loader* and print / plot diagnostics.

    Prints the macro F1 and a per-class report, plots how often each class was
    predicted, and lists up to five misclassified samples.

    Args:
        model: returns ``(token_logits, class_logits)`` for ``(input_ids, attention_mask)``.
        loader: yields batches with ``input_ids``, ``attention_mask`` and ``label``.
        device: device the batches are moved to.
        tokenizer: used to decode the inputs of misclassified samples.
        epoch: zero-based epoch index, only used in the printed header.
    """
    model.eval()
    all_preds = []
    all_labels = []
    misclassified_samples = []
    class_ids = list(range(len(custom_classes)))
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            with torch.autocast(device_type='cuda', dtype=torch.float16):
                _, cls_logits = model(input_ids, attention_mask)
            pred_list = torch.argmax(cls_logits, dim=1).cpu().tolist()
            label_list = labels.cpu().tolist()
            all_preds.extend(pred_list)
            all_labels.extend(label_list)

            # Collect the first few misclassified samples. The text is decoded
            # from the ids the model actually received, so tokens the dataset
            # replaced with [MASK] (if any) are missing from it.
            for row, (pred, true) in enumerate(zip(pred_list, label_list)):
                if pred == true or len(misclassified_samples) >= 5:
                    continue
                token_ids = [x for x in input_ids[row].cpu().tolist() if x != tokenizer.pad_token_id]
                misclassified_samples.append({
                    'text': tokenizer.decode(token_ids, skip_special_tokens=True),
                    'pred': idx2class[pred],
                    'true': idx2class[true]
                })
    f1 = f1_score(all_labels, all_preds, average='macro')
    print(f"\nEpoch {epoch+1} F1: {f1:.4f}")
    # labels= keeps the report aligned with target_names even when a class is
    # missing from both the labels and the predictions (otherwise it raises).
    print(classification_report(
        all_labels, all_preds, labels=class_ids, target_names=custom_classes, zero_division=0
    ))
    plt.figure(figsize=(8,4))
    # Include never-predicted classes as zero-height bars so that bar k really
    # is class k and the tick labels below line up.
    pred_counts = pd.Series(all_preds).value_counts().reindex(class_ids, fill_value=0)
    pred_counts.plot(kind='bar')
    plt.title("Predicted Class Distribution")
    plt.xticks(ticks=range(len(custom_classes)), labels=custom_classes, rotation=45)
    plt.show()
    print("\nTop 5 Misclassified Samples:")
    for sample in misclassified_samples:
        print(f"Text: {sample['text']}")
        print(f"Predicted: {sample['pred']} | True: {sample['true']}\n")

# 11. Training loop (mixed precision), with an evaluation pass after every epoch
EPOCHS = 3
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        mlm_labels, clm_labels, labels = (
            batch[key].to(device) for key in ('mlm_labels', 'clm_labels', 'label')
        )
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            logits, cls_logits = model(input_ids, attention_mask)
            loss = criterion(logits, mlm_labels, clm_labels, cls_logits, labels)
        # Scale the loss before backward, then step through the scaler
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
    mean_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} | Train Loss: {mean_loss:.4f}")
    evaluate_and_analyze(model, val_loader, device, tokenizer, epoch)

print("학습 완료!")

# 12. Predict the test set and write the submission CSV
# There are no test labels, so pass None: HybridDataset calls .tolist() on
# non-None labels, which a plain Python list of dummy zeros does not have.
# mlm_prob=0.0 keeps the inputs unmasked so predictions are deterministic.
test_dataset = HybridDataset(test_df['conversation'], None, tokenizer, mlm_prob=0.0)
test_loader = DataLoader(test_dataset, batch_size=32)
model.eval()
test_preds = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            _, cls_logits = model(input_ids, attention_mask)
        preds = torch.argmax(cls_logits, dim=1)
        test_preds.extend(preds.cpu().tolist())

submission = pd.DataFrame({
    'idx': test_df['idx'],
    'target': test_preds
})
submission.to_csv('submission.csv', index=False)
print("제출 파일 생성 완료: submission.csv")
print(submission.head())
