In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m41.9 MB/s[0m eta [36m0:00:0

### 사전 학습된 BERT 불러오기

In [2]:
import pandas as pd
import torch
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm

In [3]:
# 데이터 불러오기
df = pd.read_csv('joongang.csv')
df['text'] = df['title'] + ' ' + df['content']
# NaN 값을 포함하는 행 삭제
df.dropna(subset=['text'], inplace=True)

In [4]:
# 원본 데이터를 학습 데이터와 테스트 데이터로 나눔
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [5]:
# KoBERT 토크나이저 로드
tokenizer = BertTokenizer.from_pretrained('monologg/kobert')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/77.8k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/426 [00:00<?, ?B/s]

In [6]:
def encode_data(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []

    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # 직접 패딩 수행
        padding_length = max_len - len(encoded_dict['input_ids'][0])
        all_tokens.append(torch.cat([encoded_dict['input_ids'], torch.zeros((1, padding_length), dtype=torch.long)], dim=1))
        all_masks.append(torch.cat([encoded_dict['attention_mask'], torch.zeros((1, padding_length), dtype=torch.long)], dim=1))

    all_tokens = torch.cat(all_tokens, dim=0)
    all_masks = torch.cat(all_masks, dim=0)

    return all_tokens, all_masks


In [7]:
# 학습 데이터 인코딩
train_input_ids, train_attention_masks = encode_data(train_df['text'], tokenizer)
train_labels1 = torch.tensor(train_df['label1'].values) - 1
train_labels2 = torch.tensor(train_df['label2'].values) - 1

# 테스트 데이터 인코딩 및 성능 평가
test_input_ids, test_attention_masks = encode_data(test_df['text'], tokenizer)
test_labels1 = torch.tensor(test_df['label1'].values) - 1
test_labels2 = torch.tensor(test_df['label2'].values) - 1

In [8]:
# 데이터셋 및 데이터 로더 생성
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels1, train_labels2)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)

test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels1, test_labels2)
test_dataloader = DataLoader(test_dataset, batch_size=2, shuffle=False)

In [9]:
# 모델 정의
class KoBERTForMultiLabelSequenceClassification(torch.nn.Module):
    def __init__(self, num_labels1, num_labels2):
        super(KoBERTForMultiLabelSequenceClassification, self).__init__()
        self.config = BertConfig.from_pretrained('monologg/kobert', num_labels=num_labels1, output_hidden_states=True)
        self.bert = BertForSequenceClassification.from_pretrained('monologg/kobert', config=self.config)
        self.classifier2 = torch.nn.Linear(768, num_labels2)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        logits1 = outputs.logits
        pooled_output = outputs.hidden_states[-1][:, 0]  # CLS 토큰의 히든 상태
        logits2 = self.classifier2(pooled_output)
        return logits1, logits2




# GPU 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_labels1 = df['label1'].nunique()
num_labels2 = df['label2'].nunique()
model = KoBERTForMultiLabelSequenceClassification(num_labels1, num_labels2).to(device)
optimizer = Adam(model.parameters(), lr=1e-5)
loss_fn1 = CrossEntropyLoss()
loss_fn2 = CrossEntropyLoss()

Downloading model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# 학습 데이터를 학습 및 검증 데이터로 나눔
train_inputs, val_inputs, train_masks, val_masks, train_labels1, val_labels1, train_labels2, val_labels2 = train_test_split(
    train_input_ids, train_attention_masks, train_labels1, train_labels2, random_state=42, test_size=0.1
)

train_dataset = TensorDataset(train_inputs, train_masks, train_labels1, train_labels2)
val_dataset = TensorDataset(val_inputs, val_masks, val_labels1, val_labels2)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False)

In [11]:
from tqdm import tqdm
from sklearn.metrics import f1_score

EPOCHS = 10

train_losses = []
val_losses = []
val_accuracies1 = []
val_accuracies2 = []
val_f1_scores1 = []
val_f1_scores2 = []
best_model_path = "best_model.pth"

best_score1 = 0
best_score2 = 0

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    total_correct1 = 0
    total_correct2 = 0
    total_samples = 0

    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}/{EPOCHS}"):
        optimizer.zero_grad()
        input_ids, attention_mask, label1, label2 = [b.to(device) for b in batch]
        logits1, logits2 = model(input_ids, attention_mask)

        loss1 = loss_fn1(logits1, label1)
        loss2 = loss_fn2(logits2, label2)
        loss = loss1 + loss2
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_correct1 += (logits1.argmax(dim=1) == label1).sum().item()
        total_correct2 += (logits2.argmax(dim=1) == label2).sum().item()
        total_samples += label1.size(0)

    avg_train_loss = total_loss / total_samples
    train_losses.append(avg_train_loss)

    # Evaluate on valid set
    model.eval()
    val_loss = 0
    val_correct1 = 0
    val_correct2 = 0
    val_samples = 0
    val_preds1 = []
    val_preds2 = []

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, label1, label2 = [b.to(device) for b in batch]
            logits1, logits2 = model(input_ids, attention_mask)

            loss1 = loss_fn1(logits1, label1)
            loss2 = loss_fn2(logits2, label2)
            val_loss += (loss1 + loss2).item()

            val_correct1 += (logits1.argmax(dim=1) == label1).sum().item()
            val_correct2 += (logits2.argmax(dim=1) == label2).sum().item()
            val_samples += label1.size(0)

            val_preds1.extend(logits1.argmax(dim=1).tolist())
            val_preds2.extend(logits2.argmax(dim=1).tolist())

    avg_val_loss = val_loss / val_samples
    val_losses.append(avg_val_loss)
    val_accuracy1 = val_correct1 / val_samples
    val_accuracy2 = val_correct2 / val_samples
    val_accuracies1.append(val_accuracy1)
    val_accuracies2.append(val_accuracy2)
    f1_1 = f1_score(val_labels1.cpu(), val_preds1, average='weighted')
    f1_2 = f1_score(val_labels2.cpu(), val_preds2, average='weighted')
    val_f1_scores1.append(f1_1)
    val_f1_scores2.append(f1_2)

    # Print epoch results
    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    print(f"Train Loss: {avg_train_loss:.4f}")
    print(f"Validation Loss: {avg_val_loss:.4f}")
    print(f"Validation Accuracy1: {val_accuracy1:.4f}")
    print(f"Validation Accuracy2: {val_accuracy2:.4f}")
    print(f"Validation F1 Score for Label1: {f1_1:.4f}")
    print(f"Validation F1 Score for Label2: {f1_2:.4f}\n")

    # 최고 정확도를 갱신할 경우 모델 저장
    if val_accuracy1 > best_score1 and val_accuracy2 > best_score2:
        best_score1, best_score2 = val_accuracy1, val_accuracy2
        torch.save(model.state_dict(), best_model_path)
        print(f"Model saved for epoch {epoch+1} with accuracy1: {best_score1:.4f} and accuracy2: {best_score2:.4f}\n")


Training Epoch 1/10: 100%|██████████| 90/90 [00:14<00:00,  6.16it/s]



Epoch 1/10
Train Loss: 0.5961
Validation Loss: 0.6720
Validation Accuracy1: 0.4750
Validation Accuracy2: 0.3250
Validation F1 Score for Label1: 0.3059
Validation F1 Score for Label2: 0.1594

Model saved for epoch 1 with accuracy1: 0.4750 and accuracy2: 0.3250



Training Epoch 2/10: 100%|██████████| 90/90 [00:11<00:00,  7.86it/s]



Epoch 2/10
Train Loss: 0.5573
Validation Loss: 0.6768
Validation Accuracy1: 0.4750
Validation Accuracy2: 0.3250
Validation F1 Score for Label1: 0.3059
Validation F1 Score for Label2: 0.1594



Training Epoch 3/10: 100%|██████████| 90/90 [00:11<00:00,  7.84it/s]



Epoch 3/10
Train Loss: 0.5321
Validation Loss: 0.6120
Validation Accuracy1: 0.4750
Validation Accuracy2: 0.4750
Validation F1 Score for Label1: 0.3059
Validation F1 Score for Label2: 0.3924



Training Epoch 4/10: 100%|██████████| 90/90 [00:11<00:00,  7.84it/s]



Epoch 4/10
Train Loss: 0.4912
Validation Loss: 0.5668
Validation Accuracy1: 0.5000
Validation Accuracy2: 0.5500
Validation F1 Score for Label1: 0.4218
Validation F1 Score for Label2: 0.4457

Model saved for epoch 4 with accuracy1: 0.5000 and accuracy2: 0.5500



Training Epoch 5/10: 100%|██████████| 90/90 [00:11<00:00,  7.84it/s]



Epoch 5/10
Train Loss: 0.4431
Validation Loss: 0.5491
Validation Accuracy1: 0.4750
Validation Accuracy2: 0.5750
Validation F1 Score for Label1: 0.3988
Validation F1 Score for Label2: 0.5304



Training Epoch 6/10: 100%|██████████| 90/90 [00:11<00:00,  7.84it/s]



Epoch 6/10
Train Loss: 0.4173
Validation Loss: 0.5481
Validation Accuracy1: 0.5500
Validation Accuracy2: 0.6000
Validation F1 Score for Label1: 0.5077
Validation F1 Score for Label2: 0.5720

Model saved for epoch 6 with accuracy1: 0.5500 and accuracy2: 0.6000



Training Epoch 7/10: 100%|██████████| 90/90 [00:11<00:00,  7.83it/s]



Epoch 7/10
Train Loss: 0.3664
Validation Loss: 0.5605
Validation Accuracy1: 0.4750
Validation Accuracy2: 0.6000
Validation F1 Score for Label1: 0.4082
Validation F1 Score for Label2: 0.5844



Training Epoch 8/10: 100%|██████████| 90/90 [00:11<00:00,  7.84it/s]



Epoch 8/10
Train Loss: 0.3229
Validation Loss: 0.5709
Validation Accuracy1: 0.5500
Validation Accuracy2: 0.5250
Validation F1 Score for Label1: 0.4992
Validation F1 Score for Label2: 0.5086



Training Epoch 9/10: 100%|██████████| 90/90 [00:11<00:00,  7.84it/s]



Epoch 9/10
Train Loss: 0.2746
Validation Loss: 0.6176
Validation Accuracy1: 0.5000
Validation Accuracy2: 0.5000
Validation F1 Score for Label1: 0.4461
Validation F1 Score for Label2: 0.4626



Training Epoch 10/10: 100%|██████████| 90/90 [00:11<00:00,  7.83it/s]



Epoch 10/10
Train Loss: 0.2349
Validation Loss: 0.6205
Validation Accuracy1: 0.5250
Validation Accuracy2: 0.5000
Validation F1 Score for Label1: 0.4907
Validation F1 Score for Label2: 0.4429



In [12]:
# 학습이 끝난 후 가장 좋은 성능을 보인 모델을 불러옵니다.
model.load_state_dict(torch.load(best_model_path))
model.eval()

# 테스트 데이터셋에 대한 예측을 수행
true_labels1 = []
true_labels2 = []
predicted_labels1 = []
predicted_labels2 = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, label1, label2 = [b.to(device) for b in batch]
        logits1, logits2 = model(input_ids, attention_mask)

        true_labels1.extend(label1.tolist())
        true_labels2.extend(label2.tolist())
        predicted_labels1.extend(logits1.argmax(dim=1).tolist())
        predicted_labels2.extend(logits2.argmax(dim=1).tolist())

# f1-score를 포함한 성능 지표를 출력
accuracy1 = sum([1 if true == pred else 0 for true, pred in zip(true_labels1, predicted_labels1)]) / len(true_labels1)
accuracy2 = sum([1 if true == pred else 0 for true, pred in zip(true_labels2, predicted_labels2)]) / len(true_labels2)

f1_1 = f1_score(true_labels1, predicted_labels1, average='weighted')
f1_2 = f1_score(true_labels2, predicted_labels2, average='weighted')

print(f"Test Accuracy for Label1: {accuracy1:.4f}")
print(f"Test Accuracy for Label2: {accuracy2:.4f}")
print(f"F1 Score for Label1: {f1_1:.4f}")
print(f"F1 Score for Label2: {f1_2:.4f}")

Test Accuracy for Label1: 0.5500
Test Accuracy for Label2: 0.6800
F1 Score for Label1: 0.5353
F1 Score for Label2: 0.6504


In [13]:
print("\nTrue vs Predicted Labels:")
for true1, pred1, true2, pred2 in zip(true_labels1, predicted_labels1, true_labels2, predicted_labels2):
    print(f"Label1 - True: {true1 + 1}, Predicted: {pred1 + 1} | Label2 - True: {true2 + 1}, Predicted: {pred2 + 1}")



True vs Predicted Labels:
Label1 - True: 2, Predicted: 4 | Label2 - True: 3, Predicted: 3
Label1 - True: 3, Predicted: 3 | Label2 - True: 4, Predicted: 3
Label1 - True: 3, Predicted: 4 | Label2 - True: 3, Predicted: 3
Label1 - True: 3, Predicted: 4 | Label2 - True: 3, Predicted: 3
Label1 - True: 2, Predicted: 4 | Label2 - True: 3, Predicted: 3
Label1 - True: 3, Predicted: 3 | Label2 - True: 3, Predicted: 4
Label1 - True: 4, Predicted: 3 | Label2 - True: 4, Predicted: 3
Label1 - True: 3, Predicted: 3 | Label2 - True: 3, Predicted: 3
Label1 - True: 4, Predicted: 4 | Label2 - True: 3, Predicted: 3
Label1 - True: 2, Predicted: 3 | Label2 - True: 3, Predicted: 4
Label1 - True: 4, Predicted: 4 | Label2 - True: 3, Predicted: 3
Label1 - True: 3, Predicted: 3 | Label2 - True: 4, Predicted: 3
Label1 - True: 4, Predicted: 3 | Label2 - True: 3, Predicted: 4
Label1 - True: 3, Predicted: 3 | Label2 - True: 2, Predicted: 2
Label1 - True: 4, Predicted: 4 | Label2 - True: 4, Predicted: 3
Label1 - True