In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 모델 구현

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import os

In [3]:
%cd /content/drive/MyDrive/NLP_project
path = os.getcwd()
train_path = path + '/data/' + 'train.txt'
valid_path = path + '/data/' + 'valid.txt'
test_path = path  + '/data/' + 'test.txt'

/content/drive/MyDrive/NLP_project


In [None]:
#!pip install transformers
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder


In [12]:
class CustomDataset(Dataset):
    def __init__(self, tokenizer, sentences, labels, max_len):
        self.tokenizer = tokenizer
        self.sentences = sentences
        self.labels = labels
        self.max_len = max_len
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, item):
        encoding = self.tokenizer.encode_plus(
            self.sentences[item],
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': self.sentences[item],
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[item], dtype=torch.long)
        }

def load_data(file_path, label_encoder=None):
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        labels = []
        sentences = []
        for i, data in enumerate(lines):
          if i< 2:
            continue
          if data == '\n':
            continue
          ID, speaker, utt, ethics_types, immoral = data.strip().split('\t')
          sentences.append(utt)
          labels.append(ethics_types)
    if label_encoder is None:
        label_encoder = LabelEncoder()
        labels = label_encoder.fit_transform(labels)
    else:
        labels = label_encoder.transform(labels)
    return sentences, labels, label_encoder

In [20]:
# Hyperparameters
epochs = 10
batch_size = 16
max_len = 128
learning_rate = 1e-5


In [22]:
# Model & Tokenizer
tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base-v2022")
model = AutoModelForSequenceClassification.from_pretrained("beomi/KcELECTRA-base-v2022", num_labels=8)

# Load data & Dataset & DataLoader
train_sentences, train_labels, label_encoder = load_data(train_path)
val_sentences, val_labels, _ = load_data(valid_path, label_encoder)
test_sentences, test_labels, _ = load_data(test_path, label_encoder)

train_dataset = CustomDataset(tokenizer, train_sentences, train_labels, max_len)
val_dataset = CustomDataset(tokenizer, val_sentences, val_labels, max_len)
test_dataset = CustomDataset(tokenizer, test_sentences, test_labels, max_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

Some weights of the model checkpoint at beomi/KcELECTRA-base-v2022 were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.dense.bias', 'c

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(54343, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

In [None]:
# Training loop
for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    
    model.eval()
    val_predictions = []
    val_true_labels = []
    for batch in val_loader:
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask)
            val_predictions.extend(torch.argmax(outputs.logits, dim=1).tolist())
            val_true_labels.extend(labels.tolist())
    
    # Metrics
    val_accuracy = accuracy_score(val_true_labels, val_predictions)
    val_precision, val_recall, val_fscore, _ = precision_recall_fscore_support(val_true_labels, val_predictions, average='weighted')
    print(f'Epoch {epoch+1}/{epochs} | Val Accuracy: {val_accuracy:.2f} | Val Precision: {val_precision:.2f} | Val Recall: {val_recall:.2f} | Val F-score: {val_fscore:.2f}')

# Testing
test_predictions = []
test_true_labels = []
for batch in test_loader:
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        test_predictions.extend(torch.argmax(outputs.logits, dim=1).tolist())
        test_true_labels.extend(labels.tolist())

# Metrics
test_accuracy = accuracy_score(test_true_labels, test_predictions)
test_precision, test_recall, test_fscore, _ = precision_recall_fscore_support(test_true_labels, test_predictions, average='weighted')
print(f'Test Accuracy: {test_accuracy:.2f} | Test Precision: {test_precision:.2f} | Test Recall: {test_recall:.2f} | Test F-score: {test_fscore:.2f}')

# Save the model
model.save_pretrained('path_to_save_directory')
