In [1]:
# HuggingFace랑 연동되어 한국어 모델을 포함해 다양한 큰 규모의(large-scale) 트랜스포머 모델(BERT, ELECTRA 등)을 제공한다.
!pip install transformers
# 직접 사전(dictionary)를 직접 정의하지 않고도, 한국어가 가지는 특수한 단어를 포함하여 토큰으로 만들 수 있도록 도와준다.
!pip install soynlp



In [2]:

# PyTorch 라이브러리 불러오기
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# 사전 학습된 트랜스포머(Transformer) 모델 사용하기
from transformers import (
    # HuggingFace에 있는 ELECTRA 모델과 연동된 4가지 라이브러리
    # 왜 BERT가 있는데 굳이 ELECTRA를 가져와야 할까요?
    # 이전에 했던 BigBird는 사실상 BERT와 유사한데
    # ELECTRA는 생성자(Generator)가 따로 존재하는 구조로 다르기 때문에 다른 아키텍처
    ElectraPreTrainedModel, # 대규모 데이터로 사전 학습된 모델
    ElectraModel,
    ElectraConfig,
    ElectraTokenizer, # 한국어 문장에서 자주 등장하는 "토큰"에 대해서 vocabulary 정의
    BertPreTrainedModel,
    BertModel,
    BertConfig,
    BertTokenizer
)

# 트랜스포머(Transformer) 학습을 위한 라이브러리 불러오기
from transformers import AdamW, get_linear_schedule_with_warmup

# 한국어 모델 학습을 위한 정규화 라이브러리 사용
from soynlp.normalizer import emoticon_normalize, repeat_normalize

# 기타 라이브러리 불러오기
import os
import re
import copy
import json
import logging
import random
import numpy as np

# 모델 학습 및 학습된 모델 평가를 위한 라이브러리
from sklearn.metrics import accuracy_score, f1_score, classification_report
from tqdm import tqdm, trange

# 하이퍼 파라미터 정의 목적
from argparse import Namespace
from types import SimpleNamespace

from torch.optim import AdamW
import gdown

In [3]:
args = SimpleNamespace()


directory_path = "/content/data"  # 원하는 디렉토리 경로로 변경하세요.

# 디렉토리 생성
if not os.path.exists(directory_path):
    os.makedirs(directory_path)
else:
    pass

args.task = "koreanNewsClassification"  # The name of the task to train
args.model_dir = "./model"  # Path to save, load model
#args.model_dir = "/content/Colab Notebooks/"
args.model_dir1 = "/content/drive/MyDrive/model"
args.data_dir = ""  # The input data dir
args.input_file = "complete_train.csv"
#args.input_file = "/content/complete.csv"
#args.input_test_file = "/content/test_dataset.csv"
args.input_test_file = "complete_test.csv"
args.pred_dir = "./preds"  # Directory that saves prediction files

args.train_file = "train.csv"  # Train file
args.dev_file = "validate.csv"  # Dev file
args.test_file = "test.csv"  # Test file
args.prediction_file = "prediction.csv"  # Output file for prediction

args.model_type = "koelectra-base-v2"  # Model type selected
args.model_name_or_path = "monologg/koelectra-base-v2-discriminator"  # Model name or path

args.seed = 42  # random seed for initialization
args.train_batch_size = 16  # Batch size for training.
args.eval_batch_size = 32  # Batch size for evaluation.
args.max_seq_len = 512  # The maximum total input sequence length after tokenization.
args.learning_rate = 5e-5  # The initial learning rate for Adam.
args.num_train_epochs = 20  # Total number of training epochs to perform.
args.weight_decay = 0.0  # Weight decay if we apply some.
args.gradient_accumulation_steps = 1  # Number of updates steps to accumulate before performing a backward/update pass.
args.adam_epsilon = 1e-8  # Epsilon for Adam optimizer.
args.max_grad_norm = 1.0  # Max gradient norm.
args.max_steps = -1  # If > 0: set total number of training steps to perform. Override num_train_epochs.
args.warmup_proportion = 0.1  # Warmup proportion for linear warmup

args.logging_steps = 200  # Log and save every X updates steps.

args.do_train = True  # Whether to run training.
args.do_pred = True  # Whether to run prediction on the test set.
args.no_cuda = False  # Avoid using CUDA when available

args.politic_loss_coef = 0.5  # Coefficient for the politic loss. (Changed from bias_loss_coef)
args.government_loss_coef = 1.0  # Coefficient for the government loss. (Changed from hate_loss_coef)


In [4]:
class PoliticClassificationHead(nn.Module):
    def __init__(self, config, num_politic_labels):
        super().__init__()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_politic_labels)

    def forward(self, x):
        x = self.dropout(x)
        x = self.classifier(x)
        return x


class GovernmentClassificationHead(nn.Module):
    def __init__(self, config, num_government_labels):
        super().__init__()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_government_labels)

    def forward(self, x):
        x = self.dropout(x)
        x = self.classifier(x)
        return x


In [5]:
class ElectraForPoliticClassification(ElectraPreTrainedModel):
    def __init__(self,
                 config: ElectraConfig,
                 args: Namespace,
                 politic_label_lst=None,
                 government_label_lst=None):
        super().__init__(config)
        self.args = args
        self.num_politic_labels = len(politic_label_lst) if politic_label_lst is not None else 0
        self.num_government_labels = len(government_label_lst) if government_label_lst is not None else 0

        self.electra = ElectraModel(config)  # 특징 추출기

        self.politic_classifier = PoliticClassificationHead(config, self.num_politic_labels)
        self.government_classifier = GovernmentClassificationHead(config, self.num_government_labels)

        self.loss_fct = nn.CrossEntropyLoss()
        self.init_weights()
    def forward(self, input_ids, attention_mask=None, token_type_ids=None, politic_labels=None, government_labels=None):
        discriminator_hidden_states = self.electra(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        # discriminator_hidden_states는 BaseModelOutputWithPastAndCrossAttentions의 객체 중 하나.
        # [0]는 마지막 레이어의 hidden states로 (batch_size, sequence_length, hidden_size) 형태

        # discriminator_hidden_states[0][:, 0]의 의미:
        #   => 마지막 레이어의 hidden states 중에서 모든 배치를 유지한 상태로 첫째 sequence token만 확인 ([CLS] 토큰)
        pooled_output = discriminator_hidden_states[0][:, 0]
        politic_logits = self.politic_classifier(pooled_output)
        government_logits = self.government_classifier(pooled_output)

        total_loss = 0

        if politic_labels is not None:
            politic_loss = self.loss_fct(politic_logits.view(-1, self.num_politic_labels), politic_labels.view(-1))
            total_loss += self.args.politic_loss_coef * politic_loss

        if government_labels is not None:
            government_loss = self.loss_fct(government_logits.view(-1, self.num_government_labels), government_labels.view(-1))
            total_loss += self.args.government_loss_coef * government_loss

        outputs = ((politic_logits, government_logits),) + discriminator_hidden_states[1:]
        outputs = (total_loss,) + outputs

        return outputs



class BertForPoliticClassification(BertPreTrainedModel):
    def __init__(self,
                 config: BertConfig,
                 args: Namespace,
                 politic_label_lst=None,
                 government_label_lst=None):
        super().__init__(config)
        self.args = args
        self.num_politic_labels = len(politic_label_lst) if politic_label_lst is not None else 0
        self.num_government_labels = len(government_label_lst) if government_label_lst is not None else 0

        self.bert = BertModel(config)  # 특징 추출기

        self.politic_classifier = PoliticClassificationHead(config, self.num_politic_labels)
        self.government_classifier = GovernmentClassificationHead(config, self.num_government_labels)

        self.loss_fct = nn.CrossEntropyLoss()

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        politic_labels=None,
        government_labels=None,
        output_attentions=None,
        output_hidden_states=None,
    ):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        pooled_output = outputs[1]

        politic_logits = self.politic_classifier(pooled_output)
        government_logits = self.government_classifier(pooled_output)

        total_loss = 0

        if politic_labels is not None:
            politic_loss = self.loss_fct(politic_logits.view(-1, self.num_politic_labels), politic_labels.view(-1))
            total_loss += self.args.politic_loss_coef * politic_loss

        if government_labels is not None:
            government_loss = self.loss_fct(government_logits.view(-1, self.num_government_labels), government_labels.view(-1))
            total_loss += self.args.government_loss_coef * government_loss

        outputs = ((politic_logits, government_logits),) + outputs[2:]

        outputs = (total_loss,) + outputs

        return outputs

In [6]:

MODEL_CLASSES = {
    "koelectra-base": (ElectraConfig, ElectraForPoliticClassification, ElectraTokenizer),
    "koelectra-small": (ElectraConfig, ElectraForPoliticClassification, ElectraTokenizer),
    "koelectra-base-v2": (ElectraConfig, ElectraForPoliticClassification, ElectraTokenizer),
    "koelectra-small-v2": (ElectraConfig, ElectraForPoliticClassification, ElectraTokenizer),
    "kcbert-base": (BertConfig, BertForPoliticClassification, BertTokenizer),
}


def load_tokenizer(args):
    return MODEL_CLASSES[args.model_type][2].from_pretrained(args.model_name_or_path)



def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if not args.no_cuda and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)

def compute_metrics(pred_politic_labels, pred_government_labels, gt_politic_labels, gt_government_labels):
    politic_weighted_f1 = f1_score(gt_politic_labels, pred_politic_labels, average="weighted")
    government_weighted_f1 = f1_score(gt_government_labels, pred_government_labels, average="weighted")

    politic_macro_f1 = f1_score(gt_politic_labels, pred_politic_labels, average="macro")
    government_macro_f1 = f1_score(gt_government_labels, pred_government_labels, average="macro")

    mean_weighted_f1 = (politic_weighted_f1 + government_weighted_f1) / 2
    return {
        "politic_weighted_f1": politic_weighted_f1,
        "government_weighted_f1": government_weighted_f1,
        "mean_weighted_f1": mean_weighted_f1,
        "politic_macro_f1": politic_macro_f1,
        "government_macro_f1": government_macro_f1
    }

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

def load_data_from_csv(filename):
    df = pd.read_csv(filename)

    # title, content, label1, label2에 NaN 값이 있는 행만 제거
    df.dropna(subset=['title', 'content', 'label1', 'label2'], inplace=True)

    # title과 content를 합친다.
    df['text'] = df['title'] + " " + df['content']
    df['text'] = df['text'].str.replace('\n', '')

    # 필요한 열만 선택
    df = df[['text', 'label1', 'label2']]
    return df  #여기 나중에 df로 수정


class InputExample(object):
    """ A single training/test example for simple sequence classification. """

    def __init__(self,
                 guid,
                 text,
                 politic,
                 government):
        self.guid = guid
        self.text = text
        self.politic = politic
        self.government = government

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"


# DataFrame에서 InputExample 객체를 생성하는 함수
def create_examples_from_dataframe(df, set_type):
    examples = []
    for index, row in df.iterrows():
        guid = "%s-%s" % (set_type, index)
        text = row['text']
        politic = row['label1']
        government = row['label2']

        examples.append(InputExample(guid=guid,
                                     text=text,
                                     politic=politic,
                                     government=government))
    return examples


# 주어진 CSV 파일 경로
filename = os.path.join(args.data_dir, args.input_file)
df = load_data_from_csv(filename)


# 데이터셋 분리
#stratify 옵션 꺼줌 -> 에러남
#train_df, valid_df = train_test_split(df, test_size=0.1)
train_df, valid_df = train_test_split(df, train_size=4000,  random_state=args.seed)
test_df = os.path.join(args.data_dir, args.input_test_file)
test_df = load_data_from_csv(test_df)


# 분리된 데이터셋을 각각의 파일로 저장
train_df.to_csv(os.path.join(args.data_dir, args.train_file), index=False)
valid_df.to_csv(os.path.join(args.data_dir, args.dev_file), index=False)
test_df.to_csv(os.path.join(args.data_dir, args.test_file), index=False)


# InputExample 객체 생성
train_examples = create_examples_from_dataframe(train_df, "train")
valid_examples = create_examples_from_dataframe(valid_df, "dev")
test_examples = create_examples_from_dataframe(test_df, "test")

In [8]:
# 하나의 텍스트 입력의 특징(feature)에 대한 클래스
import ast
import csv
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, attention_mask, token_type_ids, politic_label, government_label):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids
        self.politic_label = politic_label
        self.government_label = government_label
        print("self.politic_label: ", self.politic_label)
        print("self.government_label: ", self.government_label)
    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"

# 한국어 뉴스 기사 및 댓글 전처리 기능 클래스
class KoreanNewsProcessor(object):
    """Processor for the Korean News data set """

    def __init__(self, args):
        self.args = args

    @classmethod
    def get_labels(cls):
        politic_lst = [str(i) for i in range(1, 6)]  # '1' to '5'
        government_lst = [str(i) for i in range(0, 6)]  # '0' to '5'

        print("politic_lst:", politic_lst, "government_lst:", government_lst)
        return politic_lst, government_lst

    @classmethod
    def _read_file(cls, input_file):
        """Reads a tab separated value file."""
        with open(input_file, "r", encoding="utf-8") as f:
            lines = []
            for line in f:
                lines.append(line.strip())
            return lines


    def _create_examples(self, lines, set_type):
        """Creates examples for the train, dev and test sets."""
        examples = []
        reader = csv.reader(lines[1:], delimiter=',', quotechar='"')

        for i, parsed_line in enumerate(reader):
            if len(parsed_line) < 3 and set_type != "test":
                print(f"Error in line {i+1}: {parsed_line}")
                continue
            guid = "%s-%s" % (set_type, i)
            text = parsed_line[0]

            politic = None
            government = None
          #  if set_type != "test": #test파일이 라벨 없는 파일일 때 사용(예를 들어 오토라벨링 시)
            politic = parsed_line[1]
            government = parsed_line[2]
            if i % 1000 == 0:
                print(text)

            examples.append(InputExample(guid=guid,
                                        text=text,
                                        politic=politic,
                                        government=government))
        return examples




    def get_examples(self, mode):
        """
        Args:
            mode: train, dev, test
        """
        file_to_read = None
        if mode == 'train':
            file_to_read = self.args.train_file
        elif mode == 'dev':
            file_to_read = self.args.dev_file
        elif mode == 'test':
            file_to_read = self.args.test_file

        print("LOOKING AT {}".format(os.path.join(self.args.data_dir, file_to_read)))
        return self._create_examples(self._read_file(os.path.join(self.args.data_dir, file_to_read)), mode)

# 실질적으로 텍스트 입력을 트랜스포머 모델에 넣어 특징(feature)을 계산하는 함수
def convert_examples_to_features(examples, tokenizer, max_length):
    # 라벨 리스트를 가져옵니다.
    politic_label_list, government_label_list = KoreanNewsProcessor.get_labels()

    # 라벨을 인덱스로 매핑합니다.
    politic_label_map = {label: i for i, label in enumerate(politic_label_list)}
    government_label_map = {label: i for i, label in enumerate(government_label_list)}

    def label_from_example(example):
        politic_label_id = -1
        government_label_id = -1
        if example.politic is not None:
            politic_label_id = politic_label_map[example.politic]
        if example.government is not None:
            government_label_id = government_label_map[example.government]
        return politic_label_id, government_label_id

    labels = [label_from_example(example) for example in examples]

    texts = [example.text for example in examples]

    batch_encoding = tokenizer.batch_encode_plus(texts, max_length=max_length, padding='max_length', truncation=True)

    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        if "token_type_ids" not in inputs:
            inputs["token_type_ids"] = [0] * len(inputs["input_ids"])  # For models like xlm-roberta

        feature = InputFeatures(input_ids=inputs["input_ids"],
                        attention_mask=inputs["attention_mask"],
                        token_type_ids=inputs["token_type_ids"],
                        politic_label=labels[i][0],
                        government_label=labels[i][1])

        features.append(feature)

    for i, example in enumerate(examples[:5]):
        print("*** Example ***")
        print("guid: {}".format(example.guid))
        print("input_ids: {}".format(" ".join([str(x) for x in features[i].input_ids])))
        print("attention_mask: {}".format(" ".join([str(x) for x in features[i].attention_mask])))
        print("token_type_ids: {}".format(" ".join([str(x) for x in features[i].token_type_ids])))
        print("politic_label: {}".format(features[i].politic_label))
        print("government_label: {}".format(features[i].government_label))

    print("Number of features:", len(features))

    return features



def load_examples(args, tokenizer, mode):
    processor = KoreanNewsProcessor(args)

    print("Creating features from dataset file at %s", args.data_dir)
    if mode == "train":
        examples = processor.get_examples("train")
    elif mode == "dev":
        examples = processor.get_examples("dev")
    elif mode == "test":
        examples = processor.get_examples("test")
    else:
        raise Exception("For mode, Only train, dev, test is available")

    features = convert_examples_to_features(
        examples,
        tokenizer,
        args.max_seq_len
    )

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_politic_labels = torch.tensor([f.politic_label for f in features], dtype=torch.long)  # 수정
    all_government_labels = torch.tensor([f.government_label for f in features], dtype=torch.long)  # 수정

    dataset = TensorDataset(all_input_ids,
                            all_attention_mask,
                            all_token_type_ids,
                            all_politic_labels,  # 수정
                            all_government_labels)  # 수정
    return dataset


In [9]:
processor = KoreanNewsProcessor(args)
examples = processor.get_examples("train")
texts = [example.text for example in examples]

LOOKING AT train.csv
자체 핵무장론 펴는 남북대결주의자에 통일부 맡겨 신임 통일부 장관 후보자에 지명된 김영호 성신여대 정치외교학과 교수가 29일 인사청문회 준비 사무실이 마련된 서울 종로구 남북회담본부에 도착해 장관 후보자가 된 입장을 밝히고 있다. 연합뉴스윤석열 대통령이 29일 통일부 장관 후보자로 김영호(64) 성신여대 교수(정치외교학)를 지명하고, 통일부 차관으로는 외교부 출신인 문승현(59) 주태국(타이)대사를 기용하면서 통일부 장차관이 모두 외부 인사로 채워지게 됐다. 통일부 장차관이 모두 외부인으로 채워진 것은 김영삼 정부 때의 ‘권오기 장관, 김석우 차관’ 체제 이후 25년 만이다. 특히 김영호 후보자는 그동안 “김정은 정권 타도”를 주장하고 여러차례 자체 핵무장을 강조한 ‘남북대결주의자’라는 점에서, 대북·통일 정책을 펼쳐야 할 통일부를 형해화하는 인선이라는 지적이 나온다.김 후보자는 그동안 언론 기고와 유튜브 등을 통해 적대적 대북관을 내보이고, 자체 핵무장과 미국 전술핵 한반도 재배치를 주장해왔다. 그는 2019년 4월18일 인터넷 매체 <펜앤드마이크> 기고에서 “김정은 정권이 타도되고 북한 자유화가 이루어져서 남북한 정치체제가 ‘1체제’가 되었을 때 통일의 길이 비로소 열리게 된다”고 주장했다. 2018년 9월13일 같은 매체에 기고한 글에서는 “남북관계는 적대관계”라고 주장했다. 지난 2월20일 유튜브에서는 “미국은 한국에 전술핵무기를 재배치하는 것을 적극적으로 고려해야 되고 한국도 미국에 그런 요구를 강력하게 해야 될 시점”이라며 미국 전술핵의 한반도 재배치 필요성을 주장했다.“남북관계는 적대관계”라서 “김정은 정권 타도”로 통일의 길을 열어야 한다는 김 후보자의 주장은 ‘강압적 흡수통일론’으로 볼 수 있다. 이는 “흡수통일을 추구하지 않는다”는 윤석열 정부 공식 방침과 배치된다. 전술핵 재배치 주장 또한 정부 입장에 반한다. 권영세 통일부 장관은 지난해 10월 국회 국정감사에서 여권 일각에서 주장하는 전술핵 재배치에 

In [10]:
examples_test = processor.get_examples("test")

LOOKING AT test.csv
민주당 "윤석열 정부의 정치생명, 원희룡 입에 달려있다" 더불어민주당이 '양평 고속도로 백지화' 관련 원희룡 장관에게 "가짜뉴스 유포 그만하고 사전에 사업 백지화를 재가 받았는지 밝히라"고 촉구했다.강선우 민주당 대변인은 7일 서면브리핑에서 "원희룡 장관이 지금 해야 할 일은 서울-양평 고속도로 종점 변경을 둘러싸고 꼬리를 무는 의문에 답하는 것"이라며 이 같이 밝혔다.그는 "원희룡 장관이 라디오에 출연해 '민주당이 먼저 발표된 노선대로의 변경을 요청했다'며 우리 당 최재관 지역위원장, 정동균 당시 양평군수에게 책임을 덮어씌웠다"면서 "이는 새빨간 거짓말이다. 주무장관이라는 사람이 국책 사업에 대해서 사실도 확인하지 않고 가짜뉴스를 유포하고 있으니 어처구니없다"고 주장했다.그는 "팩트를 알려드리겠다. 2년 전에는 변경안 자체가 없었다. 그리고 당시 당정협의를 거쳐 설치하고자 했던 나들목은 강하면 방면이었다"면서 "입을 열 때마다 하나같이 가짜뉴스로 국민을 선동하고 있으니 이쯤되면 국토교통부 장관이 아니라 국민선동부 장관 아닌가. 원희룡 장관, 혹시 롤모델이 괴벨스인가"라고 반문했다.그는 "사업 백지화에 비판이 쏟아지니 백지화의 책임을 민주당에 덮어씌우려는 원희룡 장관의 나름의 기만술은 처량하고 한심하다"며 "원희룡 장관은 가짜뉴스 유포 그만하고, 사전에 사업 백지화를 윤석열 대통령에게 재가 받았는지나 밝혀라"라고 촉구했다.그는 "원희룡 장관이 판돈으로 건 것은 자신의 정치생명으로 그치지 않을 것"이라며 "윤석열 정부의 정치생명이 원희룡 장관의 입에 달렸음을 명심하고 성실하게 답하라"고 촉구했다.


In [11]:
def compute_metrics(politic_preds, government_preds, politic_labels, government_labels):
    politic_accuracy = accuracy_score(politic_labels, politic_preds)
    government_accuracy = accuracy_score(government_labels, government_preds)

    politic_f1 = f1_score(politic_labels, politic_preds, average='weighted')
    government_f1 = f1_score(government_labels, government_preds, average='weighted')

    politic_f1_macro = f1_score(politic_labels, politic_preds, average='macro')
    government_f1_macro = f1_score(government_labels, government_preds, average='macro')

    mean_weighted_f1 = (politic_f1+government_f1)/2
    mean_macro_f1 = (politic_f1+government_f1)/2

    #절대 오차 구하기
    tolerance = 1

    politic_absolute_errors = [abs(pred - true) for pred, true in zip(politic_preds, politic_labels)]
    politic_within_tolerance = [error <= tolerance for error in politic_absolute_errors]

    government_absolute_errors = [abs(pred - true) for pred, true in zip(government_preds, government_labels)]
    government_within_tolerance = [error <= tolerance for error in government_absolute_errors]

    #절대 오차 라벨 정확도
    politic_abs_error_accuracy = sum(politic_within_tolerance) / len(politic_preds)
    government_abs_error_accuracy = sum(government_within_tolerance) / len(government_preds)

    #절대 오차 라벨 F1 점수
    politic_abs_error_f1 = f1_score([1 if within else 0 for within in politic_within_tolerance], [1] * len(politic_preds)) #모든 라벨을 1로 만들고(실제 라벨 값은 모두 정답이니까), 절대 오차 안에 있으면1, 아니면 0으로 이진분류로 f1 구하기.
    government_abs_error_f1 = f1_score([1 if within else 0 for within in government_within_tolerance], [1] * len(government_preds))



    return {
        "politic_accuracy": politic_accuracy,
        "government_accuracy": government_accuracy,

        "politic_weighted_f1": politic_f1,
        "government_weighted_f1": government_f1,

        "politic_f1_macro": politic_f1_macro,
        "government_f1_macro": government_f1_macro,

        "politic_abs_error_accuracy" : politic_abs_error_accuracy,
        "government_abs_error_accuracy" : government_abs_error_accuracy,

        "politic_abs_error_f1": politic_abs_error_f1,
        "government_abs_error_f1" : government_abs_error_f1,

   #     "politic_absolute_errors" : politic_absolute_errors,
    #    "government_absolute_errors" : government_absolute_errors,

        "mean_weighted_f1": mean_weighted_f1,
        "mean_macro_f1": mean_macro_f1
        }

In [12]:
from torch.nn.modules.module import register_module_buffer_registration_hook

train_losses =[]
valid_losses =[]

valid_accuracies_1 = []
valid_accuracies_2 =[]

valid_f1_scores_1 = []
valid_f1_scores_2 = []

class Trainer(object):
    global model
    def __init__(self, args, tokenizer, train_dataset=None, dev_dataset=None, test_dataset=None):
        self.args = args
        self.tokenizer = tokenizer
        self.train_dataset = train_dataset
        self.dev_dataset = dev_dataset
        self.test_dataset = test_dataset

        self.politic_label_lst, self.government_label_lst = KoreanNewsProcessor.get_labels()  # 변경

        self.config_class, self.model_class, _ = MODEL_CLASSES[args.model_type]

        self.config = self.config_class.from_pretrained(args.model_name_or_path,
                                                        finetuning_task=args.task)
        self.model = self.model_class.from_pretrained(args.model_name_or_path,
                                                      config=self.config,
                                                      args=args,
                                                      politic_label_lst=self.politic_label_lst,  # 변경
                                                      government_label_lst=self.government_label_lst)  # 변경


        # GPU or CPU
        self.device = "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
        self.model.to(self.device)

    def train(self):

        train_sampler = RandomSampler(self.train_dataset)
        train_dataloader = DataLoader(self.train_dataset, sampler=train_sampler, batch_size=self.args.train_batch_size)

        if self.args.max_steps > 0:
            t_total = self.args.max_steps
            self.args.num_train_epochs = self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs

        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': self.args.weight_decay},
            {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=int(t_total * self.args.warmup_proportion),
                                                    num_training_steps=t_total)

        # Train!
        print("***** Running training *****")
        print("  Num examples = %d", len(self.train_dataset))
        print("  Num Epochs = %d", self.args.num_train_epochs)
        print("  Total train batch size = %d", self.args.train_batch_size)
        print("  Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
        print("  Total optimization steps = %d", t_total)
        print("  Logging steps = %d", self.args.logging_steps)

        global_step = 0
        tr_loss = 0.0
        best_mean_weighted_f1 = 0.0
        best_politic_acc = 0.0
        best_government_acc = 0.0
        self.model.zero_grad()

        train_iterator = trange(int(self.args.num_train_epochs), desc="Epoch", initial=1)
        a =0
        for _ in train_iterator:
            a += 1
            print()
            print("Epoch: ",a, end="")

            #epoch마다 정확도 및 손실
            politic_preds = None
            politic_out_label_ids = None
            government_preds = None
            government_out_label_ids = None
            tr_batch_loss = 0

            epoch_iterator = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(epoch_iterator):
                self.model.train()
                batch = tuple(t.to(self.device) for t in batch)  # GPU or CPU
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'politic_labels': batch[3],  # 변경
                          'government_labels': batch[4]}  # 변경
                if self.args.model_type != 'distilkobert':
                    inputs['token_type_ids'] = batch[2]
                outputs = self.model(**inputs)
                loss, (politic_logits, government_logits)  = outputs[:2]

                if self.args.gradient_accumulation_steps > 1:
                    loss = loss / self.args.gradient_accumulation_steps

                loss.backward()

                # Politic
                if politic_preds is None:
                    politic_preds = politic_logits.detach().cpu().numpy()
                    politic_out_label_ids = inputs['politic_labels'].detach().cpu().numpy()
                else:
                    politic_preds = np.append(politic_preds, politic_logits.detach().cpu().numpy(), axis=0)
                    politic_out_label_ids = np.append(
                        politic_out_label_ids, inputs['politic_labels'].detach().cpu().numpy(), axis=0)

                # Government
                if government_preds is None:
                    government_preds = government_logits.detach().cpu().numpy()
                    government_out_label_ids = inputs['government_labels'].detach().cpu().numpy()
                else:
                    government_preds = np.append(government_preds, government_logits.detach().cpu().numpy(), axis=0)
                    government_out_label_ids = np.append(
                        government_out_label_ids, inputs['government_labels'].detach().cpu().numpy(), axis=0)


                tr_batch_loss += loss.mean().item()
                tr_loss += loss.item()
                if (step + 1) % self.args.gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)

                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    self.model.zero_grad()
                    global_step += 1

            train_losses.append(tr_batch_loss/(step+1))


            # 정확도 계산
            politic_preds = np.argmax(politic_preds, axis=1)
            government_preds = np.argmax(government_preds, axis=1)

            train_result = compute_metrics(politic_preds, government_preds, politic_out_label_ids, government_out_label_ids)  # 변경
            valid_accuracies_1.append(train_result["politic_accuracy"])
            valid_accuracies_2.append(train_result["government_accuracy"])

            #평가
            results = self.evaluate("dev")

            valid_accuracies_1.append(results["politic_accuracy"])
            valid_f1_scores_1.append(results["politic_f1_macro"])
            valid_accuracies_2.append(results["government_accuracy"])
            valid_f1_scores_2.append(results["government_f1_macro"])

            if results["politic_accuracy"] >= best_politic_acc and results["government_accuracy"] >= best_government_acc :  # Save best result based on accuracy
                best_politic_acc = results["politic_accuracy"]
                best_government_acc = results["government_accuracy"]

                self.save_model()


            print("train politic Accuracy:", train_result["politic_accuracy"], "train government accuracy:",train_result["government_accuracy"])

        return global_step, tr_loss / global_step

    def evaluate(self, mode):

        if mode == 'test':
            dataset = self.test_dataset
        elif mode == 'dev':
            dataset = self.dev_dataset
        else:
            raise Exception("Only dev and test dataset available")

        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.args.eval_batch_size)

        # Eval!
        print("***** Running evaluation on %s dataset *****", mode)
        print("  Num examples = %d", len(dataset))
        print("  Batch size = %d", self.args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0

        politic_preds = None
        politic_out_label_ids = None
        government_preds = None
        government_out_label_ids = None

        politic_results = []
        government_results = []

        self.model.eval()

        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(self.device) for t in batch)
            with torch.no_grad():
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'politic_labels': batch[3],  # 변경
                          'government_labels': batch[4]}  # 변경
                if self.args.model_type != 'distilkobert':
                    inputs['token_type_ids'] = batch[2]
                outputs = self.model(**inputs)
                tmp_eval_loss, (politic_logits, government_logits) = outputs[:2]  # 변경

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1

            # Politic
            if politic_preds is None:
                politic_preds = politic_logits.detach().cpu().numpy()
                politic_out_label_ids = inputs['politic_labels'].detach().cpu().numpy()
            else:
                politic_preds = np.append(politic_preds, politic_logits.detach().cpu().numpy(), axis=0)
                politic_out_label_ids = np.append(politic_out_label_ids, inputs['politic_labels'].detach().cpu().numpy(), axis=0)
                                                                            #.detach(): PyTorch 텐서에서 사용되는 메서드로, 그래디언트 계산을 중지하고 텐서를 복제
                                                                            #.cpu(): 텐서를 CPU로 이동, 일반적으로 모델은 gpu에서 학습하고 예측. 따라서 gpu에서 생성된 텐서를 cpu로 이동
                                                                            #.numpy() numpy 배열로 변환
            # Government
            if government_preds is None:
                government_preds = government_logits.detach().cpu().numpy()
                government_out_label_ids = inputs['government_labels'].detach().cpu().numpy()
            else:
                government_preds = np.append(government_preds, government_logits.detach().cpu().numpy(), axis=0)
                government_out_label_ids = np.append(
                    government_out_label_ids, inputs['government_labels'].detach().cpu().numpy(), axis=0)



        eval_loss = eval_loss / nb_eval_steps
        results = {
            "loss": eval_loss
        }
        valid_losses.append(eval_loss)
        #예측 확률 중 가장 높은 거 1개, 2개 뽑기
        top2_politic_preds = np.argsort(-politic_preds, axis=1)[:, :2]
        top2_government_preds = np.argsort(-government_preds, axis=1)[:, :2]

        politic_preds = np.argmax(politic_preds, axis=1)
        government_preds = np.argmax(government_preds, axis=1)
        result = compute_metrics(politic_preds, government_preds, politic_out_label_ids, government_out_label_ids)  # 변경
        results.update(result)

        top2_politic_accuracy = 0
        top2_government_accuracy = 0
        #top2 정확도
        for i in range(len(politic_out_label_ids)):
            if politic_out_label_ids[i] in top2_politic_preds[i]:
                top2_politic_accuracy += 1
            if government_out_label_ids[i] in top2_government_preds[i]:
                top2_government_accuracy += 1

        politic_correct = [true_label in top_predictions for true_label, top_predictions in zip(politic_out_label_ids, top2_politic_preds)]
        government_correct = [true_label in top_predictions for true_label, top_predictions in zip(government_out_label_ids, top2_government_preds)]

        top2_politic_accuracy /= len(politic_out_label_ids)
        top2_government_accuracy /= len(government_out_label_ids)

        #top2 f1
        top2_politic_f1 = f1_score(politic_correct, [1] * len(politic_correct))
        top2_government_f1 = f1_score(government_correct, [1] * len(government_correct))


        print("***** Eval results *****")

        for key in sorted(results.keys()):
            print("  %s = %s", key, str(results[key]))
        print("  %s = %s", "top2_politic_accuracy:", top2_politic_accuracy)
        print("  %s = %s", "top2_government_accuracy:", top2_government_accuracy)
        print("  %s = %s", "top2_politic_f1:", top2_politic_f1)
        print("  %s = %s", "top2_government_f1:", top2_government_f1)
        return results


    def save_model(self):
        # Save model checkpoint (Overwrite)
        if not os.path.exists(self.args.model_dir):
            os.makedirs(self.args.model_dir)
        model_to_save = self.model.module if hasattr(self.model, 'module') else self.model
        model_to_save.save_pretrained(self.args.model_dir)
        self.tokenizer.save_pretrained(self.args.model_dir)

        # Save training arguments together with the trained model
        torch.save(self.args, os.path.join(self.args.model_dir, 'training_args.bin'))
        print("Saving model checkpoint to %s", self.args.model_dir)

    def load_model(self):
        # Check whether model exists
        if not os.path.exists(self.args.model_dir1):
            raise Exception("Model doesn't exists! Train first!")

        self.config = self.config_class.from_pretrained(self.args.model_dir1)
        self.model = self.model_class.from_pretrained(self.args.model_dir1,
                                                      config=self.config,
                                                      args=self.args,
                                                      politic_label_lst=self.politic_label_lst,  # 변경
                                                      government_label_lst=self.government_label_lst)  # 변경
        self.model.to(self.device)
        print("***** Model Loaded *****")

#        self.predict()


In [None]:
set_seed(args)

tokenizer = load_tokenizer(args)
train_dataset = load_examples(args, tokenizer, mode="train")  # label_types 인자 제거
dev_dataset = load_examples(args, tokenizer, mode="dev")  # label_types 인자 제거
test_dataset = load_examples(args, tokenizer, mode="test")  # label_types 인자 제거
trainer = Trainer(args, tokenizer, train_dataset, dev_dataset, test_dataset)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
self.politic_label:  1
self.government_label:  1
self.politic_label:  3
self.government_label:  3
self.politic_label:  2
self.government_label:  0
self.politic_label:  2
self.government_label:  2
self.politic_label:  3
self.government_label:  0
self.politic_label:  2
self.government_label:  0
self.politic_label:  1
self.government_label:  0
self.politic_label:  3
self.government_label:  4
self.politic_label:  2
self.government_label:  3
self.politic_label:  1
self.government_label:  1
self.politic_label:  2
self.government_label:  3
self.politic_label:  1
self.government_label:  0
self.politic_label:  3
self.government_label:  0
self.politic_label:  0
self.government_label:  1
self.politic_label:  3
self.government_label:  0
self.politic_label:  1
self.government_label:  1
self.politic_label:  0
self.government_label:  1
self.politic_label:  3
self.government_label:  4
self.politic_label:  1
self.government_label:  0
self.politic_label:

In [None]:
if args.do_train:
    global_step, average_loss = trainer.train()
    print("***** Final Training Results *****")
    print(f"  Global Step: {global_step}")
    print(f"  Average Loss: {average_loss:.4f}")

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(18, 6))

# Train and Valid Losses 변화를 그립니다.
plt.subplot(1, 3, 1)
plt.plot(train_losses, label="Train Loss")
plt.plot(valid_losses, label="Valid Loss")
plt.title("Losses over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

# Accuracy 변화를 그립니다.
plt.subplot(1, 3, 2)
plt.plot(valid_accuracies_1, label="Accuracy Label 1")
plt.plot(valid_accuracies_2, label="Accuracy Label 2")
plt.title("Accuracy over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()

# F1-Score 변화를 그립니다.
plt.subplot(1, 3, 3)
plt.plot(valid_f1_scores_1, 'g-', alpha=0.7, label="Valid F1 Score Label 1")  # added alpha for transparency
plt.plot(valid_f1_scores_2, 'g--', label="Valid F1 Score Label 2")
plt.title("F1 Scores over Epochs")
plt.xlabel("Epoch")
plt.ylabel("F1 Score")
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
#trainer.load_model()

In [None]:
trainer.evaluate("test")