# 사전 설치

In [None]:
!pip install wordcloud
!pip install konlpy
!pip install torch
!sudo apt-get update
!sudo apt-get install -y fonts-nanum fonts-noto-cjk
!fc-list | grep -i "nanum\|noto"
!sudo apt-get update
!sudo apt-get install openjdk-11-jdk -y

# Source Code

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim

import math
import numpy as np
import pandas as pd

import re
import konlpy
from konlpy.tag import Okt
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from pathlib import Path

from typing import Optional, Dict

from sklearn.metrics import f1_score, accuracy_score
from tqdm.auto import tqdm

import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
url = '../data/'
train = pd.read_csv(url+'train.csv')

# 지연님 생성 데이터
a = pd.read_csv(url+'general_dialog1.csv').rename(columns={'dialogue': 'conversation'})
a['class'] = '일반 대화'
a['idx'] = range(0,len(a))
a = a[['idx','class','conversation']]

# 유찬님 생성 데이터
b = pd.read_csv(url+'general_dialog2.csv')
b = b[b['class'] == "일반 대화"] # class에 일반대화가 아닌 conversation이 적혀있어 제거

train = pd.concat([train, a, b], axis=0,ignore_index=True).drop(columns='idx')
train.to_csv(url+"train_w_general_conv.csv", index=False)

In [37]:
raw_data = pd.read_csv('./data/train_w_general_conv.csv')
raw_data.head()

Unnamed: 0.1,Unnamed: 0,class,conversation
0,0,협박 대화,지금 너 스스로를 죽여달라고 애원하는 것인가?\n 아닙니다. 죄송합니다.\n 죽을 ...
1,1,협박 대화,길동경찰서입니다.\n9시 40분 마트에 폭발물을 설치할거다.\n네?\n똑바로 들어 ...
2,2,기타 괴롭힘 대화,너 되게 귀여운거 알지? 나보다 작은 남자는 첨봤어.\n그만해. 니들 놀리는거 재미...
3,3,갈취 대화,어이 거기\n예??\n너 말이야 너. 이리 오라고\n무슨 일.\n너 옷 좋아보인다?...
4,4,갈취 대화,저기요 혹시 날이 너무 뜨겁잖아요? 저희 회사에서 이 선크림 파는데 한 번 손등에 ...


In [38]:
raw_data = raw_data.drop('Unnamed: 0', axis=1)

In [39]:
raw_data.head()

Unnamed: 0,class,conversation
0,협박 대화,지금 너 스스로를 죽여달라고 애원하는 것인가?\n 아닙니다. 죄송합니다.\n 죽을 ...
1,협박 대화,길동경찰서입니다.\n9시 40분 마트에 폭발물을 설치할거다.\n네?\n똑바로 들어 ...
2,기타 괴롭힘 대화,너 되게 귀여운거 알지? 나보다 작은 남자는 첨봤어.\n그만해. 니들 놀리는거 재미...
3,갈취 대화,어이 거기\n예??\n너 말이야 너. 이리 오라고\n무슨 일.\n너 옷 좋아보인다?...
4,갈취 대화,저기요 혹시 날이 너무 뜨겁잖아요? 저희 회사에서 이 선크림 파는데 한 번 손등에 ...


In [40]:
raw_data.groupby('class').count()

Unnamed: 0_level_0,conversation
class,Unnamed: 1_level_1
갈취 대화,981
기타 괴롭힘 대화,1094
일반 대화,1000
직장 내 괴롭힘 대화,979
협박 대화,896


In [50]:
okt = Okt()
stop_words = {
    '이', '그', '저', '것', '수', '등', '때', '곳', '나', '너', '우리', '경우', 
    '사람', '일', '지금', '생각', '말', '안', '뭐', '정말', '왜', '오늘', '내일',
    '여기', '거기', '이제', '먼저', '하나', '무슨', '위해', '때문', '정도', 
    '그냥', '진짜', '너무', '완전', '혹시', '계속', '아니', '알', '더', '좀', '이다'
}

In [42]:
def preprocess_sentence(sentence, stop_words):
    # 1. 양쪽 공백 제거
    sentence = sentence.strip()

    # 2. 특수문자 및 이모지 제거 (한글, 영어, 숫자, 기본 구두점만 허용)
    sentence = re.sub(r"[^가-힣0-9a-zA-Z.,!?~\s]", " ", sentence)

    # 3. 연속된 공백 하나로 축소 및 줄 바꿈 무시
    sentence = re.sub(r"\s+", " ", sentence)
    sentence = re.sub(r"\n", " ", sentence)

    # 4. 문장 부호 앞뒤로 공백 추가 (토큰 구분을 위함)
    sentence = re.sub(r"([?.!,~])", r" \1 ", sentence)
    sentence = re.sub(r'\s{2,}', ' ', sentence)
    
    # 형태소 분석 (단어, 품사)
    include_tags = {"Noun", "Verb", "Adjective", "Exclamation", "Adverb"}
    pos_tags = okt.pos(sentence, stem=True, norm=True)
    # 원하는 품사만 추출
    tokens = [
        word for word, tag in pos_tags
        if tag in include_tags and len(word) > 1 and word not in stop_words
    ]
        
    return tokens

In [43]:
sample_text = raw_data['conversation'][0]
tokens = preprocess_sentence(sample_text, stop_words)
print(tokens)

['지금', '스스로', '죽이다', '달라', '애원', '아니다', '죄송하다', '혼자', '죽지', '우리', '사건', '말리', '진짜', '죽이다', '버리다', '싶다', '정말', '선택', '죽다', '가족', '죽여주다', '죄송하다', '정말', '선택', '없다', '선택', '가족', '모조리', '죽이다', '버리다', '선택', '한번', '도와주다', '그냥', '죽이다', '버리다', '이의', '없다', '제발', '도와주다']


In [44]:
raw_data['tokens'] = raw_data['conversation'].apply(lambda x: preprocess_sentence(str(x), stop_words))
raw_data.head()

Unnamed: 0,class,conversation,tokens
0,협박 대화,지금 너 스스로를 죽여달라고 애원하는 것인가?\n 아닙니다. 죄송합니다.\n 죽을 ...,"[지금, 스스로, 죽이다, 달라, 애원, 아니다, 죄송하다, 혼자, 죽지, 우리, ..."
1,협박 대화,길동경찰서입니다.\n9시 40분 마트에 폭발물을 설치할거다.\n네?\n똑바로 들어 ...,"[길동, 경찰서, 이다, 마트, 폭발물, 설치, 똑바로, 들다, 한번, 얘기, 장난..."
2,기타 괴롭힘 대화,너 되게 귀여운거 알지? 나보다 작은 남자는 첨봤어.\n그만해. 니들 놀리는거 재미...,"[되게, 귀엽다, 작다, 남자, 보다, 그만하다, 놀리다, 재미없다, 지영, 이지,..."
3,갈취 대화,어이 거기\n예??\n너 말이야 너. 이리 오라고\n무슨 일.\n너 옷 좋아보인다?...,"[어이, 거기, 이리, 오라, 무슨, 좋다, 보이다, 있다, 보다, 아니다, 없다,..."
4,갈취 대화,저기요 혹시 날이 너무 뜨겁잖아요? 저희 회사에서 이 선크림 파는데 한 번 손등에 ...,"[저기, 혹시, 너무, 뜨겁다, 저희, 회사, 선크림, 팔다, 손등, 발라, 보다,..."


In [15]:
train_tokens = pd.read_csv('../data/train_w_pos_list.csv')
train_tokens.head()

Unnamed: 0.1,Unnamed: 0,class,conversation,length,text_len,Adverb,Adverb_sw,Adjective,Adjective_sw,Exclamation,Exclamation_sw,Noun,Noun_sw,Verb,Verb_sw
0,0,협박 대화,지금 너 스스로를 죽여달라고 애원하는 것인가?\r\n 아닙니다. 죄송합니다.\r\n...,251,52,[],[],"['아니다', '죄송하다', '죄송하다', '없다', '없다']","['아니다', '죄송하다', '죄송하다', '없다', '없다']",[],[],"['지금', '스스로', '달라', '애원', '혼자', '죽지', '우리', '사...","['스스로', '달라', '애원', '혼자', '죽지', '사건', '말리', '선...","['죽이다', '하다', '죽이다', '버리다', '싶다', '하다', '하다', ...","['죽이다', '하다', '죽이다', '버리다', '싶다', '하다', '하다', ..."
1,1,협박 대화,길동경찰서입니다.\r\n9시 40분 마트에 폭발물을 설치할거다.\r\n네?\r\n똑...,197,39,[],[],"['이다', '재미있다', '진정하다']","['재미있다', '진정하다']",[],[],"['길동', '경찰서', '마트', '폭발물', '설치', '똑바로', '한번', ...","['길동', '경찰서', '마트', '폭발물', '설치', '똑바로', '한번', ...","['하다', '들다', '하다', '걸다', '말다', '터지다', '죽다', '되...","['하다', '들다', '하다', '걸다', '말다', '터지다', '죽다', '되..."
2,2,기타 괴롭힘 대화,너 되게 귀여운거 알지? 나보다 작은 남자는 첨봤어.\r\n그만해. 니들 놀리는거 ...,227,52,"['되게', '그만']","['되게', '그만']","['귀엽다', '작다', '그만하다', '재미없다', '그렇다', '좋다', '작다...","['귀엽다', '작다', '그만하다', '재미없다', '그렇다', '좋다', '작다...",[],[],"['남자', '지영', '이지', '재는', '군대', '보태', '난쟁이', '장...","['남자', '지영', '이지', '재는', '군대', '보태', '난쟁이', '장...","['보다', '놀리다', '돼다', '가다', '주다', '가다', '가다', '보...","['보다', '놀리다', '돼다', '가다', '주다', '가다', '가다', '보..."
3,3,갈취 대화,어이 거기\r\n예??\r\n너 말이야 너. 이리 오라고\r\n무슨 일.\r\n너 ...,125,34,['이리'],['이리'],"['좋다', '있다', '아니다', '없다', '있다', '없다']","['좋다', '있다', '아니다', '없다', '있다', '없다']",[],[],"['어이', '거기', '오라', '무슨', '오늘', '피시방', '마지막', '...","['어이', '오라', '피시방', '마지막', '기회']","['보이다', '보다', '뒤지다', '나오다', '죽다', '내놓다']","['보이다', '보다', '뒤지다', '나오다', '죽다', '내놓다']"
4,4,갈취 대화,저기요 혹시 날이 너무 뜨겁잖아요? 저희 회사에서 이 선크림 파는데 한 번 손등에 ...,471,118,"['너무', '그래도', '따끔', '열심히', '많이', '많이', '많이', '...","['그래도', '따끔', '열심히', '많이', '많이', '많이', '빨리', '...","['뜨겁다', '필요하다', '좋다', '좋다', '좋다', '좋다', '같다', ...","['뜨겁다', '필요하다', '좋다', '좋다', '좋다', '좋다', '같다', ...",[],[],"['저기', '혹시', '저희', '회사', '선크림', '손등', '발라', '진...","['저기', '저희', '회사', '선크림', '손등', '발라', '선크림', '...","['팔다', '하다', '보다', '알아보다', '하다', '보다', '하다', '...","['팔다', '하다', '보다', '알아보다', '하다', '보다', '하다', '..."


In [144]:
# ===== 1) Vocab 빌드 =====
from collections import Counter
from typing import List, Tuple, Dict, Iterable
import json

SPECIALS = ["<pad>", "<unk>"]

def build_vocab(
    token_lists: Iterable[List[str]],
    min_freq: int = 2,
    max_size: int = 30000,
    specials: List[str] = SPECIALS,
) -> Tuple[Dict[str, int], List[str], Counter]:
    """
    token_lists: 각 샘플의 토큰 리스트(iterable of list[str])
    min_freq: 최소 등장 빈도 미만 토큰은 제외
    max_size: special 포함 전체 vocab 상한 (None이면 제한 없음)
    returns: (stoi, itos, counter)
    """
    counter = Counter()
    for toks in token_lists:
        counter.update(toks)

    # 빈도 필터 + 상위 max_size-특수토큰 만큼
    most = [tok for tok, cnt in counter.most_common() if cnt >= min_freq]
    if max_size is not None:
        cap = max_size - len(specials)
        most = most[:max(0, cap)]

    itos = list(specials) + most
    stoi = {tok: i for i, tok in enumerate(itos)}
    return stoi, itos, counter

def save_vocab(path: str, itos: List[str]) -> None:
    with open(path, "w", encoding="utf-8") as f:
        json.dump(itos, f, ensure_ascii=False)

def load_vocab(path: str) -> Tuple[Dict[str, int], List[str]]:
    with open(path, "r", encoding="utf-8") as f:
        itos = json.load(f)
    stoi = {tok: i for i, tok in enumerate(itos)}
    return stoi, itos
# ===== 2) 토큰 → ID 인코딩 =====
def encode_tokens(
    tokens: List[str],
    stoi: Dict[str, int],
    max_len: int = 256,
) -> Tuple[List[int], List[int]]:
    """
    tokens -> input_ids, attention_mask
    - OOV는 <unk>
    - max_len을 초과하면 적절히 자름
    """
    pad_id = stoi["<pad>"]
    unk_id = stoi["<unk>"]

    ids = [stoi.get(t, unk_id) for t in tokens]

    # 길이 계산 (cls/sep 포함해서 자르기)
    keep = max_len
    keep = max(0, keep)
    ids = ids[:keep]

    attn = [1] * len(ids)
    return ids, attn
# ===== 3) 배치 패딩(collate) =====
import torch

def collate_batch(
    batch,
    pad_id: int,
):
    """
    batch: [{"input_ids": List[int], "attention_mask": List[int], "label": int}, ...]
    """
    bs = len(batch)
    maxlen = max(len(x["input_ids"]) for x in batch)
    input_ids = torch.full((bs, maxlen), pad_id, dtype=torch.long)
    attention_mask = torch.zeros((bs, maxlen), dtype=torch.long)
    labels = torch.tensor([x["label"] for x in batch], dtype=torch.long)

    for i, x in enumerate(batch):
        L = len(x["input_ids"])
        input_ids[i, :L] = torch.tensor(x["input_ids"], dtype=torch.long)
        attention_mask[i, :L] = torch.tensor(x["attention_mask"], dtype=torch.long)

    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}
# ===== 4) 예시 파이프라인 (라벨 매핑 포함) =====
# 4-1) tokens 컬럼이 없다면 먼저 생성

import ast
stopwords = {'하다', '보다', '알다', '가다', '되다', '돼다', '오다', '진짜', '지금', '사람', '우리', '오늘', '생각', '그냥', '무슨'}
cols = ["Noun", "Verb", "Adjective", "Adverb", "Exclamation"]
epochs = 20

all_stopwords = stopwords | stop_words

train_tokens["tokens"] = train_tokens[cols].apply(
    lambda row: sum((r for r in row if isinstance(r, list)), []),
    axis=1
)

def clean_token(t: str) -> str:
    if not isinstance(t, str):
        t = str(t)
    t = t.strip()
    # 1) 앞/뒤에 붙은 따옴표 제거
    t = re.sub(r"^[\"']+", "", t)   # 맨 앞의 ' 또는 " 연속 제거
    t = re.sub(r"[\"']+$", "", t)   # 맨 뒤의 ' 또는 " 연속 제거
    # 2) 남아있을 수 있는 괄호/대괄호/쉼표 잔여 제거
    t = t.strip("[],")
    return t.strip()

# tokens 컬럼 클린업 (빈 문자열은 제거)
train_tokens["tokens"] = train_tokens["tokens"].apply(
    lambda toks: [clean_token(x) for x in toks if clean_token(x)]
)

# 불용어 제거 (통합 집합 사용)
all_stopwords = stopwords | stop_words
train_tokens["tokens_sw"] = train_tokens["tokens"].apply(
    lambda toks: [t for t in toks if t and t not in all_stopwords]
)

# 4-2) 라벨 매핑
labels = sorted(train_tokens["class"].unique().tolist())
label2id = {
    "협박 대화": 0,
    "갈취 대화": 1,
    "직장 내 괴롭힘 대화": 2,
    "기타 괴롭힘 대화": 3,
    "일반 대화": 4,
}
id2label = {v: k for k, v in label2id.items()}

# 4-3) vocab 빌드
stoi, itos, counter = build_vocab(train_tokens["tokens_sw"], min_freq=1, max_size=20000)
pad_id = stoi["<pad>"]

# 4-4) 인코딩 (train/valid 분할은 이미 되어있다고 가정하거나 아래처럼 간단 분할)
from sklearn.model_selection import train_test_split
train_df, valid_df = train_test_split(train_tokens, test_size=0.2, random_state=42, stratify=train_tokens["class"])

def encode_row(row, max_len=256):
    ids, attn = encode_tokens(row["tokens_sw"], stoi, max_len=max_len)
    return {
        "input_ids": ids,
        "attention_mask": attn,
        "label": label2id[row["class"]],
    }

train_records = [encode_row(r) for _, r in train_df.iterrows()]
valid_records = [encode_row(r) for _, r in valid_df.iterrows()]

# 4-5) PyTorch Dataset/Dataloader
from torch.utils.data import Dataset, DataLoader

class SimpleListDataset(Dataset):
    def __init__(self, records):
        self.records = records
    def __len__(self):
        return len(self.records)
    def __getitem__(self, idx):
        return self.records[idx]

train_ds = SimpleListDataset(train_records)
valid_ds = SimpleListDataset(valid_records)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True,
                          collate_fn=lambda b: collate_batch(b, pad_id))
valid_loader = DataLoader(valid_ds, batch_size=32, shuffle=False,
                          collate_fn=lambda b: collate_batch(b, pad_id))

print(f"Vocab size: {len(itos)} | Labels: {label2id}")
print(next(iter(train_loader))["input_ids"].shape)  # (B, S)

Vocab size: 9531 | Labels: {'협박 대화': 0, '갈취 대화': 1, '직장 내 괴롭힘 대화': 2, '기타 괴롭힘 대화': 3, '일반 대화': 4}
torch.Size([32, 61])


In [145]:
train_tokens['tokens'][0]

['지금',
 '스스로',
 '달라',
 '애원',
 '혼자',
 '죽지',
 '우리',
 '사건',
 '말리',
 '진짜',
 '정말',
 '선택',
 '가족',
 '정말',
 '선택',
 '선택',
 '가족',
 '모조리',
 '선택',
 '한번',
 '그냥',
 '이의',
 '제발',
 '죽이다',
 '하다',
 '죽이다',
 '버리다',
 '싶다',
 '하다',
 '하다',
 '죽다',
 '죽여주다',
 '하다',
 '하다',
 '죽이다',
 '버리다',
 '하다',
 '도와주다',
 '죽이다',
 '버리다',
 '도와주다',
 '아니다',
 '죄송하다',
 '죄송하다',
 '없다',
 '없다']

In [131]:
# ----------------------------
# Positional Encoding (sin/cos)
# ----------------------------
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, max_len: int = 512, dropout: float = 0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # (max_len,1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float) * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        self.register_buffer("pe", pe)

        # 미세한 안정화용
        nn.init.zeros_(self.pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (B, S, E)
        x = x + self.pe[:, : x.size(1)]
        return self.dropout(x)


In [132]:
# ----------------------------
# Transformer Encoder Classifier
# ----------------------------
class TransformerClassifier(nn.Module):
    """
    순수 Transformer-Encoder 기반 문서/대화 분류기.
    - input_ids: (B, S) 토큰 인덱스
    - attention_mask: (B, S) 1=유효, 0=패딩
    """
    def __init__(
        self,
        vocab_size: int,
        num_classes: int,
        emb_dim: int = 256,
        nhead: int = 8,
        num_layers: int = 4,
        dim_feedforward: int = 512,
        max_len: int = 512,
        dropout: float = 0.1,
        pad_id: int = 0,
        use_cls_pool: bool = True,  # True면 첫 토큰(<cls>)을 문장 표현으로 사용, False면 마스크 평균
    ):
        super().__init__()
        assert emb_dim % nhead == 0, "emb_dim must be divisible by nhead"

        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_id)
        self.pos = PositionalEncoding(emb_dim, max_len=max_len, dropout=dropout)

        enc_layer = nn.TransformerEncoderLayer(
            d_model=emb_dim,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation="gelu",
            batch_first=False,  # 입력은 (S,B,E)
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=num_layers)

        self.norm = nn.LayerNorm(emb_dim)
        self.classifier = nn.Linear(emb_dim, num_classes)

        self.emb_scale = math.sqrt(emb_dim)
        self.use_cls_pool = use_cls_pool

        # Xavier init (선택)
        nn.init.xavier_uniform_(self.classifier.weight)
        nn.init.zeros_(self.classifier.bias)

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        return_repr: bool = False,
    ):
        # (B,S) -> (B,S,E)
        x = self.emb(input_ids) * self.emb_scale
        x = self.pos(x)                         # (B,S,E)
        x = x.transpose(0, 1)                   # (S,B,E)

        key_padding_mask = None
        if attention_mask is not None:
            key_padding_mask = (attention_mask == 0)  # True=mask

        x = self.encoder(x, src_key_padding_mask=key_padding_mask)  # (S,B,E)
        x = x.transpose(0, 1)                                       # (B,S,E)

        if self.use_cls_pool:
            sent_repr = x[:, 0, :]  # <cls> 위치
        else:
            if attention_mask is None:
                sent_repr = x.mean(dim=1)
            else:
                mask = attention_mask.unsqueeze(-1).float()         # (B,S,1)
                sent_repr = (x * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-6)

        sent_repr = self.norm(sent_repr)
        logits = self.classifier(sent_repr)

        if return_repr:
            return logits, sent_repr
        return logits



In [133]:
# ----------------------------
# 학습/평가 루프
# ----------------------------
def train_one_epoch(
    model: nn.Module,
    dataloader,
    optimizer: torch.optim.Optimizer,
    device: torch.device,
    class_weights: Optional[torch.Tensor] = None,
    grad_clip: float = 1.0,
    scheduler = None,
    use_amp: bool = True,
) -> Dict[str, float]:
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
    ce = nn.CrossEntropyLoss(weight=class_weights.to(device) if class_weights is not None else None)

    losses, all_preds, all_labels = [], [], []
    for batch in tqdm(dataloader, desc="train", leave=False):
        input_ids = batch["input_ids"].to(device)
        attn = batch.get("attention_mask")
        attn = attn.to(device) if attn is not None else None
        labels = batch["labels"].to(device)

        optimizer.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast(enabled=use_amp):
            logits = model(input_ids, attention_mask=attn)
            loss = ce(logits, labels)

        scaler.scale(loss).backward()
        if grad_clip is not None:
            scaler.unscale_(optimizer)
            nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        scaler.step(optimizer)
        scaler.update()
        if scheduler is not None:
            scheduler.step()

        losses.append(loss.item())
        all_preds += logits.argmax(dim=-1).detach().cpu().tolist()
        all_labels += labels.detach().cpu().tolist()

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average="macro")
    return {"loss": sum(losses)/len(losses), "acc": acc, "f1_macro": f1}

@torch.no_grad()
def evaluate(
    model: nn.Module,
    dataloader,
    device: torch.device,
) -> Dict[str, float]:
    model.eval()
    ce = nn.CrossEntropyLoss()

    losses, all_preds, all_labels = [], [], []
    for batch in tqdm(dataloader, desc="eval", leave=False):
        input_ids = batch["input_ids"].to(device)
        attn = batch.get("attention_mask")
        attn = attn.to(device) if attn is not None else None
        labels = batch["labels"].to(device)

        logits = model(input_ids, attention_mask=attn)
        loss = ce(logits, labels)
        losses.append(loss.item())

        all_preds += logits.argmax(dim=-1).detach().cpu().tolist()
        all_labels += labels.detach().cpu().tolist()

    acc = accuracy_score(all_labels, all_preds) if all_labels else 0.0
    f1 = f1_score(all_labels, all_preds, average="macro") if all_labels else 0.0
    return {"loss": sum(losses)/len(losses), "acc": acc, "f1_macro": f1}

In [134]:
# ----------------------------
# 모델 팩토리 (간단 생성기)
# ----------------------------
def create_model(
    vocab_size: int,
    num_classes: int = 5,          # 협박0, 갈취1, 직장2, 기타3, 일반4
    pad_id: int = 0,
    emb_dim: int = 256,
    nhead: int = 8,
    num_layers: int = 4,
    dim_ff: int = 512,
    max_len: int = 512,
    dropout: float = 0.1,
    use_cls_pool: bool = True,
) -> nn.Module:
    return TransformerClassifier(
        vocab_size=vocab_size,
        num_classes=num_classes,
        emb_dim=emb_dim,
        nhead=nhead,
        num_layers=num_layers,
        dim_feedforward=dim_ff,
        max_len=max_len,
        dropout=dropout,
        pad_id=pad_id,
        use_cls_pool=use_cls_pool,
    )

In [135]:
# 가정: stoi, itos, train_loader, valid_loader, label2id 존재
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = create_model(
    vocab_size=len(itos),
    num_classes=5,                # 고정 매핑(협박0, 갈취1, 직장2, 기타3, 일반4)
    pad_id=stoi["<pad>"],
    emb_dim=256,
    nhead=8,
    num_layers=3,                 # 처음엔 3~4로 시작 추천
    dim_ff=512,
    max_len=256,                  # 인코딩에서 쓴 max_len과 동일하게
    dropout=0.1,
    use_cls_pool=True,
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)
scheduler = None  # 필요하면 CosineAnnealingLR 등 연결

In [136]:
# (선택) 클래스 가중치: train_df의 정수 라벨 리스트로 계산
from collections import Counter
train_labels = [rec["label"] for rec in train_records]  # 이전 단계 encode_records 기준
cnt = Counter(train_labels)
weights = torch.tensor([1.0 / max(cnt.get(i, 1), 1) for i in range(5)], dtype=torch.float)
weights = weights / weights.mean()  # 평균 1로 정규화
class_weights = weights

best_f1 = 0.0
for ep in range(1, epochs+1):
    tr = train_one_epoch(model, train_loader, optimizer, device,
                         class_weights=class_weights, grad_clip=1.0, scheduler=scheduler, use_amp=True)
    va = evaluate(model, valid_loader, device)
    print(f"[{ep:02d}] train: {tr} | valid: {va}")

    if va["f1_macro"] > best_f1:
        best_f1 = va["f1_macro"]
        torch.save(model.state_dict(), "./stopwords1_"+str(best_f1)+"_best_transformer_cls.pt")
        print("  ✔ saved best model (F1 ↑)")

                                                        

[01] train: {'loss': 1.1870362875968452, 'acc': 0.5460624071322436, 'f1_macro': 0.5415067864113065} | valid: {'loss': 0.8939134161919355, 'acc': 0.6693069306930693, 'f1_macro': 0.6609375017011339}
  ✔ saved best model (F1 ↑)


                                                        

[02] train: {'loss': 0.652233277249524, 'acc': 0.7644873699851411, 'f1_macro': 0.7612607992640013} | valid: {'loss': 0.707449602894485, 'acc': 0.7435643564356436, 'f1_macro': 0.7386206372168495}
  ✔ saved best model (F1 ↑)


                                                        

[03] train: {'loss': 0.4752842848460505, 'acc': 0.8315998018821199, 'f1_macro': 0.8291924363214177} | valid: {'loss': 0.7025370709598064, 'acc': 0.7584158415841584, 'f1_macro': 0.755839067427152}
  ✔ saved best model (F1 ↑)


                                                        

[04] train: {'loss': 0.35462314881912366, 'acc': 0.866765725606736, 'f1_macro': 0.8647911644998638} | valid: {'loss': 0.7096115606836975, 'acc': 0.7643564356435644, 'f1_macro': 0.7593688159916184}
  ✔ saved best model (F1 ↑)


                                                        

[05] train: {'loss': 0.282549075899631, 'acc': 0.8984645864289252, 'f1_macro': 0.8967606198915019} | valid: {'loss': 0.7374368305318058, 'acc': 0.7792079207920792, 'f1_macro': 0.7762914396281936}
  ✔ saved best model (F1 ↑)


                                                        

[06] train: {'loss': 0.20597267690606005, 'acc': 0.9309063893016345, 'f1_macro': 0.9297230496370457} | valid: {'loss': 0.783844695892185, 'acc': 0.7801980198019802, 'f1_macro': 0.7758827499126026}


                                                        

[07] train: {'loss': 0.17060229449173597, 'acc': 0.9380881624566617, 'f1_macro': 0.9369508438132602} | valid: {'loss': 0.8267900026403368, 'acc': 0.7881188118811882, 'f1_macro': 0.7835696383102848}
  ✔ saved best model (F1 ↑)


                                                        

[08] train: {'loss': 0.13538128305886557, 'acc': 0.9517087667161961, 'f1_macro': 0.9509930784327792} | valid: {'loss': 0.856289628893137, 'acc': 0.7910891089108911, 'f1_macro': 0.7876510687939001}
  ✔ saved best model (F1 ↑)


                                                        

[09] train: {'loss': 0.118821156244197, 'acc': 0.95789995047053, 'f1_macro': 0.957244682260388} | valid: {'loss': 0.9495013216510415, 'acc': 0.7772277227722773, 'f1_macro': 0.7734740424479443}


                                                        

[10] train: {'loss': 0.09327781329657442, 'acc': 0.966319960376424, 'f1_macro': 0.9659255326453436} | valid: {'loss': 1.008877214975655, 'acc': 0.7861386138613862, 'f1_macro': 0.7797125863377754}


                                                        

[11] train: {'loss': 0.09246088092134694, 'acc': 0.9653293709757306, 'f1_macro': 0.9648410869163975} | valid: {'loss': 1.0634757606312633, 'acc': 0.7960396039603961, 'f1_macro': 0.7918684795148287}
  ✔ saved best model (F1 ↑)


                                                        

[12] train: {'loss': 0.07469012699028987, 'acc': 0.9732540861812778, 'f1_macro': 0.9726568123309886} | valid: {'loss': 1.170522352680564, 'acc': 0.7930693069306931, 'f1_macro': 0.7874220247211134}


                                                        

[13] train: {'loss': 0.0757397032828908, 'acc': 0.9769687964338781, 'f1_macro': 0.9765570712461044} | valid: {'loss': 1.1868348885327578, 'acc': 0.7910891089108911, 'f1_macro': 0.7876722994237964}


                                                        

[14] train: {'loss': 0.04848632534441546, 'acc': 0.9831599801882119, 'f1_macro': 0.9829581101171033} | valid: {'loss': 1.294426467269659, 'acc': 0.7891089108910891, 'f1_macro': 0.7852370070362993}


                                                        

[15] train: {'loss': 0.05824377340794904, 'acc': 0.9816740960871718, 'f1_macro': 0.9814271862446666} | valid: {'loss': 1.4290023855865002, 'acc': 0.8, 'f1_macro': 0.7973447056609324}
  ✔ saved best model (F1 ↑)


                                                        

[16] train: {'loss': 0.04591781822095731, 'acc': 0.9831599801882119, 'f1_macro': 0.9831006666971416} | valid: {'loss': 1.5077247638255358, 'acc': 0.7910891089108911, 'f1_macro': 0.78711101443547}


                                                        

[17] train: {'loss': 0.05298539988527452, 'acc': 0.9826646854878652, 'f1_macro': 0.9825345022888141} | valid: {'loss': 1.5635885084047914, 'acc': 0.806930693069307, 'f1_macro': 0.804322278913521}
  ✔ saved best model (F1 ↑)


                                                        

[18] train: {'loss': 0.03580034854731341, 'acc': 0.9898464586428926, 'f1_macro': 0.9897002816533724} | valid: {'loss': 1.5799787510186434, 'acc': 0.800990099009901, 'f1_macro': 0.7990896186178064}


                                                        

[19] train: {'loss': 0.03787967900994615, 'acc': 0.9903417533432393, 'f1_macro': 0.9902123034120741} | valid: {'loss': 1.4758273772895336, 'acc': 0.804950495049505, 'f1_macro': 0.8014603777257909}


                                                        

[20] train: {'loss': 0.033502606170145555, 'acc': 0.9900941059930659, 'f1_macro': 0.9899919329989008} | valid: {'loss': 1.5958001390099525, 'acc': 0.799009900990099, 'f1_macro': 0.7960687166284792}




## Stopword로 일반 대화 토큰 전부 입력

In [137]:
from collections import Counter

general_rows = train_tokens[train_tokens['class'] == '일반 대화']

def collect_tokens_by_cols(df, cols):
    """지정한 품사 컬럼들에서 토큰을 모아 clean_token 적용 후 반환"""
    bag = []
    for tag in cols:
        if tag in df.columns:
            for lst in df[tag].dropna():
                if isinstance(lst, (list, tuple)):
                    bag.extend(clean_token(t) for t in lst if clean_token(t))
    return [t for t in bag if t]  # 빈 문자열 제거

# 1) 일반 대화 + 지정 품사에서 토큰 수집
general_tokens = collect_tokens_by_cols(general_rows, cols)

# 2) 이미 등록된 불용어는 제외하고 빈도 계산
base_stop = stopwords | stop_words
freq = Counter(t for t in general_tokens if t not in base_stop)

# 3) 누적 80%를 커버하는 최소 토큰 집합 선택
coverage = 0.80
total = sum(freq.values())

top80_new_stops = set()
if total > 0:
    cum = 0
    for tok, cnt in freq.most_common():  # 빈도 내림차순
        cum += cnt
        top80_new_stops.add(tok)
        if cum / total >= coverage:
            break

# 4) 최종 불용어 집합 갱신
all_stopwords2 = base_stop | top80_new_stops

# 5) 적용 컬럼 생성/갱신 (tokens_sw2)
train_tokens["tokens_sw2"] = train_tokens["tokens"].apply(
    lambda toks: [t for t in toks if t and t not in all_stopwords2]
)

# 4-3) vocab 빌드
stoi, itos, counter = build_vocab(train_tokens["tokens_sw2"], min_freq=1, max_size=20000)
pad_id = stoi["<pad>"]

# 4-4) 인코딩 (train/valid 분할은 이미 되어있다고 가정하거나 아래처럼 간단 분할)
from sklearn.model_selection import train_test_split
train_df, valid_df = train_test_split(train_tokens, test_size=0.2, random_state=42, stratify=train_tokens["class"])

def encode_row(row, max_len=256):
    ids, attn = encode_tokens(row["tokens_sw2"], stoi, max_len=max_len)
    return {
        "input_ids": ids,
        "attention_mask": attn,
        "label": label2id[row["class"]],
    }

train_records = [encode_row(r) for _, r in train_df.iterrows()]
valid_records = [encode_row(r) for _, r in valid_df.iterrows()]

# 4-5) PyTorch Dataset/Dataloader
from torch.utils.data import Dataset, DataLoader

class SimpleListDataset(Dataset):
    def __init__(self, records):
        self.records = records
    def __len__(self):
        return len(self.records)
    def __getitem__(self, idx):
        return self.records[idx]

train_ds = SimpleListDataset(train_records)
valid_ds = SimpleListDataset(valid_records)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True,
                          collate_fn=lambda b: collate_batch(b, pad_id))
valid_loader = DataLoader(valid_ds, batch_size=32, shuffle=False,
                          collate_fn=lambda b: collate_batch(b, pad_id))

print(f"Vocab size: {len(itos)} | Labels: {label2id}")
print(next(iter(train_loader))["input_ids"].shape)  # (B, S)

Vocab size: 9403 | Labels: {'협박 대화': 0, '갈취 대화': 1, '직장 내 괴롭힘 대화': 2, '기타 괴롭힘 대화': 3, '일반 대화': 4}
torch.Size([32, 94])


In [138]:
# 가정: stoi, itos, train_loader, valid_loader, label2id 존재
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = create_model(
    vocab_size=len(itos),
    num_classes=5,                # 고정 매핑(협박0, 갈취1, 직장2, 기타3, 일반4)
    pad_id=stoi["<pad>"],
    emb_dim=256,
    nhead=8,
    num_layers=3,                 # 처음엔 3~4로 시작 추천
    dim_ff=512,
    max_len=256,                  # 인코딩에서 쓴 max_len과 동일하게
    dropout=0.1,
    use_cls_pool=True,
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)
scheduler = None  # 필요하면 CosineAnnealingLR 등 연결

In [139]:
# (선택) 클래스 가중치: train_df의 정수 라벨 리스트로 계산
from collections import Counter
train_labels = [rec["label"] for rec in train_records]  # 이전 단계 encode_records 기준
cnt = Counter(train_labels)
weights = torch.tensor([1.0 / max(cnt.get(i, 1), 1) for i in range(5)], dtype=torch.float)
weights = weights / weights.mean()  # 평균 1로 정규화
class_weights = weights

best_f1 = 0.0
for ep in range(1, epochs+1):
    tr = train_one_epoch(model, train_loader, optimizer, device,
                         class_weights=class_weights, grad_clip=1.0, scheduler=scheduler, use_amp=True)
    va = evaluate(model, valid_loader, device)
    print(f"[{ep:02d}] train: {tr} | valid: {va}")

    if va["f1_macro"] > best_f1:
        best_f1 = va["f1_macro"]
        torch.save(model.state_dict(), "./stopwords2_"+str(best_f1)+"_best_transformer_cls.pt")
        print("  ✔ saved best model (F1 ↑)")

                                                        

[01] train: {'loss': 1.247204883830754, 'acc': 0.5111441307578009, 'f1_macro': 0.5086680611928837} | valid: {'loss': 0.860168443992734, 'acc': 0.6693069306930693, 'f1_macro': 0.6672323583311806}
  ✔ saved best model (F1 ↑)


                                                        

[02] train: {'loss': 0.6956655461722472, 'acc': 0.7322932144626052, 'f1_macro': 0.7295971923484854} | valid: {'loss': 0.8299516383558512, 'acc': 0.7039603960396039, 'f1_macro': 0.6985639420512975}
  ✔ saved best model (F1 ↑)


                                                        

[03] train: {'loss': 0.5026819986386561, 'acc': 0.8199603764239722, 'f1_macro': 0.8171912316052976} | valid: {'loss': 0.6862445455044508, 'acc': 0.7653465346534654, 'f1_macro': 0.7634188255247354}
  ✔ saved best model (F1 ↑)


                                                        

[04] train: {'loss': 0.3541648086952412, 'acc': 0.8764239722634968, 'f1_macro': 0.8749730360192464} | valid: {'loss': 0.7327685505151749, 'acc': 0.7663366336633664, 'f1_macro': 0.7638124662164618}
  ✔ saved best model (F1 ↑)


                                                        

[05] train: {'loss': 0.2806653075917499, 'acc': 0.9011887072808321, 'f1_macro': 0.8995479788932664} | valid: {'loss': 0.7693733274936676, 'acc': 0.7841584158415842, 'f1_macro': 0.7819244790460199}
  ✔ saved best model (F1 ↑)


                                                        

[06] train: {'loss': 0.21185703525745023, 'acc': 0.9257057949479941, 'f1_macro': 0.9246715017318852} | valid: {'loss': 0.7490243958309293, 'acc': 0.7930693069306931, 'f1_macro': 0.7907693003131007}
  ✔ saved best model (F1 ↑)


                                                        

[07] train: {'loss': 0.16437088221487567, 'acc': 0.9437840515106488, 'f1_macro': 0.9429927418145324} | valid: {'loss': 0.8225733861327171, 'acc': 0.801980198019802, 'f1_macro': 0.7989661818373613}
  ✔ saved best model (F1 ↑)


                                                        

[08] train: {'loss': 0.1325972052951028, 'acc': 0.9514611193660227, 'f1_macro': 0.9507486360641979} | valid: {'loss': 0.8134371312335134, 'acc': 0.806930693069307, 'f1_macro': 0.8044624786715922}
  ✔ saved best model (F1 ↑)


                                                        

[09] train: {'loss': 0.10444111777121186, 'acc': 0.9613670133729569, 'f1_macro': 0.9609318114760115} | valid: {'loss': 0.954157424159348, 'acc': 0.802970297029703, 'f1_macro': 0.7998211534251947}


                                                        

[10] train: {'loss': 0.09656824067685987, 'acc': 0.967062902426944, 'f1_macro': 0.9666766906304192} | valid: {'loss': 1.036713121458888, 'acc': 0.8, 'f1_macro': 0.7971369291750475}


                                                        

[11] train: {'loss': 0.09084439429722169, 'acc': 0.9675581971272907, 'f1_macro': 0.9669013267272618} | valid: {'loss': 1.0838420754298568, 'acc': 0.800990099009901, 'f1_macro': 0.7996739531476675}


                                                        

[12] train: {'loss': 0.06922743726392482, 'acc': 0.9752352649826647, 'f1_macro': 0.9750057573252752} | valid: {'loss': 1.2253995956853032, 'acc': 0.7980198019801981, 'f1_macro': 0.7947734996385529}


                                                        

[13] train: {'loss': 0.07438762203650916, 'acc': 0.9752352649826647, 'f1_macro': 0.9747429449186399} | valid: {'loss': 1.2021596059203148, 'acc': 0.800990099009901, 'f1_macro': 0.7980946092772578}


                                                        

[14] train: {'loss': 0.04965884260985428, 'acc': 0.9821693907875185, 'f1_macro': 0.9818848802744148} | valid: {'loss': 1.3203552020713687, 'acc': 0.8, 'f1_macro': 0.7955818871358387}


                                                        

[15] train: {'loss': 0.047772274478191996, 'acc': 0.9821693907875185, 'f1_macro': 0.9819590101954241} | valid: {'loss': 1.4275393821299076, 'acc': 0.7980198019801981, 'f1_macro': 0.7953728231567873}


                                                        

[16] train: {'loss': 0.06358970182861419, 'acc': 0.9811788013868251, 'f1_macro': 0.9808905262393974} | valid: {'loss': 1.2991219451650977, 'acc': 0.808910891089109, 'f1_macro': 0.8056129930400292}
  ✔ saved best model (F1 ↑)


                                                        

[17] train: {'loss': 0.03609556381624008, 'acc': 0.986627043090639, 'f1_macro': 0.9865791726399669} | valid: {'loss': 1.3946415148675442, 'acc': 0.806930693069307, 'f1_macro': 0.8053627449501064}


                                                        

[18] train: {'loss': 0.03335814001294833, 'acc': 0.9881129271916791, 'f1_macro': 0.9880938735831866} | valid: {'loss': 1.5355686247348785, 'acc': 0.7970297029702971, 'f1_macro': 0.7952278185123836}


                                                        

[19] train: {'loss': 0.04809915552750037, 'acc': 0.9861317483902923, 'f1_macro': 0.9861251621255974} | valid: {'loss': 1.5893879812210798, 'acc': 0.807920792079208, 'f1_macro': 0.8058296280644857}
  ✔ saved best model (F1 ↑)


                                                        

[20] train: {'loss': 0.04659026393323805, 'acc': 0.9858841010401189, 'f1_macro': 0.9857786889047414} | valid: {'loss': 1.6903959326446056, 'acc': 0.808910891089109, 'f1_macro': 0.8087226104781735}
  ✔ saved best model (F1 ↑)




In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from typing import Dict, Callable, List, Optional
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# ------------------------------
# 내부용 Dataset / collate
# ------------------------------
class _ListDataset(Dataset):
    def __init__(self, items): self.items = items
    def __len__(self): return len(self.items)
    def __getitem__(self, i): return self.items[i]

def _collate_batch(batch, pad_id):
    bs = len(batch)
    maxlen = max(len(x["input_ids"]) for x in batch)
    input_ids = torch.full((bs, maxlen), pad_id, dtype=torch.long)
    attention_mask = torch.zeros((bs, maxlen), dtype=torch.long)
    for i, x in enumerate(batch):
        L = len(x["input_ids"])
        input_ids[i, :L] = torch.tensor(x["input_ids"], dtype=torch.long)
        attention_mask[i, :L] = torch.tensor(x["attention_mask"], dtype=torch.long)
    return {"input_ids": input_ids, "attention_mask": attention_mask}

# ------------------------------
# 메인 함수
# ------------------------------
def fill_submission_class(
    test_csv_path: str,
    submission_csv_path: str,
    model: torch.nn.Module,
    stoi: Dict[str, int],
    encode_tokens: Callable[[List[str], Dict[str, int], int, bool, bool], tuple],
    preprocess_fn: Optional[Callable[[str, set], List[str]]] = None,
    stop_words: Optional[set] = None,
    id2label: Optional[Dict[int, str]] = None,
    input_text_col: str = "conversation",
    tokens_col: str = "tokens",
    max_len: int = 256,
    batch_size: int = 64,
    output_label_as: str = "id",  # 'id' or 'name'
):
    """
    이미 idx 컬럼이 존재하는 submission 템플릿에 class만 채워 넣는 함수.
    """
    # 1. CSV 로드
    test_df = pd.read_csv(test_csv_path)
    sub_df = pd.read_csv(submission_csv_path)
    if "Unnamed: 0" in test_df.columns:
        test_df = test_df.drop(columns=["Unnamed: 0"])

    # 2. 토큰화 확보
    if tokens_col in test_df.columns:
        maybe = test_df[tokens_col].iloc[0]
        if isinstance(maybe, str):
            import ast
            test_df[tokens_col] = test_df[tokens_col].apply(lambda s: ast.literal_eval(s))
    else:
        assert preprocess_fn is not None, "preprocess_fn이 필요합니다."
        test_df[input_text_col] = test_df[input_text_col].astype(str).fillna("")
        test_df[tokens_col] = test_df[input_text_col].apply(lambda s: preprocess_fn(s, stop_words or set()))

    # 3. 인코딩
    pad_id = stoi["<pad>"]
    def _encode_row(tokens):
        ids, attn = encode_tokens(tokens, stoi, max_len=max_len, add_cls=True, add_sep=True)
        return {"input_ids": ids, "attention_mask": attn}

    encoded = [_encode_row(t) for t in test_df[tokens_col]]
    ds = _ListDataset(encoded)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=False,
                    collate_fn=lambda b: _collate_batch(b, pad_id))

    # 4. 추론
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    preds = []
    with torch.no_grad():
        for batch in dl:
            input_ids = batch["input_ids"].to(device)
            attn = batch["attention_mask"].to(device)
            logits = model(input_ids, attention_mask=attn)
            preds.extend(logits.argmax(dim=-1).detach().cpu().tolist())

    # 5. 결과 매핑
    if output_label_as == "name":
        assert id2label is not None, "output_label_as='name'이면 id2label 필요."
        pred_labels = [id2label[i] for i in preds]
    else:
        pred_labels = preds

    # 6. submission 채워넣기
    sub_df["class"] = pred_labels
    sub_df.to_csv(submission_csv_path, index=False)
    print(f"✅ '{submission_csv_path}' 저장 완료 ({len(sub_df)}개 샘플)")
    return sub_df


In [None]:
sub = fill_submission_class(
    test_csv_path="../data/test.csv",
    submission_csv_path="../data/submission.csv",
    model=model,
    stoi=stoi,
    encode_tokens=encode_tokens,
    preprocess_fn=preprocess_sentence,   # tokens이 이미 있으면 생략 가능
    stop_words=stop_words,
    id2label=id2label,
    input_text_col="conversation",
    tokens_col="tokens",
    output_label_as="id",   # 'id'면 숫자 라벨, 'name'이면 문자열 라벨
)
print(sub.head())

In [None]:
test = pd.read_csv("../data/test.csv")
test.head()

In [148]:
import copy
import time
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter

# ---- small utilities ----
class SimpleListDataset(Dataset):
    def __init__(self, records):
        self.records = records
    def __len__(self):
        return len(self.records)
    def __getitem__(self, idx):
        return self.records[idx]

def _collect_tokens_rowwise(df, pos_cols):
    """
    From POS columns (each cell = list[str]), flatten into a single tokens list per row.
    Assumes columns may be missing or contain NaN.
    """
    vals = df[pos_cols].apply(
        lambda row: sum((lst for lst in row if isinstance(lst, (list, tuple))), []),
        axis=1
    )
    return vals

def _clean_list(tokens):
    return [clean_token(x) for x in tokens if clean_token(x)]

def _encode_df(df, stoi, max_len=256):
    def _encode_row(row):
        ids, attn = encode_tokens(row["tokens_sw"], stoi, max_len=max_len)
        return {"input_ids": ids, "attention_mask": attn, "label": label2id[row["class"]]}
    return [ _encode_row(r) for _, r in df.iterrows() ]

def _make_loaders(train_records, valid_records, pad_id, bs=32):
    train_ds = SimpleListDataset(train_records)
    valid_ds = SimpleListDataset(valid_records)
    train_loader = DataLoader(
        train_ds, batch_size=bs, shuffle=True,
        collate_fn=lambda b: collate_batch(b, pad_id)
    )
    valid_loader = DataLoader(
        valid_ds, batch_size=bs, shuffle=False,
        collate_fn=lambda b: collate_batch(b, pad_id)
    )
    return train_loader, valid_loader

def _make_optimizer_scheduler(model, lr=3e-4, weight_decay=0.01, warmup_steps=0):
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    # optional simple scheduler (linear warmup -> none): keep compatible with your train_one_epoch signature
    scheduler = None
    return optimizer, scheduler

# ---- main experiment runner ----
def run_all_stopword_pos_experiments(
    df: pd.DataFrame,
    save_csv_path: str | None = "./experiment_results.csv",
    save_ckpt_dir: str = "./ckpts",
    batch_size: int = 32,
    max_len: int = 256,
    lr: float = 3e-4,
    weight_decay: float = 0.01,
    seed: int = 42,
):
    """
    Runs 8 experiment cases:
      1) No stopwords,      POS=5 (Noun,Verb,Adjective,Adverb,Exclamation)
      2) No stopwords,      POS=3 (Noun,Verb,Adjective)
      3) stopwords only,    POS=5
      4) stopwords only,    POS=3
      5) stopwords|stop_words,              POS=5
      6) stopwords|stop_words,              POS=3
      7) stopwords|stop_words|top80_new,    POS=5
      8) stopwords|stop_words|top80_new,    POS=3

    Assumes globals exist: stopwords, stop_words, top80_new_stops, label2id, id2label, epochs,
    and helper functions/classes already defined in your script:
      - build_vocab, encode_tokens, collate_batch, create_model, clean_token
      - train_one_epoch(model, train_loader, optimizer, device, class_weights=..., grad_clip=..., scheduler=..., use_amp=...)
      - evaluate(model, valid_loader, device) -> dict with 'f1_macro' (and maybe 'acc')
    """
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    # POS sets
    POS5 = ["Noun", "Verb", "Adjective", "Adverb", "Exclamation"]
    POS3 = ["Noun", "Verb", "Adjective"]

    # stopword combos
    sw_none = set()
    sw_stop = set(stopwords)
    sw_stop_plus = set(stopwords) | set(stop_words)
    sw_stop_plus_top80 = sw_stop_plus | set(top80_new_stops)

    # define the 8 cases
    CASES = [
        {"name": "noSW_POS5",          "pos_cols": POS5, "sw_set": sw_none,            "sw_desc": "none"},
        {"name": "noSW_POS3",          "pos_cols": POS3, "sw_set": sw_none,            "sw_desc": "none"},
        {"name": "stop_only_POS5",     "pos_cols": POS5, "sw_set": sw_stop,            "sw_desc": "stopwords"},
        {"name": "stop_only_POS3",     "pos_cols": POS3, "sw_set": sw_stop,            "sw_desc": "stopwords"},
        {"name": "stop_stopwords_POS5","pos_cols": POS5, "sw_set": sw_stop_plus,       "sw_desc": "stopwords|stop_words"},
        {"name": "stop_stopwords_POS3","pos_cols": POS3, "sw_set": sw_stop_plus,       "sw_desc": "stopwords|stop_words"},
        {"name": "stop_stop80_POS5",   "pos_cols": POS5, "sw_set": sw_stop_plus_top80, "sw_desc": "stopwords|stop_words|top80"},
        {"name": "stop_stop80_POS3",   "pos_cols": POS3, "sw_set": sw_stop_plus_top80, "sw_desc": "stopwords|stop_words|top80"},
    ]

    # ensure checkpoint dir
    import os
    os.makedirs(save_ckpt_dir, exist_ok=True)

    rows = []
    start_all = time.time()

    # split once for fair comparison (same train/valid across cases)
    train_df, valid_df = train_test_split(
        df, test_size=0.2, random_state=seed, stratify=df["class"]
    )

    for case in CASES:
        t0 = time.time()
        case_name = case["name"]
        pos_cols = case["pos_cols"]
        sw_set   = case["sw_set"]
        sw_desc  = case["sw_desc"]

        # 1) Compose tokens per row from selected POS
        #    We don’t mutate original df; work on copies with new columns.
        tr = train_df.copy()
        va = valid_df.copy()

        tr["tokens"] = _collect_tokens_rowwise(tr, pos_cols).apply(_clean_list)
        va["tokens"] = _collect_tokens_rowwise(va, pos_cols).apply(_clean_list)

        # 2) Apply stopwords (if any)
        if len(sw_set) > 0:
            tr["tokens_sw"] = tr["tokens"].apply(lambda toks: [t for t in toks if t not in sw_set])
            va["tokens_sw"] = va["tokens"].apply(lambda toks: [t for t in toks if t not in sw_set])
            used_sw = f"|SW={sw_desc}"
        else:
            # just copy tokens → tokens_sw
            tr["tokens_sw"] = tr["tokens"]
            va["tokens_sw"] = va["tokens"]
            used_sw = "|SW=none"

        # 3) Vocab
        stoi, itos, counter = build_vocab(tr["tokens_sw"], min_freq=1, max_size=20000)
        pad_id = stoi["<pad>"]

        # 4) Encode
        train_records = _encode_df(tr, stoi, max_len=max_len)
        valid_records = _encode_df(va, stoi, max_len=max_len)

        # 5) Loaders
        train_loader, valid_loader = _make_loaders(train_records, valid_records, pad_id, bs=batch_size)

        # 6) Class weights on *train split*
        train_labels = [rec["label"] for rec in train_records]
        cnt = Counter(train_labels)
        weights = torch.tensor([1.0 / max(cnt.get(i, 1), 1) for i in range(len(label2id))], dtype=torch.float)
        weights = weights / weights.mean()
        class_weights = weights.to(device)

        # 7) Model / Optim / Sched
        model = create_model(
            vocab_size=len(itos),
            num_classes=len(label2id),
            pad_id=pad_id,
            emb_dim=256, nhead=8, num_layers=4, dim_ff=512, max_len=512, dropout=0.1, use_cls_pool=True
        ).to(device)

        optimizer, scheduler = _make_optimizer_scheduler(model, lr=lr, weight_decay=weight_decay)

        # 8) Train loop
        best_f1 = -1.0
        best_acc = None
        best_epoch = -1
        best_ckpt_path = None

        for ep in range(1, epochs + 1):
            tr_log = train_one_epoch(
                model, train_loader, optimizer, device,
                class_weights=class_weights, grad_clip=1.0, scheduler=scheduler, use_amp=True
            )
            va_log = evaluate(model, valid_loader, device)  # dict with f1_macro (and maybe 'acc', 'loss', etc.)

            f1 = va_log.get("f1_macro", None)
            acc = va_log.get("acc", None)

            if f1 is not None and f1 > best_f1:
                best_f1 = float(f1)
                best_acc = float(acc) if acc is not None else None
                best_epoch = ep
                # save
                ckpt_name = f"{case_name}_F1-{best_f1:.4f}_ep{best_epoch}.pt"
                best_ckpt_path = os.path.join(save_ckpt_dir, ckpt_name)
                torch.save(model.state_dict(), best_ckpt_path)
                print(f"[{case_name}] ✔ New best at epoch {ep}: f1_macro={best_f1:.4f} (ckpt saved)")

        # 9) record results
        elapsed = time.time() - t0
        rows.append({
            "case": case_name,
            "pos_cols": ",".join(pos_cols),
            "stopwords_mode": sw_desc,
            "vocab_size": len(itos),
            "best_epoch": best_epoch,
            "best_f1_macro": best_f1 if best_f1 is not None else float("nan"),
            "best_acc": best_acc if best_acc is not None else float("nan"),
            "ckpt_path": best_ckpt_path,
            "time_sec": round(elapsed, 2),
        })

    total_elapsed = time.time() - start_all
    print(f"All experiments finished in {total_elapsed/60:.1f} min")

    results_df = pd.DataFrame(rows).sort_values(by=["best_f1_macro"], ascending=False).reset_index(drop=True)
    if save_csv_path:
        results_df.to_csv(save_csv_path, index=False, encoding="utf-8-sig")
        print(f"Results saved to {save_csv_path}")
    return results_df


In [149]:

# 이미 준비된 변수/함수: train_tokens, stopwords, stop_words, top80_new_stops,
# label2id, id2label, epochs, build_vocab, encode_tokens, collate_batch, create_model,
# clean_token, train_one_epoch, evaluate

results_df = run_all_stopword_pos_experiments(
    train_tokens,
    save_csv_path="./experiment_results.csv",
    save_ckpt_dir="./ckpts",
    batch_size=32,
    max_len=256,
    lr=3e-4,
    weight_decay=0.01,
    seed=42,
)
results_df.head()

                                                        

[noSW_POS5] ✔ New best at epoch 1: f1_macro=0.6989 (ckpt saved)


                                                        

[noSW_POS5] ✔ New best at epoch 2: f1_macro=0.7628 (ckpt saved)


                                                        

[noSW_POS5] ✔ New best at epoch 3: f1_macro=0.7634 (ckpt saved)


                                                        

[noSW_POS5] ✔ New best at epoch 4: f1_macro=0.7924 (ckpt saved)


                                                        

[noSW_POS5] ✔ New best at epoch 9: f1_macro=0.8219 (ckpt saved)


                                                        

[noSW_POS5] ✔ New best at epoch 18: f1_macro=0.8293 (ckpt saved)


                                                        

[noSW_POS3] ✔ New best at epoch 1: f1_macro=0.6752 (ckpt saved)


                                                        

[noSW_POS3] ✔ New best at epoch 2: f1_macro=0.7600 (ckpt saved)


                                                        

[noSW_POS3] ✔ New best at epoch 3: f1_macro=0.7732 (ckpt saved)


                                                        

[noSW_POS3] ✔ New best at epoch 5: f1_macro=0.8006 (ckpt saved)


                                                        

[noSW_POS3] ✔ New best at epoch 7: f1_macro=0.8076 (ckpt saved)


                                                        

[noSW_POS3] ✔ New best at epoch 10: f1_macro=0.8118 (ckpt saved)


                                                        

[noSW_POS3] ✔ New best at epoch 12: f1_macro=0.8287 (ckpt saved)


                                                        

[stop_only_POS5] ✔ New best at epoch 1: f1_macro=0.6785 (ckpt saved)


                                                        

[stop_only_POS5] ✔ New best at epoch 2: f1_macro=0.7179 (ckpt saved)


                                                        

[stop_only_POS5] ✔ New best at epoch 3: f1_macro=0.7669 (ckpt saved)


                                                        

[stop_only_POS5] ✔ New best at epoch 4: f1_macro=0.7947 (ckpt saved)


                                                        

[stop_only_POS5] ✔ New best at epoch 11: f1_macro=0.7952 (ckpt saved)


                                                        

[stop_only_POS5] ✔ New best at epoch 13: f1_macro=0.8150 (ckpt saved)


                                                        

[stop_only_POS5] ✔ New best at epoch 17: f1_macro=0.8247 (ckpt saved)


                                                        

[stop_only_POS3] ✔ New best at epoch 1: f1_macro=0.6928 (ckpt saved)


                                                        

[stop_only_POS3] ✔ New best at epoch 2: f1_macro=0.7413 (ckpt saved)


                                                        

[stop_only_POS3] ✔ New best at epoch 3: f1_macro=0.7646 (ckpt saved)


                                                        

[stop_only_POS3] ✔ New best at epoch 5: f1_macro=0.7846 (ckpt saved)


                                                        

[stop_only_POS3] ✔ New best at epoch 7: f1_macro=0.7877 (ckpt saved)


                                                        

[stop_only_POS3] ✔ New best at epoch 10: f1_macro=0.7906 (ckpt saved)


                                                        

[stop_only_POS3] ✔ New best at epoch 17: f1_macro=0.8000 (ckpt saved)


                                                        

[stop_only_POS3] ✔ New best at epoch 19: f1_macro=0.8114 (ckpt saved)


                                                        

[stop_stopwords_POS5] ✔ New best at epoch 1: f1_macro=0.6592 (ckpt saved)


                                                        

[stop_stopwords_POS5] ✔ New best at epoch 2: f1_macro=0.7436 (ckpt saved)


                                                        

[stop_stopwords_POS5] ✔ New best at epoch 4: f1_macro=0.7750 (ckpt saved)


                                                        

[stop_stopwords_POS5] ✔ New best at epoch 5: f1_macro=0.7772 (ckpt saved)


                                                        

[stop_stopwords_POS5] ✔ New best at epoch 6: f1_macro=0.7957 (ckpt saved)


                                                        

[stop_stopwords_POS5] ✔ New best at epoch 9: f1_macro=0.7960 (ckpt saved)


                                                        

[stop_stopwords_POS5] ✔ New best at epoch 10: f1_macro=0.8086 (ckpt saved)


                                                        

[stop_stopwords_POS3] ✔ New best at epoch 1: f1_macro=0.7196 (ckpt saved)


                                                        

[stop_stopwords_POS3] ✔ New best at epoch 2: f1_macro=0.7687 (ckpt saved)


                                                        

[stop_stopwords_POS3] ✔ New best at epoch 3: f1_macro=0.7800 (ckpt saved)


                                                        

[stop_stopwords_POS3] ✔ New best at epoch 5: f1_macro=0.8061 (ckpt saved)


                                                        

[stop_stopwords_POS3] ✔ New best at epoch 11: f1_macro=0.8064 (ckpt saved)


                                                        

[stop_stopwords_POS3] ✔ New best at epoch 14: f1_macro=0.8064 (ckpt saved)


                                                        

[stop_stopwords_POS3] ✔ New best at epoch 16: f1_macro=0.8103 (ckpt saved)


                                                        

[stop_stopwords_POS3] ✔ New best at epoch 17: f1_macro=0.8105 (ckpt saved)


                                                        

[stop_stopwords_POS3] ✔ New best at epoch 18: f1_macro=0.8117 (ckpt saved)


                                                        

[stop_stopwords_POS3] ✔ New best at epoch 19: f1_macro=0.8146 (ckpt saved)


                                                        

[stop_stop80_POS5] ✔ New best at epoch 1: f1_macro=0.7286 (ckpt saved)


                                                        

[stop_stop80_POS5] ✔ New best at epoch 2: f1_macro=0.7634 (ckpt saved)


                                                        

[stop_stop80_POS5] ✔ New best at epoch 4: f1_macro=0.7948 (ckpt saved)


                                                        

[stop_stop80_POS5] ✔ New best at epoch 5: f1_macro=0.7961 (ckpt saved)


                                                        

[stop_stop80_POS5] ✔ New best at epoch 9: f1_macro=0.8128 (ckpt saved)


                                                        

[stop_stop80_POS3] ✔ New best at epoch 1: f1_macro=0.6904 (ckpt saved)


                                                        

[stop_stop80_POS3] ✔ New best at epoch 2: f1_macro=0.7422 (ckpt saved)


                                                        

[stop_stop80_POS3] ✔ New best at epoch 3: f1_macro=0.7663 (ckpt saved)


                                                        

[stop_stop80_POS3] ✔ New best at epoch 4: f1_macro=0.7712 (ckpt saved)


                                                        

[stop_stop80_POS3] ✔ New best at epoch 5: f1_macro=0.7957 (ckpt saved)


                                                        

[stop_stop80_POS3] ✔ New best at epoch 8: f1_macro=0.7997 (ckpt saved)


                                                        

[stop_stop80_POS3] ✔ New best at epoch 10: f1_macro=0.8052 (ckpt saved)


                                                        

[stop_stop80_POS3] ✔ New best at epoch 19: f1_macro=0.8066 (ckpt saved)


                                                        

All experiments finished in 143.5 min
Results saved to ./experiment_results.csv




Unnamed: 0,case,pos_cols,stopwords_mode,vocab_size,best_epoch,best_f1_macro,best_acc,ckpt_path,time_sec
0,noSW_POS5,"Noun,Verb,Adjective,Adverb,Exclamation",none,8600,18,0.829331,0.832673,./ckpts\noSW_POS5_F1-0.8293_ep18.pt,1423.31
1,noSW_POS3,"Noun,Verb,Adjective",none,8391,12,0.828749,0.831683,./ckpts\noSW_POS3_F1-0.8287_ep12.pt,1351.11
2,stop_only_POS5,"Noun,Verb,Adjective,Adverb,Exclamation",stopwords,8585,17,0.824681,0.826733,./ckpts\stop_only_POS5_F1-0.8247_ep17.pt,1105.07
3,stop_stopwords_POS3,"Noun,Verb,Adjective",stopwords|stop_words,8361,19,0.814552,0.815842,./ckpts\stop_stopwords_POS3_F1-0.8146_ep19.pt,1019.2
4,stop_stop80_POS5,"Noun,Verb,Adjective,Adverb,Exclamation",stopwords|stop_words|top80,8441,9,0.812804,0.814851,./ckpts\stop_stop80_POS5_F1-0.8128_ep9.pt,841.61


In [150]:
result = pd.read_csv('./experiment_results.csv')
result

Unnamed: 0,case,pos_cols,stopwords_mode,vocab_size,best_epoch,best_f1_macro,best_acc,ckpt_path,time_sec
0,noSW_POS5,"Noun,Verb,Adjective,Adverb,Exclamation",none,8600,18,0.829331,0.832673,./ckpts\noSW_POS5_F1-0.8293_ep18.pt,1423.31
1,noSW_POS3,"Noun,Verb,Adjective",none,8391,12,0.828749,0.831683,./ckpts\noSW_POS3_F1-0.8287_ep12.pt,1351.11
2,stop_only_POS5,"Noun,Verb,Adjective,Adverb,Exclamation",stopwords,8585,17,0.824681,0.826733,./ckpts\stop_only_POS5_F1-0.8247_ep17.pt,1105.07
3,stop_stopwords_POS3,"Noun,Verb,Adjective",stopwords|stop_words,8361,19,0.814552,0.815842,./ckpts\stop_stopwords_POS3_F1-0.8146_ep19.pt,1019.2
4,stop_stop80_POS5,"Noun,Verb,Adjective,Adverb,Exclamation",stopwords|stop_words|top80,8441,9,0.812804,0.814851,./ckpts\stop_stop80_POS5_F1-0.8128_ep9.pt,841.61
5,stop_only_POS3,"Noun,Verb,Adjective",stopwords,8376,19,0.81143,0.813861,./ckpts\stop_only_POS3_F1-0.8114_ep19.pt,1014.41
6,stop_stopwords_POS5,"Noun,Verb,Adjective,Adverb,Exclamation",stopwords|stop_words,8569,10,0.808601,0.811881,./ckpts\stop_stopwords_POS5_F1-0.8086_ep10.pt,1067.42
7,stop_stop80_POS3,"Noun,Verb,Adjective",stopwords|stop_words|top80,8237,19,0.806648,0.807921,./ckpts\stop_stop80_POS3_F1-0.8066_ep19.pt,788.27


추가로 진행할 내용
- morphs 사용해서/ pos 사용 코드 재활용할때 모든 품사 사용하도록 해서 모델 학습 다시 진행

