In [1]:
!conda install -y -c conda-forge openjdk=11 jpype1
!pip install konlpy==0.6.0

Channels:
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done


    current version: 25.5.1
    latest version: 25.9.1

Please update conda by running

    $ conda update -n base -c conda-forge conda



## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - jpype1
    - openjdk=11


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    alsa-lib-1.2.14            |       hb9d3cd8_0         553 KB  conda-forge
    ca-certificates-2025.10.5  |       hbd8a1cb_0         152 KB  conda-forge
    cairo-1.18.4               |       h3394656_0         955 KB  conda-forge
    certifi-2025.10.5          |     pyhd8ed1ab_0         156 KB  conda-forge
    font-ttf-dejavu-sans-mono-2.37|       hab24e00_0         388 KB  conda-forge
    font-ttf-inconsolata-3.000 |       h77eed37_0          94 KB  conda-forge
    font-tt

In [2]:
# List the JVM directory inside the conda environment to check how the JDK files are laid out.
!ls $CONDA_PREFIX/lib/jvm

bin  conf  include  jmods  legal  lib  man


In [None]:
!conda install -y -c conda-forge openjdk=11 jpype1
!pip install -U konlpy

Channels:
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: \ 

In [None]:
import os, sys

# The `ls $CONDA_PREFIX/lib/jvm` output earlier in this notebook shows bin/, conf/, lib/ ...
# directly inside lib/jvm, i.e. that directory itself is the JDK home and there is no
# "java-11-openjdk" subfolder. Probe both layouts and keep the one that really contains
# the `java` launcher, defaulting to lib/jvm.
jvm_root = os.path.join(os.environ["CONDA_PREFIX"], "lib", "jvm")
nested_home = os.path.join(jvm_root, "java-11-openjdk")
java_home = next(
    (
        candidate
        for candidate in (jvm_root, nested_home)
        if os.path.isfile(os.path.join(candidate, "bin", "java"))
    ),
    jvm_root,
)
os.environ["JAVA_HOME"] = java_home

# Append the JDK's bin directory to PATH only once, so re-running this cell does not keep
# growing PATH with duplicate entries.
java_bin = os.path.join(java_home, "bin")
if java_bin not in os.environ["PATH"].split(os.pathsep):
    os.environ["PATH"] += os.pathsep + java_bin

In [None]:
# =========================================================
# 1) Libraries
# =========================================================
import os, json, math, random
import numpy as np
import pandas as pd
from collections import Counter, defaultdict

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from konlpy.tag import Okt

# Reproducibility: seed Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices)
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Output directory for saved artifacts; created here if it does not exist yet
SAVE_DIR = "work/team2025_pt"
os.makedirs(SAVE_DIR, exist_ok=True)

In [None]:
# =========================================================
# 2) Label indexing
# =========================================================
train = pd.read_csv("team2025/merged_train.csv")

# Sorted unique class names define a stable class -> integer id assignment
labels = sorted(train['class'].unique())
label2id = dict(zip(labels, range(len(labels))))
id2label = dict(enumerate(labels))

# Integer-encoded target column derived from the class name
encoded_labels = train['class'].map(label2id)
train['label'] = encoded_labels.astype(int)

# Persist the id -> class name mapping next to the other saved artifacts
label_json_path = os.path.join(SAVE_DIR, "label_classes.json")
with open(label_json_path, "w", encoding="utf-8") as f:
    json.dump(id2label, f, ensure_ascii=False, indent=2)

In [None]:
# =========================================================
# 3) Stratified 8:2 split (hand-rolled)
# =========================================================
def stratified_split_idx(y, test_size=0.2, seed=42):
    """Split sample positions into train / validation sets, class by class.

    Positions are grouped by ``int(label)``; each group is shuffled and roughly
    ``test_size`` of it (at least one sample) goes to validation. Every class
    always keeps at least one sample for training, so a class with a single
    sample is placed in the training set only.

    Args:
        y: Sequence of integer-convertible class labels, one per sample.
        test_size: Fraction of each class to assign to validation.
        seed: Seed for ``np.random.default_rng``; the same seed yields the same split.

    Returns:
        ``(train_idx, valid_idx)``: two integer NumPy arrays of positions into ``y``,
        each shuffled, together covering every position exactly once.
    """
    rng = np.random.default_rng(seed)
    y = np.asarray(y)

    # Collect the positions belonging to each class, in order of first appearance
    cls_indices = defaultdict(list)
    for i, yi in enumerate(y):
        cls_indices[int(yi)].append(i)

    train_idx, valid_idx = [], []
    for idxs in cls_indices.values():
        idxs = np.array(idxs)
        rng.shuffle(idxs)
        n_valid = max(1, int(round(len(idxs) * test_size)))
        # Never move a whole class into validation: the model must see every class
        # during training (for a singleton class this makes n_valid 0).
        n_valid = min(n_valid, len(idxs) - 1)
        valid_idx.extend(idxs[:n_valid].tolist())
        train_idx.extend(idxs[n_valid:].tolist())

    # Mix the classes so neither split is ordered by class
    rng.shuffle(train_idx)
    rng.shuffle(valid_idx)

    # Explicit integer dtype: an empty list would otherwise become a float64 array,
    # which cannot be used for positional indexing.
    return np.array(train_idx, dtype=int), np.array(valid_idx, dtype=int)

# Compute the split positions from the encoded labels, then materialize both frames
# with a fresh 0..n-1 index.
split_labels = train['label'].values
tr_idx, va_idx = stratified_split_idx(split_labels, test_size=0.2, seed=SEED)
tr_df, va_df = (
    train.iloc[positions].reset_index(drop=True)
    for positions in (tr_idx, va_idx)
)

In [None]:
# =========================================================
# 4) OKT tokenization
# =========================================================
# Single shared Okt tagger instance, used by tokenize_okt below.
okt = Okt()

def tokenize_okt(text, norm=True, stem=False):
    """Split ``text`` into morphemes using the module-level ``okt`` tagger.

    Args:
        text: Value to tokenize; it is converted with ``str()`` and stripped of
            leading/trailing whitespace before tagging.
        norm: Forwarded to ``okt.morphs`` as its ``norm`` argument.
        stem: Forwarded to ``okt.morphs`` as its ``stem`` argument.

    Returns:
        The result of ``okt.morphs`` for the cleaned string.
    """
    s = str(text).strip()
    # Insert any additional normalization logic here if needed
    # `norm` was previously ignored (both branches of the old conditional made the same
    # call without it), so pass it through explicitly.
    return okt.morphs(s, norm=norm, stem=stem)

# Tokenize every conversation of the train and validation splits with the default settings
tr_tokens = list(map(tokenize_okt, tr_df['conversation']))
va_tokens = list(map(tokenize_okt, va_df['conversation']))