In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [None]:
!pip install kobert-transformers transformers datasets seqeval --quiet

from kobert_transformers import get_tokenizer
from transformers import BertForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

# 라벨 정의
label_list = ["O", "B-PER", "B-AGE", "B-ORG", "B-DISEASE", "B-LOC"]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}

# 샘플 데이터
train_data = {
    "tokens": [
        ["김철수", "는", "서울대병원", "에서", "위암", "치료", "를", "받았다", "."],
        ["70세", "남성", "환자", "는", "강남구", "에", "거주", "하고", "있다", "."]
    ],
    "ner_tags": [
        [1, 0, 3, 0, 4, 0, 0, 0, 0],
        [2, 2, 0, 0, 5, 0, 0, 0, 0, 0]
    ]
}

tokenizer = get_tokenizer()
model = BertForTokenClassification.from_pretrained("monologg/kobert", num_labels=len(label_list))

# 라벨 정렬
def tokenize_and_align_labels(example):
    tokens = example["tokens"]
    labels = example["ner_tags"]
    encoding = tokenizer(tokens, is_split_into_words=True, padding="max_length", truncation=True, return_tensors="pt")
    input_len = encoding["input_ids"].shape[1]
    aligned_labels = labels[:input_len] + [-100] * (input_len - len(labels))
    return {
        "input_ids": encoding["input_ids"][0],
        "attention_mask": encoding["attention_mask"][0],
        "labels": aligned_labels
    }

dataset = Dataset.from_dict(train_data)
tokenized_dataset = dataset.map(tokenize_and_align_labels)

# 평가지표
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=2)
    labels = p.label_ids
    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_preds = [[label_list[p] for (p, l) in zip(pred, label) if l != -100] for pred, label in zip(preds, labels)]
    precision, recall, f1, _ = precision_recall_fscore_support(
        sum(true_labels, []), sum(true_preds, []), average="macro"
    )
    return {"precision": precision, "recall": recall, "f1": f1}

# ⚠️ 구버전 대응: evaluation_strategy 제거
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    num_train_epochs=3
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss


TypeError: KoBertTokenizer.save_vocabulary() got an unexpected keyword argument 'filename_prefix'

In [None]:
# ✅ KoBERT NER 전체 학습 코드 (Colab 호환, 저장 생략, wandb 제거 포함)
!pip install kobert-transformers transformers datasets seqeval --quiet

import os
os.environ["WANDB_DISABLED"] = "true"  # wandb 비활성화

from kobert_transformers import get_tokenizer
from transformers import BertForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

# 라벨 정의
label_list = ["O", "B-PER", "B-AGE", "B-ORG", "B-DISEASE", "B-LOC"]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}

# 샘플 데이터 (자유롭게 교체 가능)
train_data = {
    "tokens": [
        ["김철수", "는", "서울대병원", "에서", "위암", "치료", "를", "받았다", "."],
        ["70세", "남성", "환자", "는", "강남구", "에", "거주", "하고", "있다", "."]
    ],
    "ner_tags": [
        [1, 0, 3, 0, 4, 0, 0, 0, 0],
        [2, 2, 0, 0, 5, 0, 0, 0, 0, 0]
    ]
}

# 토크나이저 및 모델 로드
tokenizer = get_tokenizer()
model = BertForTokenClassification.from_pretrained("monologg/kobert", num_labels=len(label_list))

# 토큰 정렬 및 라벨 정렬 함수
def tokenize_and_align_labels(example):
    tokens = example["tokens"]
    labels = example["ner_tags"]
    encoding = tokenizer(tokens, is_split_into_words=True, padding="max_length", truncation=True, return_tensors="pt")
    input_len = encoding["input_ids"].shape[1]
    aligned_labels = labels[:input_len] + [-100] * (input_len - len(labels))
    return {
        "input_ids": encoding["input_ids"][0],
        "attention_mask": encoding["attention_mask"][0],
        "labels": aligned_labels
    }

# 데이터셋 변환
dataset = Dataset.from_dict(train_data)
tokenized_dataset = dataset.map(tokenize_and_align_labels)

# 평가 지표 정의
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=2)
    labels = p.label_ids
    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_preds = [[label_list[p] for (p, l) in zip(pred, label) if l != -100] for pred, label in zip(preds, labels)]
    precision, recall, f1, _ = precision_recall_fscore_support(
        sum(true_labels, []), sum(true_preds, []), average="macro"
    )
    return {"precision": precision, "recall": recall, "f1": f1}

# Trainer 설정 (저장/로깅 모두 제거)
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_strategy="no",     # 모델 저장 X
    report_to="none"        # wandb/MLflow X
)

# Trainer 초기화 (tokenizer 제거로 오류 방지)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    compute_metrics=compute_metrics
)

# 학습 시작
trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Step,Training Loss


TrainOutput(global_step=3, training_loss=1.5093096097310383, metrics={'train_runtime': 0.7198, 'train_samples_per_second': 8.336, 'train_steps_per_second': 4.168, 'total_flos': 1567837237248.0, 'train_loss': 1.5093096097310383, 'epoch': 3.0})

In [None]:
metrics = trainer.evaluate()
print(metrics)


{'eval_loss': 1.1146095991134644, 'eval_precision': 0.11403508771929825, 'eval_recall': 0.16666666666666666, 'eval_f1': 0.13541666666666666, 'eval_runtime': 0.1251, 'eval_samples_per_second': 15.992, 'eval_steps_per_second': 7.996, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
predictions = trainer.predict(tokenized_dataset)
print(predictions.metrics)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'test_loss': 1.1146095991134644, 'test_precision': 0.11403508771929825, 'test_recall': 0.16666666666666666, 'test_f1': 0.13541666666666666, 'test_runtime': 0.1428, 'test_samples_per_second': 14.001, 'test_steps_per_second': 7.0}


In [None]:
preds = np.argmax(predictions.predictions, axis=2)
labels = predictions.label_ids

for i in range(len(labels)):
    print("🔹 문장", i+1)
    print("GT: ", [label_list[t] for t in labels[i] if t != -100])
    print("PR: ", [label_list[p] for (p, l) in zip(preds[i], labels[i]) if l != -100])
    print()


🔹 문장 1
GT:  ['B-PER', 'O', 'B-ORG', 'O', 'B-DISEASE', 'O', 'O', 'O', 'O']
PR:  ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

🔹 문장 2
GT:  ['B-AGE', 'B-AGE', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O']
PR:  ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']



In [None]:
# ✅ 1. 라이브러리 설치
!pip install kobert-transformers transformers datasets seqeval --quiet

# ✅ 2. 환경 설정
import os
os.environ["WANDB_DISABLED"] = "true"

from kobert_transformers import get_tokenizer
from transformers import BertForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

# ✅ 3. 라벨 정의
label_list = ["O", "B-PER", "B-AGE", "B-ORG", "B-DISEASE"]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}

# ✅ 4. 증강된 학습 데이터 (20문장)
train_data = {
    "tokens": [
        ['김철수','는','70세','환자로','분당서울대병원','에서','당뇨','치료','를','받았다','.'],
        ['이영희','는','60세','여성으로','강남세브란스병원','에서','위암','수술','을','받았다','.'],
        ['정우성','은','45세','남성으로','서울아산병원','에서','고혈압','진료','를','받았다','.'],
        ['박민정','은','35세','환자로','분당서울대병원','에서','간암','치료','를','받았다','.'],
        ['최유리','는','55세','여성으로','강남세브란스병원','에서','당뇨','수술','을','받았다','.'],
        ['김지훈','은','50세','남성으로','서울아산병원','에서','위암','진단','을','받았다','.'],
        ['이민호','는','65세','환자로','분당서울대병원','에서','간암','진료','를','받았다','.'],
        ['한지민','은','40세','여성으로','강남세브란스병원','에서','고혈압','치료','를','받았다','.'],
        ['박서준','은','72세','남성으로','서울아산병원','에서','당뇨','수술','을','받았다','.'],
        ['강다현','은','38세','환자로','분당서울대병원','에서','위암','치료','를','받았다','.'],
        ['김예린','은','47세','여성으로','서울아산병원','에서','고혈압','진료','를','받았다','.'],
        ['이준기','는','69세','남성으로','강남세브란스병원','에서','간암','수술','을','받았다','.'],
        ['정다은','은','53세','여성으로','분당서울대병원','에서','당뇨','치료','를','받았다','.'],
        ['최현우','는','61세','남성으로','서울아산병원','에서','위암','진료','를','받았다','.'],
        ['박하늘','은','36세','환자로','강남세브란스병원','에서','고혈압','수술','을','받았다','.'],
        ['이수정','은','58세','여성으로','분당서울대병원','에서','간암','진단','을','받았다','.'],
        ['장민재','는','66세','남성으로','서울아산병원','에서','당뇨','진료','를','받았다','.'],
        ['유소연','은','44세','여성으로','강남세브란스병원','에서','위암','치료','를','받았다','.'],
        ['오세훈','은','70세','환자로','분당서울대병원','에서','고혈압','수술','을','받았다','.'],
        ['임하늘','은','52세','여성으로','서울아산병원','에서','간암','치료','를','받았다','.']
    ],
    "ner_tags": [
        ['B-PER','O','B-AGE','O','B-ORG','O','B-DISEASE','O','O','O','O']
    ] * 20
}

# ✅ 5. Tokenizer 및 Model 로드
tokenizer = get_tokenizer()
model = BertForTokenClassification.from_pretrained("monologg/kobert", num_labels=len(label_list))

# ✅ 6. 토큰화 함수
def tokenize_and_align_labels(example):
    tokens = example["tokens"]
    labels = example["ner_tags"]
    encoding = tokenizer(tokens, is_split_into_words=True, padding="max_length", truncation=True, return_tensors="pt")
    input_len = encoding["input_ids"].shape[1]
    aligned_labels = labels[:input_len] + [-100] * (input_len - len(labels))
    return {
        "input_ids": encoding["input_ids"][0],
        "attention_mask": encoding["attention_mask"][0],
        "labels": [label2id.get(t, 0) if t != -100 else -100 for t in aligned_labels]
    }

# ✅ 7. 데이터셋 생성 및 변환
dataset = Dataset.from_dict(train_data)
tokenized_dataset = dataset.map(tokenize_and_align_labels)

# ✅ 8. 평가 함수
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=2)
    labels = p.label_ids
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_preds = [[id2label[p] for (p, l) in zip(pred, label) if l != -100] for pred, label in zip(preds, labels)]
    precision, recall, f1, _ = precision_recall_fscore_support(
        sum(true_labels, []), sum(true_preds, []), average="macro", zero_division=0
    )
    return {"precision": precision, "recall": recall, "f1": f1}

# ✅ 9. 학습 파라미터
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    save_strategy="no",
    report_to="none"
)

# ✅ 10. Trainer 생성 및 학습 실행
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    compute_metrics=compute_metrics
)

trainer.train()
metrics = trainer.evaluate()
print(metrics)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Step,Training Loss


{'eval_loss': 0.3641359210014343, 'eval_precision': 0.8801901408450705, 'eval_recall': 0.8542857142857143, 'eval_f1': 0.8623180627435947, 'eval_runtime': 0.8386, 'eval_samples_per_second': 23.849, 'eval_steps_per_second': 3.577, 'epoch': 5.0}


In [None]:
# ✅ 1. 필수 라이브러리 설치 (처음 한 번만)
!pip install kobert-transformers transformers datasets seqeval --quiet

# ✅ 2. 전체 학습 코드
import os
os.environ["WANDB_DISABLED"] = "true"

import pandas as pd
import numpy as np
import ast
from datasets import Dataset
from transformers import BertForTokenClassification, Trainer, TrainingArguments
from kobert_transformers import get_tokenizer
from sklearn.metrics import precision_recall_fscore_support

# ✅ 3. CSV 경로 (수정 필요 시 아래만 변경)
df = pd.read_csv("KoBERT______________300__.csv")

# ✅ 4. CSV 로드 및 파싱
df = pd.read_csv(file_path)
df["tokens"] = df["tokens"].apply(ast.literal_eval)
df["ner_tags"] = df["ner_tags"].apply(ast.literal_eval)
dataset = Dataset.from_pandas(df)

# ✅ 5. 라벨 정의
label_list = ["O", "B-PER", "B-AGE", "B-ORG", "B-DISEASE"]
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for i, l in enumerate(label_list)}

# ✅ 6. Tokenizer & Model
tokenizer = get_tokenizer()
model = BertForTokenClassification.from_pretrained("monologg/kobert", num_labels=len(label_list))

# ✅ 7. 토큰화 함수
def tokenize_and_align_labels(example):
    tokens = example["tokens"]
    labels = example["ner_tags"]
    encoding = tokenizer(tokens, is_split_into_words=True, padding="max_length", truncation=True, return_tensors="pt")
    input_len = encoding["input_ids"].shape[1]
    aligned_labels = labels[:input_len] + [-100] * (input_len - len(labels))
    return {
        "input_ids": encoding["input_ids"][0],
        "attention_mask": encoding["attention_mask"][0],
        "labels": [label2id.get(t, 0) if t != -100 else -100 for t in aligned_labels]
    }

tokenized_dataset = dataset.map(tokenize_and_align_labels)

# ✅ 8. 평가 함수
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=2)
    labels = p.label_ids
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_preds = [[id2label[p] for (p, l) in zip(pred, label) if l != -100] for pred, label in zip(preds, labels)]
    precision, recall, f1, _ = precision_recall_fscore_support(
        sum(true_labels, []), sum(true_preds, []), average="macro", zero_division=0
    )
    return {"precision": precision, "recall": recall, "f1": f1}

# ✅ 9. 학습 인자 설정
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    save_strategy="no",
    report_to="none"
)

# ✅ 10. Trainer 생성 및 학습
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    compute_metrics=compute_metrics
)

trainer.train()
metrics = trainer.evaluate()
print(metrics)


FileNotFoundError: [Errno 2] No such file or directory: 'E:/KoBERT______________300__.csv'

In [None]:
# ✅ 1. 필수 라이브러리 설치 (처음 한 번만)
!pip install kobert-transformers transformers datasets seqeval --quiet

# ✅ 2. 전체 학습 코드
import os
os.environ["WANDB_DISABLED"] = "true"

import pandas as pd
import numpy as np
import ast
from datasets import Dataset
from transformers import BertForTokenClassification, Trainer, TrainingArguments
from kobert_transformers import get_tokenizer
from sklearn.metrics import precision_recall_fscore_support

# ✅ 3. CSV 경로에서 로드
df = pd.read_csv("KoBERT______________300__.csv")  # ✔️ 여기만 사용

# ✅ 4. 리스트 파싱
df["tokens"] = df["tokens"].apply(ast.literal_eval)
df["ner_tags"] = df["ner_tags"].apply(ast.literal_eval)
dataset = Dataset.from_pandas(df)

# ✅ 5. 라벨 정의
label_list = ["O", "B-PER", "B-AGE", "B-ORG", "B-DISEASE"]
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for i, l in enumerate(label_list)}

# ✅ 6. Tokenizer & Model
tokenizer = get_tokenizer()
model = BertForTokenClassification.from_pretrained("monologg/kobert", num_labels=len(label_list))

# ✅ 7. 토큰화 함수
def tokenize_and_align_labels(example):
    tokens = example["tokens"]
    labels = example["ner_tags"]
    encoding = tokenizer(tokens, is_split_into_words=True, padding="max_length", truncation=True, return_tensors="pt")
    input_len = encoding["input_ids"].shape[1]
    aligned_labels = labels[:input_len] + [-100] * (input_len - len(labels))
    return {
        "input_ids": encoding["input_ids"][0],
        "attention_mask": encoding["attention_mask"][0],
        "labels": [label2id.get(t, 0) if t != -100 else -100 for t in aligned_labels]
    }

tokenized_dataset = dataset.map(tokenize_and_align_labels)

# ✅ 8. 평가 함수
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=2)
    labels = p.label_ids
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_preds = [[id2label[p] for (p, l) in zip(pred, label) if l != -100] for pred, label in zip(preds, labels)]
    precision, recall, f1, _ = precision_recall_fscore_support(
        sum(true_labels, []), sum(true_preds, []), average="macro", zero_division=0
    )
    return {"precision": precision, "recall": recall, "f1": f1}

# ✅ 9. 학습 인자 설정
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    save_strategy="no",
    report_to="none"
)

# ✅ 10. Trainer 생성 및 학습
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    compute_metrics=compute_metrics
)

trainer.train()
metrics = trainer.evaluate()
print(metrics)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Step,Training Loss


{'eval_loss': 0.007920964621007442, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 8.0238, 'eval_samples_per_second': 37.389, 'eval_steps_per_second': 4.736, 'epoch': 5.0}


In [None]:
from google.colab import files
uploaded = files.upload()

Saving KoBERT______________300__.csv to KoBERT______________300__.csv
