In [1]:
import pandas as pd
import numpy as np
import os

import re
import emoji
from konlpy.tag import Okt

from sklearn.metrics import accuracy_score
from scipy.stats import entropy

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
import torch

import requests
import json
import ollama

from langchain_openai import ChatOpenAI
from langchain_ollama import ChatOllama
from langchain_core.prompts import PromptTemplate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Working directory for every relative path used below (e.g. ./tests/data, ./prediction_output).
# The original absolute path is kept as the default so existing behavior is unchanged;
# set the SNS_CATEGORIZER_ROOT environment variable to run the notebook on another machine.
PROJECT_ROOT = os.environ.get(
    "SNS_CATEGORIZER_ROOT",
    "C:/Users/flexmatch/Desktop/ssom/code/3.SNS-categorizer",
)
os.chdir(PROJECT_ROOT)

In [None]:
# from dotenv import load_dotenv
# load_dotenv('config/.env')

# token = os.getenv("HUGGINGFACE_TOKEN")

# model_name = "mistralai/Mistral-7B-v0.1"

# tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     token=token,
#     device_map="auto",  # automatic GPU/CPU placement
#     load_in_4bit=True,  # required in an 8GB environment
#     torch_dtype=torch.float16
# )

In [None]:
# Load the multi-column dataset (first CSV column is the index) and discard the
# 'label_list' column; then print a schema / null-count summary.
new = (
    pd.read_csv("./tests/data/final_fine-tuning_multi-columns_data.csv", index_col=0)
    .drop(columns=["label_list"])
)
new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39823 entries, 0 to 39822
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   acnt_sub_nm_cleaned  35282 non-null  object
 1   intro_txt_cleaned    36009 non-null  object
 2   text                 39823 non-null  object
 3   single_label         39823 non-null  object
 4   label_id             39823 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.8+ MB


In [None]:
# --- Hyperparameters and BERT model settings ---
# MODEL_NAME: base checkpoint the tokenizer is loaded from
# (or the path of a model you trained yourself, e.g. finetune-bert-kykim).
MODEL_NAME = "kykim/bert-kor-base"
# FINETUNED_BERT_MODEL_PATH: fine-tuned classifier weights
# (finetune-bert-kykim, an ensemble model, etc.).
FINETUNED_BERT_MODEL_PATH = "muli-columns-kykim-bert-kor"

# Category label list. Must be identical (including order) to the labels used when
# the BERT classifier was trained, because predicted indices are looked up in this list.
category_labels = [
    "IT",
    "게임",
    "결혼/연애",
    "교육",
    "다이어트/건강보조식품",
    "만화/애니/툰",
    "문구/완구",
    "미술/디자인",
    "반려동물",
    "베이비/키즈",
    "뷰티",
    "브랜드공식계정",
    "사진/영상",
    "셀럽",
    "스포츠",
    "시사",
    "엔터테인먼트",
    "여행/관광",
    "유명장소/핫플",
    "일상",
    "자동차/모빌리티",
    "짤/밈",
    "취미",
    "패션",
    "푸드",
    "홈/리빙",
]

# --- 1. Load the BERT tokenizer and model ---
# NOTE(review): the tokenizer comes from the base checkpoint while the weights come from
# the fine-tuned path; presumably both share one vocabulary - confirm.
bert_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
bert_model = AutoModelForSequenceClassification.from_pretrained(
    FINETUNED_BERT_MODEL_PATH, num_labels=len(category_labels)
)
bert_model.eval()  # switch to inference mode

def tokenize_three_columns(examples):
    """Tokenize a batch after merging its three text columns into one string per row.

    For every row, the values of ``acnt_sub_nm_cleaned``, ``intro_txt_cleaned`` and
    ``text`` are joined in that order, separated by the tokenizer's separator token
    surrounded by single spaces. The merged strings are then encoded with padding
    to ``max_length`` and truncation at 512 tokens.

    Args:
        examples: Batch mapping with the keys ``acnt_sub_nm_cleaned``,
            ``intro_txt_cleaned`` and ``text``, each an equally long sequence.

    Returns:
        Whatever ``bert_tokenizer`` returns for the merged strings.
    """
    sep = bert_tokenizer.sep_token
    rows = zip(
        examples["acnt_sub_nm_cleaned"],
        examples["intro_txt_cleaned"],
        examples["text"],
    )

    # NOTE(review): values are interpolated as-is, so a missing value is presumably
    # rendered as the literal text "None"/"nan" - confirm this matches how the
    # fine-tuning inputs were built before changing it.
    combined_texts = []
    for account_name, intro_text, body_text in rows:
        combined_texts.append(f"{account_name} {sep} {intro_text} {sep} {body_text}")

    encode_options = {"padding": "max_length", "truncation": True, "max_length": 512}
    return bert_tokenizer(combined_texts, **encode_options)

# Raw text columns are dropped once they have been tokenized.
columns_to_remove = ['acnt_sub_nm_cleaned', 'intro_txt_cleaned', 'text']

# DataFrame -> Dataset, tokenize in batches, then drop the raw text columns.
predict_dataset = (
    Dataset.from_pandas(new)
    .map(tokenize_three_columns, batched=True)
    .remove_columns(columns_to_remove)
)
# Format only input_ids and attention_mask as torch tensors.
predict_dataset.set_format(type="torch", columns=['input_ids', 'attention_mask'])

Map: 100%|██████████| 39823/39823 [00:08<00:00, 4514.16 examples/s]


In [None]:
# Independent (deep) copy, so prediction columns added later do not modify `new`.
predict_df = new.copy(deep=True)

In [None]:
# TrainingArguments / Trainer configured for prediction only (no training step).
prediction_args = TrainingArguments(
    output_dir="./prediction_output",
    per_device_eval_batch_size=16,
    do_train=False,
    do_predict=True,
    report_to="none",
    disable_tqdm=False,
)
trainer = Trainer(args=prediction_args, model=bert_model)

# Run prediction: raw logits -> softmax probabilities over the last axis.
predictions_output = trainer.predict(predict_dataset)
logits = predictions_output.predictions
probabilities = torch.tensor(logits).softmax(dim=-1).numpy()
predicted_class_indices = np.argmax(logits, axis=-1)

# Attach the results to the DataFrame: full probability vector, top class index,
# its label name from category_labels, and the top probability.
predict_df['bert_probabilities'] = probabilities.tolist()
predict_df['bert_top_label_idx'] = probabilities.argmax(axis=-1)
predict_df['bert_top_label'] = [
    category_labels[label_idx]
    for label_idx in predict_df['bert_top_label_idx'].tolist()
]
predict_df['bert_top_prob'] = probabilities.max(axis=-1)

In [None]:
# Fraction of rows whose top predicted class index equals the stored label_id.
# NOTE(review): the input file name suggests this is the fine-tuning data itself, so this
# may be a training-set score rather than a held-out estimate - confirm.
bert_accuracy = accuracy_score(
    y_true=predict_df['label_id'],
    y_pred=predict_df['bert_top_label_idx'],
)
print(bert_accuracy)

0.9157522035004897
