In [1]:
# Write the output of `pip freeze` to requirements.txt.
# Uses the explicit %pip magic instead of relying on IPython automagic, which
# can be switched off and is shadowed by any variable named `pip`.
%pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [4]:
# 필요 라이브러리 로드
import os
import time, sys, warnings

import pandas as pd  # 데이터 처리
import torch  # 텐서 계산
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader  # 데이터 로딩
from transformers import AutoTokenizer, AutoModelForSequenceClassification  # 트랜스포머 모델

# 사용자 정의 모듈 로드
from common import *
from func_collection import *

# 경고 메시지 무시 설정
warnings.filterwarnings('ignore')

# 데이터셋 클래스 정의
class TextDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with labels.

    Args:
        encodings: Mapping from feature name to an indexable collection of
            per-sample values (indexed as ``encodings[name][idx]``).
        labels: Indexable collection with one label per sample; its length
            defines the dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label collection determines how many samples exist.
        return len(self.labels)

    def __getitem__(self, idx):
        # Assemble one sample: every encoded feature first, then the label,
        # each converted to a tensor.
        sample = {}
        for feature_name, feature_values in self.encodings.items():
            sample[feature_name] = torch.tensor(feature_values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Directory under the current working directory that holds the text-model files.
TEXT_MODEL_FILES = os.path.join(os.getcwd(), 'models_text')

# Load the standard-word table and build the lookup used for representative-word
# replacement: each 'INP_DTLS' value maps to the 'RPSN_DTLS' value of the same row.
standard_word_set = pd_read_json(os.path.join(TEXT_MODEL_FILES, 'TB_MMA_CNV_DIC_M.json'))
word_dict = {
    input_word: representative_word
    for input_word, representative_word in zip(
        standard_word_set['INP_DTLS'], standard_word_set['RPSN_DTLS']
    )
}

# Load the label table and discard its 'Unnamed: 0' column.
labels_df = pd.read_excel(
    os.path.join(TEXT_MODEL_FILES, 'labels_task1.xlsx')
).drop('Unnamed: 0', axis=1)

# Device selection: use the first CUDA GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    torch.cuda.set_device(0)
    device = torch.device('cuda:0')
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device('cpu')
    print("Using CPU")

# Load the serialized tokenizer and model once, whichever device was chosen.
# map_location places the model's tensors on the selected device even when the
# checkpoint was saved from a different one.
# SECURITY: torch.load unpickles these files, and unpickling can execute
# arbitrary code -- only load artifacts from a trusted source.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects fully pickled objects like these; confirm the installed version.
tokenizer = torch.load(os.path.join(TEXT_MODEL_FILES, 'dev_dev_lh_text_clf_tok_20240625_thi.pt'))
model = torch.load(
    os.path.join(TEXT_MODEL_FILES, 'dev_dev_lh_text_clf_20240625_thi.pt'),
    map_location=device,
)

Using CPU


In [27]:
# 텍스트 예측 함수 정의
def txt_prd_func(model, flw_cts, word_dict, tokenizer, labels_df, num):
    """Classify one text and return its highest-probability labels.

    The text is passed through ``preprocess_text`` (with 'stopwords.txt' under
    ``TEXT_MODEL_FILES``) and ``replace_words`` (with ``word_dict``), tokenized,
    and scored by ``model``. The softmax of the logits is ranked and the best
    entries are looked up in ``labels_df``.

    Args:
        model: Callable sequence-classification model; must support ``.to()``,
            ``.eval()`` and return an object with a ``logits`` attribute.
        flw_cts: Raw input text to classify.
        word_dict: Mapping handed to ``replace_words``.
        tokenizer: Callable returning a mapping with 'input_ids' and
            'attention_mask' for a list of texts.
        labels_df: DataFrame with 'tp_cd' and 'labels' columns, looked up by
            predicted class index via ``.loc``.
        num: Number of predictions requested. Values larger than the number
            of classes are clamped instead of raising.

    Returns:
        dict: ``{"result": [...], "status": "ok"}`` where each result entry
        holds the code, name and rounded probability of one prediction.

    Side effects:
        Moves ``model`` to the selected device and prints the elapsed time.
    """
    start_time = time.perf_counter()

    def _ordinal_suffix(n):
        # English ordinal suffix for n: 11-13 always take 'th'.
        if 11 <= n % 100 <= 13:
            return 'th'
        return {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')

    # Pre-process the input text, then replace words using word_dict.
    pre_input_text = preprocess_text(TEXT_MODEL_FILES, 'stopwords.txt', flw_cts)
    stan_input_text = replace_words(pre_input_text, 1, word_dict)

    # Tokenize the single text; the encodings form a batch of size one.
    encodings = tokenizer([stan_input_text], truncation=True, padding=True)

    # Run on the first GPU when available, otherwise on the CPU.
    device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
    input_ids = torch.tensor(encodings['input_ids']).to(device)
    attention_mask = torch.tensor(encodings['attention_mask']).to(device)

    # Inference only: evaluation mode, no gradient tracking.
    model.to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        probabilities = F.softmax(outputs.logits, dim=1)

    # torch.topk raises when k exceeds the class count, so clamp the request.
    top_k = min(num, probabilities.shape[1])
    prob, indices = torch.topk(probabilities, top_k)
    top_probs = prob.tolist()[0]
    top_indices = indices.tolist()[0]

    # Build the response entries in rank order.
    # NOTE(review): the keys come out as e.g. 'st_tp_tp_cd' / 'th_tp_tp_cd'
    # (doubled '_tp', no rank number). They are kept byte-identical because
    # they are the response schema; confirm with consumers before renaming.
    result = []
    for rank, (label_index, label_prob) in enumerate(zip(top_indices, top_probs)):
        prefix = f"{_ordinal_suffix(rank + 1)}_tp"
        result.append({
            f"{prefix}_tp_cd": labels_df['tp_cd'].loc[label_index],
            f"{prefix}_tp_nm": labels_df['labels'].loc[label_index],
            f"{prefix}_tp_prb": round(label_prob, 4),
        })
    rspns = {"result": result, "status": "ok"}

    end_time = time.perf_counter()
    print("응답시간: ", end_time - start_time)  # elapsed seconds for this call

    return rspns

In [30]:
# Sample input text; request the top 51 predictions (the recorded output for
# this cell shows 51 logits).
flw_cts = '상부장이 흔들려서 문이 잘 안닫혀요'
txt_prd_func(
    model=model,
    flw_cts=flw_cts,
    word_dict=word_dict,
    tokenizer=tokenizer,
    labels_df=labels_df,
    num=51,
)

indices 1 SequenceClassifierOutput(loss=None, logits=tensor([[ 1.1367,  1.6419,  8.7133, -1.4026,  0.9806,  0.1393,  0.9250, 10.2871,
         -0.7516, -0.4098,  0.1448, -2.3899, -1.8774, -0.7302, -0.2367, -1.3805,
         -0.5325,  1.0246, -1.0576, -0.1312,  1.7065,  0.2016, -2.8904, -2.0111,
         -1.8138, -1.0579, -2.4571, -0.0890, -2.8503, -2.4373, -1.1945, -2.3927,
         -2.3696, -1.6583,  0.2017, -1.4041, -2.7171, -1.9852, -2.5601, -2.5393,
         -1.9778, -3.2035, -1.8429, -2.3358, -2.3097, -0.3879, -1.5173, -3.4042,
         -3.2906,  0.8751, -2.8683]]), hidden_states=None, attentions=None) le 1
응답시간:  0.2009413242340088


{'result': [{'st_tp_tp_cd': 1, 'st_tp_tp_nm': '개폐불량', 'st_tp_tp_prb': 0.8274},
  {'nd_tp_tp_cd': 7, 'nd_tp_tp_nm': '고정불량', 'nd_tp_tp_prb': 0.1715},
  {'rd_tp_tp_cd': 65, 'rd_tp_tp_nm': '잠금불량', 'rd_tp_tp_prb': 0.0002},
  {'th_tp_tp_cd': 67, 'th_tp_tp_nm': '점검/교체', 'th_tp_tp_prb': 0.0001},
  {'th_tp_tp_cd': 64, 'th_tp_tp_nm': '작동불량', 'th_tp_tp_prb': 0.0001},
  {'th_tp_tp_cd': 61, 'th_tp_tp_nm': '이격', 'th_tp_tp_prb': 0.0001},
  {'th_tp_tp_cd': 84, 'th_tp_tp_nm': '흠집', 'th_tp_tp_prb': 0.0001},
  {'th_tp_tp_cd': 57, 'th_tp_tp_nm': '오염', 'th_tp_tp_prb': 0.0001},
  {'th_tp_tp_cd': 38, 'th_tp_tp_nm': '변형', 'th_tp_tp_prb': 0.0001},
  {'th_tp_tp_cd': 14, 'th_tp_tp_nm': '누락', 'th_tp_tp_prb': 0.0},
  {'th_tp_tp_cd': 99, 'th_tp_tp_nm': '손잡이불량', 'th_tp_tp_prb': 0.0},
  {'th_tp_tp_cd': 81, 'th_tp_tp_nm': '파손', 'th_tp_tp_prb': 0.0},
  {'th_tp_tp_cd': 26, 'th_tp_tp_nm': '들뜸', 'th_tp_tp_prb': 0.0},
  {'th_tp_tp_cd': 19, 'th_tp_tp_nm': '단차', 'th_tp_tp_prb': 0.0},
  {'th_tp_tp_cd': 34, 'th_tp_tp_nm': '배수불

'ok'

In [None]:
# Second sample text. txt_prd_func takes six positional arguments, so the
# model, dictionary, tokenizer and label table must be passed along with the
# text; the top 3 predictions are requested here.
txt_prd_func(model, '수전에서 물이새요', word_dict, tokenizer, labels_df, 3)