In [None]:
from src.dataset import MakeDataset
from src.model import MakeEmbed
from torch.utils.data import DataLoader


# Build the corpus used to train the word embedding.
dataset = MakeDataset()
embed_dataset = dataset.make_embed_dataset()

# Initialise the word2vec model, build its vocabulary from the corpus, then train it.
# NOTE(review): the call order (init -> build_vocab -> train) is kept as-is; the
# helper semantics live in src.model.MakeEmbed and are not visible from this notebook.
embed = MakeEmbed()
embed.word2vec_init()
embed.word2vec.build_vocab(embed_dataset)
embed.word2vec_train(embed_dataset, epoch=1000)

In [None]:
import pandas as pd
# Sanity-check the raw intent CSV and the embedding held by `embed`.
train_data = pd.read_csv("./cafe_intent_data.csv")

# Only the last expression of a cell is displayed automatically, so these
# checks are printed explicitly; as bare expressions their values were discarded.
print("has missing values:", train_data.isnull().values.any())
print("number of rows:", len(train_data))

# Shape of the word-vector array and the size of a single vector.
print(embed.word2vec.wv.vectors.shape)
print(embed.word2vec.wv.vector_size)

In [None]:
from src.dataset import MakeDataset
from src.model import MakeEmbed
from torch.utils.data import DataLoader

# Create the dataset helper
dataset = MakeDataset()

# Load the embedding model
embed = MakeEmbed()
embed.load_word2vec()

intent_train_dataset, intent_test_dataset = dataset.make_intent_dataset(embed)

batch_size = 128

# Training batches are reshuffled on every pass.
train_dataloader = DataLoader(intent_train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation keeps a fixed order: shuffling the test set changes which samples
# land in the (possibly smaller) last batch, which makes per-batch metrics
# differ from run to run for no benefit.
test_dataloader = DataLoader(intent_test_dataset, batch_size=batch_size, shuffle=False)

# Peek at a single training batch to verify what the loader yields.
for X, y in train_dataloader:
    print(X)
    print(y)
    break

In [None]:
import torch
from src.model import textCNN

# Build the text-CNN model from the word2vec vectors held by `embed`.
weights = torch.FloatTensor(embed.word2vec.wv.vectors)
print(weights.shape)

# One output class per entry in the dataset's intent label collection.
num_class = len(dataset.intent_label)
print(weights.shape, num_class)

# NOTE(review): the meaning of the positional arguments 256, [3, 4, 5] and 0.5
# is defined by src.model.textCNN (presumably channel count, kernel sizes and
# dropout rate) -- confirm against that class before tuning them.
textcnn_model = textCNN(
    weights,
    256,
    [3, 4, 5],
    0.5,
    num_class,
)
optimizer = torch.optim.Adam(params=textcnn_model.parameters(), lr=0.01)

In [None]:
# Display the model's repr as the cell output.
textcnn_model

In [None]:
# Print a few attributes of the model's embedding layer, one per line.
model_facts = (
    textcnn_model.embed.weight.shape,
    textcnn_model.embed.weight[:].data.shape,
    textcnn_model.w2v.shape,
    textcnn_model.vocab_size,
    textcnn_model.emb_dim,
    textcnn_model.embed,
)
for fact in model_facts:
    print(fact)

# Show the tensor shapes of the first training batch only.
for X, y in train_dataloader:
    print(X.shape, y.shape, sep="\n")
    break

In [None]:
!pip install tqdm
from tqdm import tqdm
from tqdm import trange
import os
import torch.nn.functional as F

# Number of passes of the training loop below.
epoch = 100
# Best validation accuracy seen so far; a checkpoint is written only when it is beaten.
prev_acc = 0
# Directory and file-name prefix handed to save() for checkpoints.
save_dir = "./nlp/pretrained/"
save_prefix = "cafe_intent_clsf"

def save(model, save_dir, save_prefix, epoch):
    """Write ``model.state_dict()`` to ``<save_dir>/<save_prefix>_steps_<epoch>.pt``.

    Args:
        model: object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).
        save_dir: directory for the checkpoint; created (with parents) if missing.
        save_prefix: file-name prefix of the checkpoint.
        epoch: value embedded in the file name after ``_steps_``.

    Returns:
        The path of the file that was written.
    """
    # exist_ok=True replaces the separate isdir() check, so there is no window
    # between "check" and "create" in which the call could fail.
    os.makedirs(save_dir, exist_ok=True)
    save_path = '{}_steps_{}.pt'.format(os.path.join(save_dir, save_prefix), epoch)
    torch.save(model.state_dict(), save_path)
    return save_path

# Train for `epoch` passes; after each pass evaluate on the test loader and
# checkpoint the model whenever the validation accuracy improves.
for i in range(epoch):
    # ---- training pass ----
    textcnn_model.train()  # switch the model to training mode

    with tqdm(train_dataloader, unit="batch") as tepoch:  # progress bar
        tepoch.set_description(f"Epoch {i}")
        for x, target in tepoch:
            # Call the module itself rather than .forward() so registered hooks run.
            logit = textcnn_model(x)

            optimizer.zero_grad()
            loss = F.cross_entropy(logit, target)
            loss.backward()
            optimizer.step()

            corrects = (logit.argmax(dim=1).view(target.size()) == target).sum().item()
            accuracy = 100.0 * corrects / x.size(0)
            tepoch.set_postfix(loss=loss.item(), accuracy=accuracy)

    # ---- validation pass ----
    # eval() only switches layer behaviour; no optimizer step is taken here, and
    # no_grad() additionally skips building the autograd graph.
    textcnn_model.eval()
    correct_total = 0
    sample_total = 0
    with torch.no_grad(), tqdm(test_dataloader, unit="batch") as tepoch:
        tepoch.set_description(f"Epoch {i}")
        for x, target in tepoch:
            logit = textcnn_model(x)
            loss = F.cross_entropy(logit, target)

            correct_total += (logit.argmax(dim=1).view(target.size()) == target).sum().item()
            sample_total += x.size(0)

            tepoch.set_postfix(loss=loss.item(), accuracy=100.0 * correct_total / sample_total)

    # Accuracy over all validation samples. Averaging per-batch accuracies would
    # over-weight a short final batch. An empty loader yields 0.0 instead of a
    # ZeroDivisionError.
    acc = 100.0 * correct_total / sample_total if sample_total else 0.0

    # Save a checkpoint only when this epoch beats the best accuracy so far.
    if acc > prev_acc:
        prev_acc = acc
        save(textcnn_model, save_dir, save_prefix + "_" + str(round(acc, 3)), i)

In [3]:
import torch
import re
from src.NLU import NaturalLanguageUnderstanding
from src.NLG import NaturalLanguageGenerator

# Characters stripped from the utterance before it is handed to the NLG.
# Written as a raw string so the regex escapes are not (invalid) Python string
# escapes; the character set is the same as before. Note that the backslash
# itself is not part of the set.
SPECIAL_CHARS = re.compile(r'[-=+,#/\?:^$.@*"※~&%ㆍ!』‘|\(\)\[\]\<\>`\'…》]')

# Literal (old, new) rewrites applied in this order after the special
# characters are removed.
TEXT_REPLACEMENTS = (
    ('만 ', ' '),
    ('쥬스', '주스'),
    ('티라미수', '티라미슈'),
    ('티라미스', '티라미슈'),
    ('마키아토', '마끼아또'),
    ('캐러멜', '카라멜'),
)


def normalize_order_text(raw_text):
    """Return `raw_text` with special characters removed and spellings rewritten.

    Args:
        raw_text: the utterance string to clean.

    Returns:
        The cleaned string, after removing every SPECIAL_CHARS match and
        applying each TEXT_REPLACEMENTS pair in order.
    """
    cleaned = SPECIAL_CHARS.sub('', raw_text)
    for old, new in TEXT_REPLACEMENTS:
        # The patterns contain no regex metacharacters, so str.replace is equivalent.
        cleaned = cleaned.replace(old, new)
    return cleaned


text = normalize_order_text("밀크티 다섯잔만 주세요")
print(text)

nlg = NaturalLanguageGenerator()
nlg.run_nlg(text)

밀크티 다섯잔 주세요
intent: 일반주문
predict: [[19, 22, 0]]
nlu_result: {'INTENT': '일반주문', 'SLOT': ['TEA^밀크티', 'COUNT^다섯잔']}
templates: []
result: []


[]