In [None]:
from src.dataset import MakeDataset
from src.model import MakeEmbed
from torch.utils.data import DataLoader


dataset = MakeDataset()
embed_dataset = dataset.make_embed_dataset()

embed = MakeEmbed()
embed.word2vec_init()
embed.word2vec.build_vocab(embed_dataset)
embed.word2vec_train(embed_dataset, epoch=1000)

In [None]:
import pandas as pd
train_data = pd.read_csv("./cafe_intent_data.csv")
train_data.isnull().values.any()
len(train_data)
print(embed.word2vec.wv.vectors.shape)
print(embed.word2vec.wv.vector_size)

In [None]:
from src.dataset import MakeDataset
from src.model import MakeEmbed
from torch.utils.data import DataLoader

# 데이터셋 만들기
dataset = MakeDataset()

# 임베딩 모델 불러오기
embed = MakeEmbed()
embed.load_word2vec()

intent_train_dataset, intent_test_dataset = dataset.make_intent_dataset(embed)

batch_size = 128

train_dataloader = DataLoader(intent_train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(intent_test_dataset, batch_size=batch_size, shuffle=True)
for X, y in train_dataloader:
    print(X)
    print(y)
    break

In [None]:
import torch
from src.model import textCNN

# 텍스트 CNN 모델 만들기
weights = embed.word2vec.wv.vectors
weights = torch.FloatTensor(weights)
print(weights.shape)
num_class = len(dataset.intent_label)
print(weights.shape, num_class)
textcnn_model = textCNN(weights, 256, [3,4,5], 0.5, num_class)
optimizer = torch.optim.Adam(textcnn_model.parameters(), lr=0.01)

In [None]:
textcnn_model

In [None]:
print(textcnn_model.embed.weight.shape)
print(textcnn_model.embed.weight[:].data.shape)
print(textcnn_model.w2v.shape)
print(textcnn_model.vocab_size)
print(textcnn_model.emb_dim)
print(textcnn_model.embed)
for X,y in train_dataloader:
    print(X.shape)
    print(y.shape)
    break

In [None]:
!pip install tqdm
from tqdm import tqdm
from tqdm import trange
import os
import torch.nn.functional as F

epoch = 100
prev_acc = 0
save_dir = "./nlp/pretrained/"
save_prefix = "cafe_intent_clsf"

def save(model, save_dir, save_prefix, epoch):
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    save_prefix = os.path.join(save_dir, save_prefix)
    save_path = '{}_steps_{}.pt'.format(save_prefix, epoch)
    torch.save(model.state_dict(), save_path)

for i in range(epoch):
    steps = 0

    textcnn_model.train() # 모델 학습 하겠다. (parameters가 수정됨)

    with tqdm(train_dataloader, unit="batch") as tepoch: # 진행상황 표시
        for data in tepoch:
            tepoch.set_description(f"Epoch {i}")
            x = data[0]
            target = data[1]
            logit = textcnn_model.forward(x)

            optimizer.zero_grad()
            loss = F.cross_entropy(logit, target) 
            loss.backward()
            optimizer.step()

            corrects = (torch.max(logit, 1)[1].view(target.size()).data == target.data).sum()
            accuracy = 100.0 * corrects/x.size()[0]
            tepoch.set_postfix(loss=loss.item(), accuracy= accuracy.numpy())

    textcnn_model.eval() # 모델 검증하겠다 (parameters 수정안됨)
    steps = 0
    accuarcy_list = []
    with tqdm(test_dataloader, unit="batch") as tepoch:
        for data in tepoch:
            tepoch.set_description(f"Epoch {i}")
            x = data[0]
            target = data[1]

            logit = textcnn_model.forward(x)
            loss = F.cross_entropy(logit, target)
            corrects = (torch.max(logit, 1)[1].view(target.size()).data == target.data).sum()
            accuracy = 100.0 * corrects/x.size()[0]
            accuarcy_list.append(accuracy.tolist())

            tepoch.set_postfix(loss=loss.item(), accuracy= sum(accuarcy_list)/len(accuarcy_list))

    # epoch 당 검증 셋의 정확도를 계산하고 이전 정확도 보다 높으면 저장
    acc = sum(accuarcy_list)/len(accuarcy_list)
    if(acc>prev_acc):
        prev_acc = acc
        save(textcnn_model, save_dir, save_prefix+"_"+str(round(acc,3)), i)

In [2]:
import torch
import re
from src.NLU import NaturalLanguageUnderstanding
from src.NLG import NaturalLanguageGenerator

text = "안녕하세요"

nlg = NaturalLanguageGenerator()
nlg.run_nlg(text)

intent: ood
predict: [[0]]


TypeError: cos_sim() takes 2 positional arguments but 3 were given

In [11]:
from src.model import SBERT
from sentence_transformers import SentenceTransformer

# class SBERT():
#     def __init__(self):
#         self.ood_model_pretrain_path = "./nlp/pretrained/ood_answer_pretrained_model"
#         self.ood_model = SentenceTransformer(self.ood_model_pretrain_path)
#         self.train_data = pd.read_csv('cafe_ood_answer_data.csv')

#     def cos_sim(self, A, B):
#         return dot(A, B) / (norm(A) * norm(B))

#     def return_answer(self, question):
#         embedding = self.ood_model.encode(question)
#         self.train_data['score'] = self.train_data.apply(lambda x: self.cos_sim(x['embedding'], embedding), axis=1)
#         return self.train_data.loc[self.train_data['score'].idxmax()]['A']
    
text = "안녕하세요"
ood_answer = SBERT()
embedding = ood_answer.ood_model.encode(text)
print(embedding)
# result = ood_answer.return_answer(text)
# print(result)



[-2.28155255e-01  1.19632082e-02  1.06543672e+00 -4.24747616e-02
  8.74028578e-02 -1.58011213e-01  5.06849468e-01  5.05672634e-01
 -5.03933668e-01 -9.99585614e-02 -4.53358680e-01  1.60735592e-01
 -5.11951268e-01 -9.05622318e-02  6.59945756e-02  1.53185725e-01
 -3.81028444e-01 -4.44019586e-01  4.68312293e-01 -1.75748348e-01
 -8.20154190e-01 -1.83119059e-01  2.00009868e-01 -2.25576818e-01
 -4.63023752e-01 -6.02596700e-01  4.91864204e-01 -7.72864044e-01
  3.80416393e-01 -2.52616256e-01 -2.89161634e-02 -6.63760975e-02
  2.37034187e-01 -2.32946053e-01 -4.01798159e-01  1.67035744e-01
 -7.68121719e-01 -2.38607470e-02 -1.40064970e-01 -1.62645400e-01
  1.04701269e+00 -2.87956834e-01  7.74595916e-01  2.88067490e-01
 -7.74425983e-01 -4.31935906e-01 -4.50164795e-01  5.82178056e-01
 -1.85537502e-01 -2.69602895e-01 -5.90912879e-01 -3.80476445e-01
 -3.28489989e-01  4.70188469e-01  8.92082676e-02  9.21492353e-02
 -4.60205674e-01 -3.03531796e-01  3.68865103e-01  5.84823489e-01
  1.00241315e+00 -7.32150