In [1]:
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/v0.6.0/scripts/mecab.sh)

mecab-ko is already installed
mecab-ko-dic is already installed
mecab-python is already installed
Done.


In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from konlpy.tag import Mecab
import re
import os
import io
import time
import random

mecab = Mecab()
from sklearn.model_selection import train_test_split

print(tf.__version__)

2.4.1


In [3]:
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip',
    origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

path_to_file = os.path.dirname(path_to_zip)+"/spa-eng/spa.txt"

with open(path_to_file, "r") as f:
    corpus = f.read().splitlines()

print("Data Size:", len(corpus))
print("Example:")

for sen in corpus[0:100][::20]: print(">>", sen)

Data Size: 118964
Example:
>> Go.	Ve.
>> Wait.	Esperen.
>> Hug me.	Abrázame.
>> No way!	¡Ni cagando!
>> Call me.	Llamame.


In [4]:
def generate_tokenizer(corpus,
                       vocab_size,
                       lang="spa-eng",
                       pad_id=0,   # pad token의 일련번호
                       bos_id=1,  # 문장의 시작을 의미하는 bos token(<s>)의 일련번호
                       eos_id=2,  # 문장의 끝을 의미하는 eos token(</s>)의 일련번호
                       unk_id=3):   # unk token의 일련번호
    file = "./%s_corpus.txt" % lang
    model = "%s_spm" % lang

    with open(file, 'w') as f:
        for row in corpus: f.write(str(row) + '\n')

    import sentencepiece as spm
    spm.SentencePieceTrainer.Train(
        '--input=./%s --model_prefix=%s --vocab_size=%d'\
        % (file, model, vocab_size) + \
        '--pad_id==%d --bos_id=%d --eos_id=%d --unk_id=%d'\
        % (pad_id, bos_id, eos_id, unk_id)
    )

    tokenizer = spm.SentencePieceProcessor()
    tokenizer.Load('%s.model' % model)

    return tokenizer
print("슝=3")

슝=3


In [5]:
cleaned_corpus = list(set(corpus)) 

VOCAB_SIZE = 20000
tokenizer = generate_tokenizer(cleaned_corpus, VOCAB_SIZE)
tokenizer.set_encode_extra_options("bos:eos")  # 문장 양 끝에 <s> , </s> 추가

True

In [6]:
def preprocess_sentence(sentence):
    sentence = sentence.lower()

    sentence = re.sub(r"([?.!,¿¡])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Z?.!,¿¡]+", " ", sentence)

    sentence = sentence.strip()
    
    return sentence
print("슝=3")

슝=3


In [7]:
from tqdm import tqdm_notebook    # Process 과정을 보기 위해

src_corpus = []
tgt_corpus = []

for pair in tqdm_notebook(cleaned_corpus):
    src, tgt = pair.split('\t')

    src_tokens = tokenizer.encode_as_ids(preprocess_sentence(src))   # encode_ad_ids() 는 문자열을 숫자로 분할합니다.
    tgt_tokens = tokenizer.encode_as_ids(preprocess_sentence(tgt))

    if (len(src_tokens) > 50): continue
    if (len(tgt_tokens) > 50): continue
    
    src_corpus.append(src_tokens)
    tgt_corpus.append(tgt_tokens)

len(src_corpus)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/118964 [00:00<?, ?it/s]

118951

In [8]:
enc_tensor = tf.keras.preprocessing.sequence.pad_sequences(src_corpus, padding='post')
dec_tensor = tf.keras.preprocessing.sequence.pad_sequences(tgt_corpus, padding='post')

enc_train, enc_val, dec_train, dec_val = \
train_test_split(enc_tensor, dec_tensor, test_size=0.01)

print("enc_train :", len(enc_train), "enc_val :", len(enc_val))
print("dec_train :", len(dec_train), "dec_val :",len(dec_val))

enc_train : 117761 enc_val : 1190
dec_train : 117761 dec_val : 1190


In [9]:
!ls aiffel/CloudData/nn


transformer.py


In [24]:
from aiffel.CloudData.nn.transformer import Transformer, loss_function



transformer = Transformer(n_layers=2, d_model=512, n_heads=8, dff=2048,
                          src_vocab_size=20000, tgt_vocab_size=20000,
                          pos_len=100, loss_function=loss_function, dropout=0.3, shared=True)

In [25]:
history = transformer.fit(1,enc_train, dec_train, enc_val, dec_val, 256)

Epoch  1: 100%|██████████| 461/461 [10:09<00:00,  1.32s/it, Loss 3.2357]
Val_epoch  1: 100%|██████████| 3/3 [00:02<00:00,  1.47it/s, Val_loss 2.0753]


In [28]:
model_dir = "aiffel/CloudData/Model/transformer/weight"

model_path = os.path.join(model_dir, "2layer3Epoch.h5")
transformer.save_weights(model_path)

# checkpoint_dir = "/aiffel/CloudData/Model/transformer/weight"

# checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# checkpoint = tf.train.Checkpoint(transformer=transformer)

# checkpoint.save(file_prefix=checkpoint_prefix)

# latest = tf.train.latest_checkpoint(checkpoint_dir)
# checkpoint.restore(latest)
# transformer = checkpoint.transformer

In [11]:

from aiffel.CloudData.nn.transformer import Transformer, loss_function



transformer = Transformer(n_layers=2, d_model=512, n_heads=8, dff=1024,
                          src_vocab_size=20000, tgt_vocab_size=20000,
                          pos_len=100, loss_function=loss_function, dropout=0.3, shared=True)
transformer.fit(1,enc_val,dec_val)
model_dir = "aiffel/CloudData/Model/transformer/weight"

model_path = os.path.join(model_dir, "2layer3Epoch.h5")
transformer.load_weights(model_path)

Epoch  1: 100%|██████████| 10/10 [00:14<00:00,  1.44s/it, Loss 10.0983]


In [12]:
# !pip install nltk # nltk가 설치되어 있지 않은 경우 주석 해제
from nltk.translate.bleu_score import sentence_bleu

reference = "많 은 자연어 처리 연구자 들 이 트랜스포머 를 선호 한다".split()
candidate = "적 은 자연어 학 개발자 들 가 트랜스포머 을 선호 한다 요".split()

print("원문:", reference)
print("번역문:", candidate)
print("BLEU Score:", sentence_bleu([reference], candidate))

원문: ['많', '은', '자연어', '처리', '연구자', '들', '이', '트랜스포머', '를', '선호', '한다']
번역문: ['적', '은', '자연어', '학', '개발자', '들', '가', '트랜스포머', '을', '선호', '한다', '요']
BLEU Score: 8.190757052088229e-155


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [13]:
print("1-gram:", sentence_bleu([reference], candidate, weights=[1, 0, 0, 0]))
print("2-gram:", sentence_bleu([reference], candidate, weights=[0, 1, 0, 0]))
print("3-gram:", sentence_bleu([reference], candidate, weights=[0, 0, 1, 0]))
print("4-gram:", sentence_bleu([reference], candidate, weights=[0, 0, 0, 1]))

1-gram: 0.5
2-gram: 0.18181818181818182
3-gram: 2.2250738585072626e-308
4-gram: 2.2250738585072626e-308


### 자세히 보니, 3개 이상 연속으로 맞춘것이 없었군요.
### 조금 덜 엄격하게 바꿔보겠습니다.

In [14]:
from nltk.translate.bleu_score import SmoothingFunction

def calculate_bleu(reference, candidate, weights=[0.25, 0.25, 0.25, 0.25]):
    return sentence_bleu([reference],
                         candidate,
                         weights=weights,
                         smoothing_function=SmoothingFunction().method1)  # smoothing_function 적용

print("BLEU-1:", calculate_bleu(reference, candidate, weights=[1, 0, 0, 0]))
print("BLEU-2:", calculate_bleu(reference, candidate, weights=[0, 1, 0, 0]))
print("BLEU-3:", calculate_bleu(reference, candidate, weights=[0, 0, 1, 0]))
print("BLEU-4:", calculate_bleu(reference, candidate, weights=[0, 0, 0, 1]))

print("\nBLEU-Total:", calculate_bleu(reference, candidate))

BLEU-1: 0.5
BLEU-2: 0.18181818181818182
BLEU-3: 0.010000000000000004
BLEU-4: 0.011111111111111112

BLEU-Total: 0.05637560315259291


In [15]:
def generate_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def generate_causality_mask(src_len, tgt_len):
    mask = 1 - np.cumsum(np.eye(src_len, tgt_len), 0)
    return tf.cast(mask, tf.float32)

def generate_masks(src, tgt):
    enc_mask = generate_padding_mask(src)
    dec_mask = generate_padding_mask(tgt)

    dec_causality_mask = generate_causality_mask(tgt.shape[1], tgt.shape[1])
    dec_mask = tf.maximum(dec_mask, dec_causality_mask)

    dec_enc_causality_mask = generate_causality_mask(tgt.shape[1], src.shape[1])
    dec_enc_mask = tf.maximum(enc_mask, dec_enc_causality_mask)

    return enc_mask, dec_enc_mask, dec_mask

In [16]:
# def evaluate(sentence, model, src_tokenizer, tgt_tokenizer):
#     sentence = preprocess_sentence(sentence)

#     pieces = src_tokenizer.encode_as_pieces(sentence)  # 문자열을 token으로 분할합니다. 
#     tokens = src_tokenizer.encode_as_ids(sentence)  # 문자열을 숫자로 분할합니다.

#     _input = tf.keras.preprocessing.sequence.pad_sequences([tokens],
#                                                            maxlen=enc_train.shape[-1],
#                                                            padding='post')
    
#     ids = []
#     output = tf.expand_dims([tgt_tokenizer.bos_id()], 0)   
#     for i in range(dec_train.shape[-1]):
#         enc_padding_mask, combined_mask, dec_padding_mask = \
#         generate_masks(_input, output)

#         predictions, enc_attns, dec_attns, dec_enc_attns =\
#         model(_input, 
#               output,
#               enc_padding_mask,
#               combined_mask,
#               dec_padding_mask)

#         predicted_id = \
#         tf.argmax(tf.math.softmax(predictions, axis=-1)[0, -1]).numpy().item()  # predictions에 소프트맥스 함수를 적용하여 가장 큰 값의 인덱스를 predicted_id로 저장합니다.

#         if tgt_tokenizer.eos_id() == predicted_id:
#             result = tgt_tokenizer.decode_ids(ids)  # 숫자를 문자열로 복원합니다.  
#             return pieces, result, enc_attns, dec_attns, dec_enc_attns

#         ids.append(predicted_id)
#         output = tf.concat([output, tf.expand_dims([predicted_id], 0)], axis=-1)

#     result = tgt_tokenizer.decode_ids(ids)  
#     return pieces, result, enc_attns, dec_attns, dec_enc_attns

# print("슝=3")

# def translate(sentence, model, src_tokenizer, tgt_tokenizer):
#     pieces, result, enc_attns, dec_attns, dec_enc_attns = \
#     evaluate(sentence, model, src_tokenizer, tgt_tokenizer)

#     return result
# print("슝=3")
# def eval_bleu(src_corpus, tgt_corpus, verbose=True):
#     total_score = 0.0
#     sample_size = len(tgt_corpus)

#     for idx in tqdm_notebook(range(sample_size)):
#         src_tokens = src_corpus[idx]
#         tgt_tokens = tgt_corpus[idx]

#         src_sentence = tokenizer.decode_ids((src_tokens.tolist()))  
#         tgt_sentence = tokenizer.decode_ids((tgt_tokens.tolist()))

#         reference = preprocess_sentence(tgt_sentence).split()
#         candidate = translate(src_sentence, transformer, tokenizer, tokenizer).split()

#         score = sentence_bleu([reference], candidate,
#                               smoothing_function=SmoothingFunction().method1)
#         total_score += score

#         if verbose:
#             print("Source Sentence: ", src_sentence)
#             print("Model Prediction: ", candidate)
#             print("Real: ", reference)
#             print("Score: %lf\n" % score)

#     print("Num of Sample:", sample_size)
#     print("Total Score:", total_score / sample_size)
# print("슝=3")

#### 저는 제가 정의한 트랜스포머를 쓸것이기 때문에, 위 함수를 쓰지 않겠습니다.


In [4]:
# 고친버전
def eval_bleu(src_corpus, tgt_corpus, model, verbose=True):
    total_score = 0.0
    sample_size = len(tgt_corpus)

    for idx in tqdm_notebook(range(sample_size)):
        src_tokens = src_corpus[idx]
        tgt_tokens = tgt_corpus[idx]

        src_sentence = tokenizer.decode_ids((src_tokens.tolist()))  
        tgt_sentence = tokenizer.decode_ids((tgt_tokens.tolist()))

        reference = preprocess_sentence(tgt_sentence).split()
        candidate = model.translate(src_sentence, tokenizer, tokenizer, _print=False).split()

        score = sentence_bleu([reference], candidate,
                              smoothing_function=SmoothingFunction().method1)
        total_score += score

        if verbose:
            print("Source Sentence: ", src_sentence)
            print("Model Prediction: ", candidate)
            print("Real: ", reference)
            print("Score: %lf\n" % score)

    print("Num of Sample:", sample_size)
    print("Total Score:", total_score / sample_size)
print("히히")


히히


In [18]:
eval_bleu(enc_val[:3], dec_val[:3], transformer, True)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/3 [00:00<?, ?it/s]

Source Sentence:  the river swelled rapidly because of the heavy rain ..................................
Model Prediction:  ['t', 's', ',', 'no', 'es', 's', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ingenuo']
Real:  ['el', 'r', 'o', 'subi', 'r', 'pidamente', 'a', 'causa', 'de', 'la', 'lluvia', 'fuerte', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.']
Score: 0.000000

Source Sentence:  i thought you wanted to be a dancer .....................................
Model Prediction:  ['t', 's', ',', 'no', 'es', 's', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ingenuo']
Real:  ['pens', 'que', 'quer', 'as', 'ser', 'bailarina', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.',

In [20]:
eval_bleu(enc_val[::50], dec_val[::50], transformer, verbose=False)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/24 [00:00<?, ?it/s]

Num of Sample: 24
Total Score: 0.0018504091418684698


In [21]:
import math

def beam_search_decoder(prob, beam_size):
    sequences = [[[], 1.0]]  # 생성된 문장과 점수를 저장

    for tok in prob:
        all_candidates = []

        for seq, score in sequences:
            for idx, p in enumerate(tok): # 각 단어의 확률을 총점에 누적 곱
                candidate = [seq + [idx], score * -math.log(-(p-1))]
                all_candidates.append(candidate)

        ordered = sorted(all_candidates,
                         key=lambda tup:tup[1],
                         reverse=True) # 총점 순 정렬
        sequences = ordered[:beam_size] # Beam Size에 해당하는 문장만 저장 

    return sequences
print("슝=3")

슝=3


In [22]:
vocab = {
    0: "<pad>",
    1: "까요?",
    2: "커피",
    3: "마셔",
    4: "가져",
    5: "될",
    6: "를",
    7: "한",
    8: "잔",
    9: "도",
}

prob_seq = [[0.01, 0.01, 0.60, 0.32, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01],
            [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.75, 0.01, 0.01, 0.17],
            [0.01, 0.01, 0.01, 0.35, 0.48, 0.10, 0.01, 0.01, 0.01, 0.01],
            [0.24, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.68],
            [0.01, 0.01, 0.12, 0.01, 0.01, 0.80, 0.01, 0.01, 0.01, 0.01],
            [0.01, 0.81, 0.01, 0.01, 0.01, 0.01, 0.11, 0.01, 0.01, 0.01],
            [0.70, 0.22, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01],
            [0.91, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01],
            [0.91, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01],
            [0.91, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01]]

prob_seq = np.array(prob_seq)
beam_size = 3

result = beam_search_decoder(prob_seq, beam_size)

for seq, score in result:
    sentence = ""

    for word in seq:
        sentence += vocab[word] + " "

    print(sentence, "// Score: %.4f" % score)

커피 를 가져 도 될 까요? <pad> <pad> <pad> <pad>  // Score: 42.5243
커피 를 마셔 도 될 까요? <pad> <pad> <pad> <pad>  // Score: 28.0135
마셔 를 가져 도 될 까요? <pad> <pad> <pad> <pad>  // Score: 17.8983


In [5]:
def calc_prob(src_ids, tgt_ids, model):
    enc_padding_mask, combined_mask, dec_padding_mask = \
    generate_masks(src_ids, tgt_ids)

    predictions, enc_attns, dec_attns, dec_enc_attns =\
    model(src_ids, 
            tgt_ids,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask)
    
    return tf.math.softmax(predictions, axis=-1)

In [6]:
def beam_search_decoder(sentence, 
                        src_len,
                        tgt_len,
                        model,
                        src_tokenizer,
                        tgt_tokenizer,
                        beam_size):
    sentence = preprocess_sentence(sentence)

    pieces = src_tokenizer.encode_as_pieces(sentence)
    tokens = src_tokenizer.encode_as_ids(sentence)

    src_in = tf.keras.preprocessing.sequence.pad_sequences([tokens],
                                                            maxlen=src_len,
                                                            padding='post')

    pred_cache = np.zeros((beam_size * beam_size, tgt_len), dtype=np.long)
    pred = np.zeros((beam_size, tgt_len), dtype=np.long)

    eos_flag = np.zeros((beam_size, ), dtype=np.long)
    scores = np.ones((beam_size, ))

    pred[:, 0] = tgt_tokenizer.bos_id()

    dec_in = tf.expand_dims(pred[0, :1], 0)
    prob = calc_prob(src_in, dec_in, model)[0, -1].numpy()

    for seq_pos in range(1, tgt_len):
        score_cache = np.ones((beam_size * beam_size, ))

        # init
        for branch_idx in range(beam_size):
            cache_pos = branch_idx*beam_size

            score_cache[cache_pos:cache_pos+beam_size] = scores[branch_idx]
            pred_cache[cache_pos:cache_pos+beam_size, :seq_pos] = \
            pred[branch_idx, :seq_pos]

        for branch_idx in range(beam_size):
            cache_pos = branch_idx*beam_size

            if seq_pos != 1:   # 모든 Branch를 로 시작하는 경우를 방지
                dec_in = pred_cache[branch_idx, :seq_pos]
                dec_in = tf.expand_dims(dec_in, 0)

                prob = calc_prob(src_in, dec_in, model)[0, -1].numpy()

            for beam_idx in range(beam_size):
                max_idx = np.argmax(prob)

                score_cache[cache_pos+beam_idx] *= prob[max_idx]
                pred_cache[cache_pos+beam_idx, seq_pos] = max_idx

                prob[max_idx] = -1

        for beam_idx in range(beam_size):
            if eos_flag[beam_idx] == -1: continue

            max_idx = np.argmax(score_cache)
            prediction = pred_cache[max_idx, :seq_pos+1]

            pred[beam_idx, :seq_pos+1] = prediction
            scores[beam_idx] = score_cache[max_idx]
            score_cache[max_idx] = -1

            if prediction[-1] == tgt_tokenizer.eos_id():
                eos_flag[beam_idx] = -1

    return pred

In [7]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

def calculate_bleu(reference, candidate, weights=[0.25, 0.25, 0.25, 0.25]):
    return sentence_bleu([reference],
                            candidate,
                            weights=weights,
                            smoothing_function=SmoothingFunction().method1)

def beam_bleu(reference, ids, tokenizer):
    reference = reference.split()

    total_score = 0.0
    for _id in ids:
        candidate = tokenizer.decode_ids(_id.tolist()).split()
        score = calculate_bleu(reference, candidate)

        print("Reference:", reference)
        print("Candidate:", candidate)
        print("BLEU:", calculate_bleu(reference, candidate))

        total_score += score
        
    return total_score / len(ids)

In [26]:
idx = 324

ids = \
beam_search_decoder(tokenizer.decode_ids(enc_val[idx].tolist()),
                    enc_train.shape[-1],
                    dec_train.shape[-1],
                    transformer,
                    tokenizer,
                    tokenizer,
                    beam_size=5)

bleu = beam_bleu(tokenizer.decode_ids(dec_val[idx].tolist()), ids, tokenizer)

Reference: ['nadie', 'sabe', 'c', 'mo', 'me', 'siento', '.........................................']
Candidate: ['t', 's', ',', 'no', 'es', 's', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ingenuo']
BLEU: 0
Reference: ['nadie', 'sabe', 'c', 'mo', 'me', 'siento', '.........................................']
Candidate: ['t', 's', ',', 'no', 'es', 's', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ingenuong', 'ng', 'ng', 'ingenuo']
BLEU: 0
Reference: ['nadie', 'sabe', 'c', 'mo', 'me', 'siento', '.........................................']
Candidate: ['t', 's', ',', 'no', 'es', 's', '!', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ng', 'ingenuo']
BLEU: 0
Reference: ['nadie', 'sabe', 'c', 'mo', 'me', 'siento', '.........................................']
Candidate: ['t', 's', ',', 'no', 'es', 's', '!', 'ng', 'ng'

In [27]:
import gensim.downloader as api

wv = api.load('glove-wiki-gigaword-300')





In [28]:
wv.most_similar("banana")

[('bananas', 0.6691170930862427),
 ('mango', 0.5804104208946228),
 ('pineapple', 0.5492372512817383),
 ('coconut', 0.5462778806686401),
 ('papaya', 0.541056752204895),
 ('fruit', 0.52181077003479),
 ('growers', 0.4877638816833496),
 ('nut', 0.48399588465690613),
 ('peanut', 0.48062023520469666),
 ('potato', 0.48061180114746094)]

In [29]:
import random

sample_sentence = "you know ? all you need is attention ."
sample_tokens = sample_sentence.split()

selected_tok = random.choice(sample_tokens)

result = ""
for tok in sample_tokens:
    if tok is selected_tok:
        result += wv.most_similar(tok)[0][0] + " "

    else:
        result += tok + " "

print("From:", sample_sentence)
print("To:", result)

From: you know ? all you need is attention .
To: you know ? all you needs is attention . 


In [30]:
def lexical_sub(sentence, word2vec):
    import random
    
    res = ""
    toks = sentence.split()

    try:
        _from = random.choice(toks)
        _to = word2vec.most_similar(_from)[0][0]
        
    except:   # 단어장에 없는 단어
        return None

    for tok in toks:
        if tok is _from: res += _to + " "
        else: res += tok + " "

    return res


from tqdm import tqdm

new_corpus = []

for idx in tqdm(range(3000)):
    old_src = tokenizer.decode_ids(src_corpus[idx])

    new_src = lexical_sub(old_src, wv)

    if new_src is not None: new_corpus.append(new_src)

    new_corpus.append(old_src)

print(new_corpus[:10])

100%|██████████| 3000/3000 [02:12<00:00, 22.60it/s]

['i m not as rich also i once was . ', 'i m not as rich as i once was .', 'tom talked to mary wednesday the telephone . ', 'tom talked to mary on the telephone .', 'i thought that he would come , ', 'i thought that he would come .', 'tom doesn t think for certain when mary will arrive . ', 'tom doesn t know for certain when mary will arrive .', 'i don shirts know where i am . ', 'i don t know where i am .']





In [3]:
import pandas as pd
data_path = "aiffel/Chatbot_data/ChatbotData.csv"
chat_data = pd.read_csv(data_path)
chat_data.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [4]:
count =0
for i in zip(chat_data["Q"],chat_data["A"]):
    print(*i, sep=f"{' '*(30-len(i[0])*2)}\t:\t")
    count +=1
    if count==5:break

# chat_data[["Q", "A"]]

12시 땡!                  	:	하루가 또 가네요.
1지망 학교 떨어졌어        	:	위로해 드립니다.
3박4일 놀러가고 싶다      	:	여행은 언제나 좋죠.
3박4일 정도 놀러가고 싶다	:	여행은 언제나 좋죠.
PPL 심하네                	:	눈살이 찌푸려지죠.


In [5]:
from aiffel.CloudData.nn.transformer import preprocess_sentence, tokenize
clean_corpus = []
for i,j in zip(chat_data["Q"],chat_data["A"]):
    q = mecab.morphs(preprocess_sentence(i))
    a = mecab.morphs(preprocess_sentence(j))
    clean_corpus.append((q,a))


In [6]:
q,a= zip(*clean_corpus)
q[:5]


(['12', '시', '땡', '!'],
 ['1', '지망', '학교', '떨어졌', '어'],
 ['3', '박', '4', '일', '놀', '러', '가', '고', '싶', '다'],
 ['3', '박', '4', '일', '정도', '놀', '러', '가', '고', '싶', '다'],
 ['ppl', '심하', '네'])

In [7]:
qmax = 0
amax = 0

for i in clean_corpus:
    if len(i[0]) > qmax: qmax = len(i[0])
    if len(i[1]) > amax: amax = len(i[1])

print(qmax, amax, len(q))


32 40 11823


In [8]:
qa = list(q)
add_a = list(map(lambda x:['<bos>']+x+['<eos>'], list(a)))

qa.extend(add_a)
QA, tokenizer = tokenize(qa,42,20000)
Q = QA[:11823]
A = QA[11823:]

In [9]:
A[:3]

array([[   2,  246,    9,  145,    9,   35,    1,    3,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   2,  520,   14, 1495,    1,    3,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   2,  235,   17,  701,   12,   33,    1,    3,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0]],
      dtype=int32)

In [10]:
from aiffel.CloudData.nn.transformer import Transformer, loss_function

START_TOKEN = [2]
END_TOKEN = [3]

transformer = Transformer(n_layers=4, d_model=512, n_heads=8, dff=2048,
                          src_vocab_size=20000, tgt_vocab_size=20000,
                          pos_len=100, loss_function=loss_function, dropout=0.3, shared=True)

In [24]:
hi = transformer.fit(1, Q[:1000],A[:1000])

Epoch  1: 100%|██████████| 8/8 [00:13<00:00,  1.68s/it, Loss 4.0167]


In [25]:
model_dir = "aiffel/CloudData/Model/transformer/weight"

model_path = os.path.join(model_dir, "2layer3Epoch.h5")
transformer.save_weights(model_path)


In [27]:
Transformer = None

In [9]:

from aiffel.CloudData.nn.transformer import Transformer, loss_function



transformer = Transformer(n_layers=4, d_model=512, n_heads=8, dff=2048,
                          src_vocab_size=20000, tgt_vocab_size=20000,
                          pos_len=100, loss_function=loss_function, dropout=0.3, shared=True)
transformer.fit(1,Q[:1000],A[:1000])
model_dir = "aiffel/CloudData/Model/transformer/weight"

model_path = os.path.join(model_dir, "2layer3Epoch.h5")
transformer.load_weights(model_path)

Epoch  1:  12%|█▎        | 1/8 [00:23<02:42, 23.17s/it, Loss 10.7816]


ResourceExhaustedError: 2 root error(s) found.
  (0) Resource exhausted:  OOM when allocating tensor with shape[128,41,20000] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node strided_slice_3 (defined at /aiffel/aiffel/CloudData/nn/transformer.py:608) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[gradient_tape/transformer/embedding_1/embedding_lookup/Reshape/_30]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

  (1) Resource exhausted:  OOM when allocating tensor with shape[128,41,20000] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node strided_slice_3 (defined at /aiffel/aiffel/CloudData/nn/transformer.py:608) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

0 successful operations.
0 derived errors ignored. [Op:__inference_train_step_11885]

Errors may have originated from an input operation.
Input Source operations connected to node strided_slice_3:
 transformer/dense_64/BiasAdd (defined at /aiffel/aiffel/CloudData/nn/transformer.py:563)

Input Source operations connected to node strided_slice_3:
 transformer/dense_64/BiasAdd (defined at /aiffel/aiffel/CloudData/nn/transformer.py:563)

Function call stack:
train_step -> train_step


In [None]:
START_TOKEN = 2
END_TOKEN = 3

result = transformer.translate("하.. 오늘 뭐먹지?", tokenizer,tokenizer)
result = transformer.translate("퇴근하고 싶다ㅠㅠ", tokenizer,tokenizer)
result = transformer.translate("넌 이름이 뭐니?", tokenizer,tokenizer)
# sentence = "하.. 오늘 뭐먹지?"

# pieces = mecab.morphs(sentence)
# tokens = tokenizer.texts_to_sequences([pieces])
# tokens
# _input = tf.keras.preprocessing.sequence.pad_sequences([tokens],
#                                                     maxlen=42,
#                                                     padding='post')

Input: 하.. 오늘 뭐먹지?
Predicted translation: ['저 도 하 는 게 좋 아요 . <eos> 아요 . <eos> 아요 . <eos> 아요 . <eos> 아요 . <eos> 아요 . <eos> 아요 . <eos> <eos> <eos> <eos> <eos> 아요 . <eos> . <eos> 아요 . <eos> 거 예요 .']
Input: 퇴근하고 싶다ㅠㅠ
Predicted translation: ['저 도 있 을 거 예요 . <eos> 아요 . <eos> 아요 . <eos> 아요 . <eos> 아요 . <eos> 아요 . <eos> . <eos> 아요 . <eos> <eos> <eos> <eos> <eos> <eos> . <eos> . <eos> 거 예요 . <eos> <eos>']
Input: 넌 이름이 뭐니?
Predicted translation: ['저 도 있 을 거 예요 . <eos> 아요 . <eos> 아요 . <eos> 아요 . <eos> 아요 . <eos> 아요 . <eos> 는 게 좋 아요 . <eos> <eos> <eos> 아요 . <eos> . <eos> . <eos> 거 예요 . <eos>']
