<a href="https://colab.research.google.com/github/Nobu90/scaling-broccoli/blob/main/chatbot2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. 챗봇 훈련데이터 전처리 과정이 체계적으로 진행되었는가?

챗봇 훈련데이터를 위한 전처리와 augmentation이 적절히 수행되어 3만개 가량의 훈련데이터셋이 구축되었다.

2. transformer 모델을 활용한 챗봇 모델이 과적합을 피해 안정적으로 훈련되었는가?

과적합을 피할 수 있는 하이퍼파라미터 셋이 적절히 제시되었다.

3. 챗봇이 사용자의 질문에 그럴듯한 형태로 답하는 사례가 있는가?

주어진 예문을 포함하여 챗봇에 던진 질문에 적절히 답하는 사례가 제출되었다.

# 환경 설정

In [1]:
!curl -s https://raw.githubusercontent.com/teddylee777/machine-learning/master/99-Misc/01-Colab/mecab-colab.sh | bash

--2023-03-17 03:23:11--  https://www.dropbox.com/s/9xls0tgtf3edgns/mecab-0.996-ko-0.9.2.tar.gz?dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.69.18, 2620:100:6035:18::a27d:5512
Connecting to www.dropbox.com (www.dropbox.com)|162.125.69.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/dl/9xls0tgtf3edgns/mecab-0.996-ko-0.9.2.tar.gz [following]
--2023-03-17 03:23:12--  https://www.dropbox.com/s/dl/9xls0tgtf3edgns/mecab-0.996-ko-0.9.2.tar.gz
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc1a6fe6aac082f0cf72205fc0a7.dl.dropboxusercontent.com/cd/0/get/B4a8h74t7K_bXCDJS8sgHtOJQBAGeElNyp_K2xW_kHYdyDOkNgA8PHyLs9BoIXUSBOZmyr9hc8g-6qzPO42exphKEqH5VO2cZuQLcaa0hIUZgGliacLeNsNwf0U8Huhlk9_QBM6zSUqWE_-6CSoM8NU87T4ugrg1mBtBo6mEfa1vr76Bdj6QY9izazjD26xudQs/file?dl=1# [following]
--2023-03-17 03:23:12--  https://uc1a6fe6aac082f0cf72205fc0a7.dl.dropboxusercontent.com/cd/0/get/B4a8h74t7

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
import gensim
import re
import random

from konlpy.tag import Mecab
from pandas.core.computation.parsing import token
from tqdm.notebook import tqdm

print(np.__version__)
print(pd.__version__)
print(tf.__version__)
print(nltk.__version__)
print(gensim.__version__)

1.22.4
1.4.4
2.11.0
3.7
3.6.0


# 드라이브 연결

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 데이터 읽기

In [4]:
file_path = '/content/drive/MyDrive/Colab Notebooks/aiffel/ GD12/data/ChatbotData.csv'

raw_data = pd.read_csv(file_path)

que_sentences = raw_data['Q']
ans_sentences = raw_data['A']
label_data = raw_data['label']

print(len(que_sentences))
print(len(ans_sentences))
print(len(label_data))
print(que_sentences[:5])
print(ans_sentences[:5])

11823
11823
11823
0             12시 땡!
1        1지망 학교 떨어졌어
2       3박4일 놀러가고 싶다
3    3박4일 정도 놀러가고 싶다
4            PPL 심하네
Name: Q, dtype: object
0     하루가 또 가네요.
1      위로해 드립니다.
2    여행은 언제나 좋죠.
3    여행은 언제나 좋죠.
4     눈살이 찌푸려지죠.
Name: A, dtype: object


# 전처리 & 토큰화

In [5]:
def preprocess_sentence(sentences):
  sentences = sentences.lower()
  sentences = re.sub('[^a-zA-Z0-9가-힣ㄱ-ㅎㅏ-ㅣ!@#$%^&*(),.?":{}|<>_+-=]', '', sentences)
  return sentences

In [6]:
def tokenizer(sentences, max_token_count):
  tokenizer = Mecab()
  tokenized_sentences = []

  for sentence in sentences:
      tokenized_sentence = tokenizer.morphs(sentence)
     
      if len(tokenized_sentence) <= max_token_count:
        tokenized_sentences.append(tokenized_sentence)
      else: 
        tokenized_sentences.append([''])
        
  return tokenized_sentences

In [7]:
def unique_sentence(sentences):
  unique_sentences = set()
  result = []

  for sentence in sentences:
    if sentence not in unique_sentences:
        result.append(sentence)
        unique_sentences.add(sentence)
    else:
        result.append("")

  return result

In [8]:
def build_corpus(que_sentences, ans_sentences, max_token_count):
  que_preprocessed = list(map(preprocess_sentence, que_sentences))
  ans_preprocessed = list(map(preprocess_sentence, ans_sentences))
  
  que_unique = unique_sentence(que_preprocessed)
  ans_unique = unique_sentence(ans_preprocessed)

  que_corpus = tokenizer(que_unique, max_token_count)
  ans_corpus = tokenizer(ans_unique, max_token_count)

  return que_corpus, ans_corpus

In [9]:
max_token_count = 20

que_corpus, ans_corpus = build_corpus(que_sentences, ans_sentences, max_token_count)

print(len(que_corpus))
print(len(ans_corpus))
print(que_corpus[:5])
print(ans_corpus[:5])

11823
11823
[['12', '시', '땡', '!'], ['1', '지망', '학교', '떨어졌', '어'], ['3', '박', '4', '일', '놀', '러', '가', '고', '싶', '다'], ['3', '박', '4', '일', '정도', '놀', '러', '가', '고', '싶', '다'], ['ppl', '심하', '네']]
[['하루', '가', '또', '가', '네요', '.'], ['위로', '해', '드립니다', '.'], ['여행', '은', '언제나', '좋', '죠', '.'], [], ['눈살', '이', '찌푸려', '지', '죠', '.']]


# w2v 불러오기

In [10]:
word2vec_model = gensim.models.Word2Vec.load("/content/drive/MyDrive/Colab Notebooks/aiffel/ GD12/data/ko.bin")

In [11]:
word2vec_model.most_similar("학교")

  word2vec_model.most_similar("학교")


[('학교의', 0.7560996413230896),
 ('강습소', 0.7425637245178223),
 ('중고등학교', 0.7386142015457153),
 ('전문학교', 0.7356827855110168),
 ('사립학교', 0.7347193956375122),
 ('소학교', 0.7305554747581482),
 ('여학교', 0.7091007232666016),
 ('사범학교', 0.6901223659515381),
 ('대학', 0.6897724866867065),
 ('학원', 0.6869212985038757)]

# 데이터 증강

In [12]:
def lexical_sub(sentence, word2vec):
    res = []

    try:
        _from = random.choice(sentence)
        _to = word2vec_model.most_similar(_from)[0][0]
    except:   
        return None

    for tok in sentence:
        if tok == _from:
            res.append(_to)
        else:
            res.append(tok)
    return res

In [13]:
def augment_corpus(que_corpus, ans_corpus, word2vec_model):
    new_corpus = []
    new_target = [] 
  
    for i in tqdm(range(len(que_corpus))):
      old_src = que_corpus[i]
      old_tgt = ans_corpus[i]

      new_src = lexical_sub(old_src, word2vec_model)
      new_tgt = lexical_sub(old_tgt, word2vec_model)
                         
      if new_src is not None: 
        new_corpus.append(new_src)
        new_target.append(old_tgt)

      if new_tgt is not None: 
         new_corpus.append(old_src)
         new_target.append(new_tgt)

    return new_corpus, new_target

In [14]:
first_aug_que, first_aug_ans = augment_corpus(que_corpus, ans_corpus, word2vec_model)
second_aug_que, second_aug_ans = augment_corpus(que_corpus, ans_corpus, word2vec_model)
third_aug_que, third_aug_ans= augment_corpus(que_corpus, ans_corpus, word2vec_model)

  0%|          | 0/11823 [00:00<?, ?it/s]

  _to = word2vec_model.most_similar(_from)[0][0]


  0%|          | 0/11823 [00:00<?, ?it/s]

  0%|          | 0/11823 [00:00<?, ?it/s]

In [15]:
agumented_que = first_aug_que + second_aug_que + third_aug_que
agumented_ans = first_aug_ans + second_aug_ans + third_aug_ans

print(len(agumented_que))
print(len(agumented_ans))

49530
49530


# 벡터화

In [16]:
for i in range(len(ans_corpus)):
    ans_corpus[i] = ["<start>"] + ans_corpus[i] + ["<end>"]

In [65]:
VOCAB_SIZE = 10000

total_corpus = que_corpus + ans_corpus

from collections import Counter
word_counts = Counter()
for sentence in total_corpus:
    for word in sentence:
        word_counts[word] += 1

vocab = [word for word, count in word_counts.most_common(VOCAB_SIZE)]

def vectorize(sentences, word2idx, max_len):
    vectors = []
    for sentence in sentences:
        vector = [word2idx[word] for word in sentence if word in word2idx]
        vector = vector[:max_len]
        vector += [word2idx['<pad>']] * (max_len - len(vector))
        vectors.append(vector)
    return np.array(vectors)

word2idx = {word: i+4 for i, word in enumerate(vocab)}
word2idx['<pad>'] = 0
word2idx['<start>'] = 1
word2idx['<end>'] = 2
word2idx['<unk>'] = 3

max_len = 10
enc_train = vectorize(que_corpus, word2idx, max_len)
dec_train = vectorize(ans_corpus, word2idx, max_len)

In [66]:
type(enc_train)

numpy.ndarray

In [57]:
BATCH_SIZE = 64
train_dataset = tf.data.Dataset.from_tensor_slices((enc_train, dec_train)).batch(BATCH_SIZE)
train_dataset

<BatchDataset element_spec=(TensorSpec(shape=(None, 10), dtype=tf.int64, name=None), TensorSpec(shape=(None, 10), dtype=tf.int64, name=None))>

# 트랜스포머 구현

## 포지셔널 인코딩

In [48]:
def positional_encoding(pos, d_model):
    def cal_angle(position, i):
        return position / np.power(10000, (2*(i//2)) / np.float32(d_model))

    def get_posi_angle_vec(position):
        return [cal_angle(position, i) for i in range(d_model)]

    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(pos)])

    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])

    return sinusoid_table

## 패딩 마스크

In [49]:
def generate_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def generate_lookahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask

def generate_masks(src, tgt):
    enc_mask = generate_padding_mask(src)
    dec_enc_mask = generate_padding_mask(src)

    dec_lookahead_mask = generate_lookahead_mask(tgt.shape[1])
    dec_tgt_padding_mask = generate_padding_mask(tgt)
    dec_mask = tf.maximum(dec_tgt_padding_mask, dec_lookahead_mask)

    return enc_mask, dec_enc_mask, dec_mask

## 멀티 헤드 어텐션

In [50]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        
        self.depth = d_model // self.num_heads
        
        self.W_q = tf.keras.layers.Dense(d_model)
        self.W_k = tf.keras.layers.Dense(d_model)
        self.W_v = tf.keras.layers.Dense(d_model)
        
        self.linear = tf.keras.layers.Dense(d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask):
        d_k = tf.cast(K.shape[-1], tf.float32)
        QK = tf.matmul(Q, K, transpose_b=True)

        scaled_qk = QK / tf.math.sqrt(d_k)

        if mask is not None: scaled_qk += (mask * -1e9)  

        attentions = tf.nn.softmax(scaled_qk, axis=-1)
        out = tf.matmul(attentions, V)

        return out, attentions
        

    def split_heads(self, x):
        bsz = x.shape[0]
        split_x = tf.reshape(x, (bsz, -1, self.num_heads, self.depth))
        split_x = tf.transpose(split_x, perm=[0, 2, 1, 3])

        return split_x

    def combine_heads(self, x):
        bsz = x.shape[0]
        combined_x = tf.transpose(x, perm=[0, 2, 1, 3])
        combined_x = tf.reshape(combined_x, (bsz, -1, self.d_model))

        return combined_x

    
    def call(self, Q, K, V, mask):
        WQ = self.W_q(Q)
        WK = self.W_k(K)
        WV = self.W_v(V)
        
        WQ_splits = self.split_heads(WQ)
        WK_splits = self.split_heads(WK)
        WV_splits = self.split_heads(WV)
        
        out, attention_weights = self.scaled_dot_product_attention(
            WQ_splits, WK_splits, WV_splits, mask)
                        
        out = self.combine_heads(out)
        out = self.linear(out)
            
        return out, attention_weights

## FFN

In [51]:
class PoswiseFeedForwardNet(tf.keras.layers.Layer):
    def __init__(self, d_model, d_ff):
        super(PoswiseFeedForwardNet, self).__init__()
        self.d_model = d_model
        self.d_ff = d_ff

        self.fc1 = tf.keras.layers.Dense(d_ff, activation='relu')
        self.fc2 = tf.keras.layers.Dense(d_model)

    def call(self, x):
        out = self.fc1(x)
        out = self.fc2(out)
            
        return out

## 인코더&디코더

In [52]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, n_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()

        self.enc_self_attn = MultiHeadAttention(d_model, n_heads)
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.do = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, mask):
        '''
        Multi-Head Attention
        '''
        residual = x
        out = self.norm_1(x)
        out, enc_attn = self.enc_self_attn(out, out, out, mask)
        out = self.do(out)
        out += residual
        
        '''
        Position-Wise Feed Forward Network
        '''
        residual = out
        out = self.norm_2(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual
        
        return out, enc_attn

In [53]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()

        self.dec_self_attn = MultiHeadAttention(d_model, num_heads)
        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads)

        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.do = tf.keras.layers.Dropout(dropout)
    
    def call(self, x, enc_out, dec_enc_mask, padding_mask):
        '''
        Masked Multi-Head Attention
        '''
        residual = x
        out = self.norm_1(x)
        out, dec_attn = self.dec_self_attn(out, out, out, padding_mask)
        out = self.do(out)
        out += residual

        '''
        Multi-Head Attention
        '''
        residual = out
        out = self.norm_2(out)
        # Q, K, V 순서에 주의하세요!
        out, dec_enc_attn = self.enc_dec_attn(Q=out, K=enc_out, V=enc_out, mask=dec_enc_mask)
        out = self.do(out)
        out += residual
        
        '''
        Position-Wise Feed Forward Network
        '''
        residual = out
        out = self.norm_3(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual

        return out, dec_attn, dec_enc_attn

In [54]:
class Encoder(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    dropout):
        super(Encoder, self).__init__()
        self.n_layers = n_layers
        self.enc_layers = [EncoderLayer(d_model, n_heads, d_ff, dropout) 
                        for _ in range(n_layers)]
    
        self.do = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, mask):
        out = x
    
        enc_attns = list()
        for i in range(self.n_layers):
            out, enc_attn = self.enc_layers[i](out, mask)
            enc_attns.append(enc_attn)
        
        return out, enc_attns

In [55]:
class Decoder(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    dropout):
        super(Decoder, self).__init__()
        self.n_layers = n_layers
        self.dec_layers = [DecoderLayer(d_model, n_heads, d_ff, dropout) 
                            for _ in range(n_layers)]
                            
    def call(self, x, enc_out, dec_enc_mask, padding_mask):
        out = x
    
        dec_attns = list()
        dec_enc_attns = list()
        for i in range(self.n_layers):
            out, dec_attn, dec_enc_attn = \
            self.dec_layers[i](out, enc_out, dec_enc_mask, padding_mask)

            dec_attns.append(dec_attn)
            dec_enc_attns.append(dec_enc_attn)

        return out, dec_attns, dec_enc_attns

## 트랜스포머

In [56]:
class Transformer(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    src_vocab_size,
                    tgt_vocab_size,
                    pos_len,
                    dropout=0.2,
                    shared_fc=True,
                    shared_emb=False):
        super(Transformer, self).__init__()
        
        self.d_model = tf.cast(d_model, tf.float32)

        if shared_emb:
            self.enc_emb = self.dec_emb = \
            tf.keras.layers.Embedding(src_vocab_size, d_model)
        else:
            self.enc_emb = tf.keras.layers.Embedding(src_vocab_size, d_model)
            self.dec_emb = tf.keras.layers.Embedding(tgt_vocab_size, d_model)

        self.pos_encoding = positional_encoding(pos_len, d_model)
        self.do = tf.keras.layers.Dropout(dropout)

        self.encoder = Encoder(n_layers, d_model, n_heads, d_ff, dropout)
        self.decoder = Decoder(n_layers, d_model, n_heads, d_ff, dropout)

        self.fc = tf.keras.layers.Dense(tgt_vocab_size)

        self.shared_fc = shared_fc

        if shared_fc:
            self.fc.set_weights(tf.transpose(self.dec_emb.weights))

    def embedding(self, emb, x):
        seq_len = x.shape[1]

        out = emb(x)

        if self.shared_fc: out *= tf.math.sqrt(self.d_model)

        out += self.pos_encoding[np.newaxis, ...][:, :seq_len, :]
        out = self.do(out)

        return out

## 하이퍼 파라미터 설정


In [58]:
transformer = Transformer(
    n_layers=1,
    d_model=368,
    n_heads=8,
    d_ff=1024,
    src_vocab_size=VOCAB_SIZE,
    tgt_vocab_size=VOCAB_SIZE,
    pos_len=200,
    dropout=0.2,
    shared_fc=True,
    shared_emb=True)
		
d_model = 368

## 학습율 설정

In [59]:
class LearningRateScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(LearningRateScheduler, self).__init__()
        
        self.d_model = d_model
        self.warmup_steps = warmup_steps
    
    def __call__(self, step):
        arg1 = tf.cast(step, tf.float32) ** -0.5
        arg2 = tf.cast(step, tf.float32) * (self.warmup_steps ** -1.5)
        
        return (self.d_model ** -0.5) * tf.math.minimum(arg1, arg2)

In [60]:
learning_rate = LearningRateScheduler(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate,
                                        beta_1=0.9,
                                        beta_2=0.98, 
                                        epsilon=1e-9)

## 손실 함수

In [61]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

# 학습

In [62]:
@tf.function()
def train_step(src, tgt, model, optimizer):
    tgt_in = tgt[:, :-1]  
    gold = tgt[:, 1:]    

    enc_mask, dec_enc_mask, dec_mask = generate_masks(src, tgt_in)

    with tf.GradientTape() as tape:
        predictions, enc_attns, dec_attns, dec_enc_attns = \
        model(src, tgt_in, enc_mask, dec_enc_mask, dec_mask)
        loss = loss_function(gold, predictions)

    gradients = tape.gradient(loss, model.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return loss, enc_attns, dec_attns, dec_enc_attns

In [67]:
EPOCHS = 3

for epoch in range(EPOCHS):
    total_loss = 0
    
    dataset_count = tf.data.experimental.cardinality(train_dataset).numpy()
    tqdm_bar = tqdm(total=dataset_count)
    for step, (enc_batch, dec_batch) in enumerate(train_dataset):
        batch_loss, enc_attns, dec_attns, dec_enc_attns = \
        train_step(enc_batch,
                    dec_batch,
                    transformer,
                    optimizer)

        total_loss += batch_loss
        
        tqdm_bar.set_description_str('Epoch %2d' % (epoch + 1))
        tqdm_bar.set_postfix_str('Loss %.4f' % (total_loss.numpy() / (step + 1)))
        tqdm_bar.update()

  0%|          | 0/185 [00:00<?, ?it/s]

TypeError: ignored