<a href="https://colab.research.google.com/github/Vaycold/Python_DL/blob/main/Text_Classification/seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Goal
   - Seq2Seq model
   - word embedding
   - time series data

## Library

In [3]:
import tensorflow as tf
from tensorflow import keras
from keras.layers import Dense
from keras.models import Sequential

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from IPython.display import Image

warnings.filterwarnings('ignore')
SEED=34

In [6]:
# DATA SET
import random
def make_raw_text(count=50000, seed=None):
    """Generate random addition/subtraction problems and their answers.

    Each sample consumes four random draws, in this order: an operand-size
    selector, operand ``a``, operand ``b`` and an operator selector.

    * Selector ``randint(0, 3) == 0`` (1 in 4): both operands are drawn from
      0..10 inclusive; otherwise both are drawn from 0..100 inclusive.
    * Selector ``randint(0, 2) == 0`` (1 in 3): the problem is an addition;
      otherwise it is a subtraction, so answers may be negative.

    Parameters
    ----------
    count : int, default 50000
        Number of samples to generate. ``0`` yields two empty lists.
    seed : int or None, default None
        When given, a private ``random.Random(seed)`` is used so the result
        is reproducible and the global ``random`` state is left untouched.
        When ``None``, the module-level ``random`` generator is used.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(train_text, train_answer)``: parallel lists such as
        ``'48 - 69'`` and ``'-21'``.
    """
    # Both the module and a Random instance expose randint(), so the sampling
    # code below is identical for the seeded and unseeded paths.
    rng = random if seed is None else random.Random(seed)

    train_text = []
    train_answer = []

    for _ in range(count):
        # One selector decides the range for BOTH operands of this sample.
        upper = 10 if rng.randint(0, 3) == 0 else 100
        a = rng.randint(0, upper)
        b = rng.randint(0, upper)

        if rng.randint(0, 2) == 0:
            train = f'{a} + {b}'
            answer = f'{a + b}'
        else:
            train = f'{a} - {b}'
            answer = f'{a - b}'

        train_text.append(train)
        train_answer.append(answer)

    return train_text, train_answer

In [9]:
# Build the dataset with the default sample count: two parallel lists holding
# the expression strings and their answer strings.
train_text, train_answer = make_raw_text()

In [16]:
print(train_text[:10], '\n',train_answer[:10])

['48 - 69', '16 + 40', '72 - 88', '5 - 5', '8 - 6', '56 + 23', '23 - 68', '57 - 42', '7 + 8', '4 - 6'] 
 ['-21', '56', '-16', '0', '2', '79', '-45', '15', '15', '-2']


In [17]:
train_text[0], train_answer[0]

('48 - 69', '-21')

## Preprocessing

In [18]:
# Character-level vocabulary (token -> integer id), not a bag-of-words count.
# Covers every token that appears in train_text / train_answer:
#   '0'..'9' -> 0..9, '+' -> 10, '-' -> 11, 'PAD' -> 12, 'EOS' -> 13
vocab = {
    token: token_id
    for token_id, token in enumerate(
        [str(digit) for digit in range(10)] + ["+", "-", "PAD", "EOS"]
    )
}
vocab

{'+': 10,
 '-': 11,
 '0': 0,
 '1': 1,
 '2': 2,
 '3': 3,
 '4': 4,
 '5': 5,
 '6': 6,
 '7': 7,
 '8': 8,
 '9': 9,
 'EOS': 13,
 'PAD': 12}

In [34]:
# Inverse of vocab: integer id -> token.
invocab = dict(zip(vocab.values(), vocab.keys()))
invocab

{0: '0',
 1: '1',
 2: '2',
 3: '3',
 4: '4',
 5: '5',
 6: '6',
 7: '7',
 8: '8',
 9: '9',
 10: '+',
 11: '-',
 12: 'PAD',
 13: 'EOS'}

In [43]:
# Convert plain text to a sequence of token ids; EOS = End of Sentence.
def plain2bow(text, vocab):
    """Encode ``text`` character by character as an array of token ids.

    Whitespace only separates words and is not encoded. The id of the
    ``'EOS'`` token is appended after the last character.

    Parameters
    ----------
    text : str
        Plain text such as ``'48 - 69'`` or ``'-21'``.
    vocab : mapping
        Maps each single character of ``text`` and the key ``'EOS'`` to an id.

    Returns
    -------
    numpy.ndarray
        1-D array of ids ending with ``vocab['EOS']``.

    Raises
    ------
    KeyError
        If a character of ``text`` (or ``'EOS'``) is missing from ``vocab``.
    """
    token_ids = []
    for chunk in text.split():
        for symbol in chunk:
            token_ids.append(vocab[symbol])
    token_ids.append(vocab['EOS'])
    return np.array(token_ids)

In [44]:
# Encode every expression and every answer as an id array terminated by EOS.
# The arrays still have different lengths at this point.
train_bow_text = [plain2bow(expression, vocab) for expression in train_text]
train_bow_answer = [plain2bow(result, vocab) for result in train_answer]

In [45]:
train_bow_text[:10]

[array([ 4,  8, 11,  6,  9, 13]),
 array([ 1,  6, 10,  4,  0, 13]),
 array([ 7,  2, 11,  8,  8, 13]),
 array([ 5, 11,  5, 13]),
 array([ 8, 11,  6, 13]),
 array([ 5,  6, 10,  2,  3, 13]),
 array([ 2,  3, 11,  6,  8, 13]),
 array([ 5,  7, 11,  4,  2, 13]),
 array([ 7, 10,  8, 13]),
 array([ 4, 11,  6, 13])]

In [48]:
# Pad with the PAD id so each set of sequences becomes one rectangular array.
# Inputs are padded at the front ('pre', the pad_sequences default); answers
# are padded at the back ('post').
train_bow_text = tf.keras.preprocessing.sequence.pad_sequences(
    train_bow_text,
    padding='pre',
    value=vocab['PAD'],
)
train_bow_answer = tf.keras.preprocessing.sequence.pad_sequences(
    train_bow_answer,
    padding='post',
    value=vocab['PAD'],
)

In [49]:
train_bow_text.shape, train_bow_answer.shape

((50000, 8), (50000, 5))

In [52]:
train_bow_text[:10]

array([[12, 12,  4,  8, 11,  6,  9, 13],
       [12, 12,  1,  6, 10,  4,  0, 13],
       [12, 12,  7,  2, 11,  8,  8, 13],
       [12, 12, 12, 12,  5, 11,  5, 13],
       [12, 12, 12, 12,  8, 11,  6, 13],
       [12, 12,  5,  6, 10,  2,  3, 13],
       [12, 12,  2,  3, 11,  6,  8, 13],
       [12, 12,  5,  7, 11,  4,  2, 13],
       [12, 12, 12, 12,  7, 10,  8, 13],
       [12, 12, 12, 12,  4, 11,  6, 13]], dtype=int32)

In [54]:
train_bow_answer[:10]

array([[11,  2,  1, 13, 12],
       [ 5,  6, 13, 12, 12],
       [11,  1,  6, 13, 12],
       [ 0, 13, 12, 12, 12],
       [ 2, 13, 12, 12, 12],
       [ 7,  9, 13, 12, 12],
       [11,  4,  5, 13, 12],
       [ 1,  5, 13, 12, 12],
       [ 1,  5, 13, 12, 12],
       [11,  2, 13, 12, 12]], dtype=int32)

## Split the data

## Preprocessing

## Preprocessing

## Preprocessing