In [1]:
import os
import tensorflow as tf
from tqdm import tqdm
import pandas as pd
import numpy as np

from tokenization_kobert import KoBertTokenizer

In [2]:
tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'KoBertTokenizer'.


In [3]:
Folder_Path = 'D:/Code/01_Project/02_Study/2022_스터디/04_RNN/00_Data/'
Train_Data_Path = os.path.join(Folder_Path, "nsmc_ratings_train.txt")
Test_Data_Path = os.path.join(Folder_Path, "nsmc_ratings_test.txt")

In [4]:
Train_Data = pd.read_csv(Train_Data_Path, sep = "\t", encoding = "utf-8").dropna().reset_index(drop = True)
Test_Data = pd.read_csv(Test_Data_Path, sep = "\t", encoding = "utf-8").dropna().reset_index(drop = True)

In [5]:
Train_Text = list(Train_Data.document)
Train_Label = list(Train_Data.label)

In [6]:
Train_Data.groupby('label').count()

Unnamed: 0_level_0,id,document
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,75170,75170
1,74825,74825


In [30]:
vocab_size = tokenizer.vocab_size +1
hidden_size = 64
batch_size = 512
max_length = 128
learning_rate = 0.001

In [8]:
Train_Sequence = []
for i in tqdm(range(len(Train_Text))):
    sentence = tokenizer.tokenize('[CLS] ' + Train_Text[i] + '[SEP]')
    Train_Sequence.append(tokenizer.convert_tokens_to_ids(sentence))

100%|███████████████████████████████████████████████████████████████████████| 149995/149995 [00:12<00:00, 11933.75it/s]


In [9]:
Sequence = tf.keras.preprocessing.sequence.pad_sequences(Train_Sequence, padding='post', maxlen = max_length)
Label = np.expand_dims(np.array(Train_Label), axis=1)

In [10]:
Sequence.shape, Label.shape

((149995, 128), (149995, 1))

In [36]:
Inputs = tf.keras.Input(shape = (max_length))
Embedding_Layers = tf.keras.layers.Embedding(vocab_size, hidden_size)(Inputs)
Layer01_RNN_Forward = tf.keras.layers.SimpleRNN(hidden_size, return_sequences=False, go_backwards=False)(Embedding_Layers)
Classifier = tf.keras.layers.Dense(1, activation = 'sigmoid')(Layer01_RNN_Forward)

In [37]:
model1 = tf.keras.Model(Inputs, Classifier)
model1.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 128)]             0         
                                                                 
 embedding_4 (Embedding)     (None, 128, 64)           512192    
                                                                 
 simple_rnn_4 (SimpleRNN)    (None, 64)                8256      
                                                                 
 dense_4 (Dense)             (None, 1)                 65        
                                                                 
Total params: 520,513
Trainable params: 520,513
Non-trainable params: 0
_________________________________________________________________


In [38]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience = 2)
model1.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate),
              loss = tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])

In [39]:
model1.fit(Sequence, Label, epochs=20, batch_size= batch_size, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


<keras.callbacks.History at 0x16a5a20bee0>

In [35]:
Test_Text = list(Test_Data.document)
Test_Label = list(Test_Data.label)

Test_Sequence = []
for i in tqdm(range(len(Test_Text))):
    sentence = tokenizer.tokenize('[CLS] ' + Test_Text[i] + '[SEP]')
    Test_Sequence.append(tokenizer.convert_tokens_to_ids(sentence))
    
test_text = tf.keras.preprocessing.sequence.pad_sequences(Test_Sequence, padding='post', maxlen = max_length)
test_label = np.expand_dims(np.array(Test_Label), axis=1)

100%|█████████████████████████████████████████████████████████████████████████| 49997/49997 [00:04<00:00, 11964.28it/s]


In [40]:
model1.evaluate(test_text, test_label)



[0.5962127447128296, 0.6896213889122009]

In [68]:
vocab_size = tokenizer.vocab_size +1
hidden_size = 32
batch_size = 512
max_length = 128
learning_rate = 0.001

In [79]:
Inputs = tf.keras.Input(shape = (max_length))
Embedding_Layers = tf.keras.layers.Embedding(vocab_size, hidden_size)(Inputs)
Layer01_GRU_Forward = tf.keras.layers.GRU(hidden_size, activation='relu', return_sequences=True, go_backwards=False, dropout = 0.1)(Embedding_Layers)

Layer02_GRU_Backward = tf.keras.layers.GRU(hidden_size, activation='relu', return_sequences=True, go_backwards=True, dropout = 0.1)(Embedding_Layers)

Layer_Concatenate = tf.keras.layers.Concatenate()([Layer01_GRU_Forward, Layer02_GRU_Backward])

Flatten = tf.keras.layers.Flatten()(Layer_Concatenate)

Classifier = tf.keras.layers.Dense(1, activation = 'sigmoid')(Flatten)



In [80]:
model2 = tf.keras.Model(Inputs, Classifier)
model2.summary()

Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_19 (InputLayer)          [(None, 128)]        0           []                               
                                                                                                  
 embedding_18 (Embedding)       (None, 128, 32)      256096      ['input_19[0][0]']               
                                                                                                  
 gru_14 (GRU)                   (None, 128, 32)      6336        ['embedding_18[0][0]']           
                                                                                                  
 gru_15 (GRU)                   (None, 128, 32)      6336        ['embedding_18[0][0]']           
                                                                                            

In [81]:
model2.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate),
              loss = tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])

In [82]:
model2.fit(Sequence, Label, epochs=20, batch_size= batch_size, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


<keras.callbacks.History at 0x16a61018e20>

In [84]:
model2.evaluate(test_text, test_label)



[0.3662944436073303, 0.8387903571128845]

--------------------

In [27]:
Num_Sequence = []
for i in tqdm(range(len(Train_Sequence))):
    Num_Sequence.append(len(Train_Sequence[i]))

100%|█████████████████████████████████████████████████████████████████████| 149995/149995 [00:00<00:00, 2584332.06it/s]


In [32]:
temp_num_len = np.array(Num_Sequence)

In [39]:
np.percentile(temp_num_len, [25, 50, 75, 95, 99, 100])

array([ 12.,  19.,  29.,  66.,  88., 143.])