In [1]:
import numpy as np
import os
import preprocessing 

import tensorflow as tf
from tensorflow import keras
import time
import re

In [2]:
import pandas as pd
train_dataset = pd.read_csv('../DATA/train_spacing.csv')
test_dataset = pd.read_csv('../DATA/test_spacing.csv')

In [3]:
from konlpy.tag import Mecab
mecab = Mecab()

In [4]:
%%time
def pos(x):
    try:
        text = ''
        for word, pos in mecab.pos(str(x)):
            if pos[0] not in ['J','I','E']:
                if type(re.search("\W+|[0-9]", word))!=re.Match: 
                    # and len(word)!=1:
                    text+=" "+word
        return text.strip()
    
    except:
        pass

train_dataset["pos"] = train_dataset["document"].apply(pos)
test_dataset["pos"] = test_dataset["document"].apply(pos)

CPU times: user 12 s, sys: 37.4 ms, total: 12 s
Wall time: 12 s


In [5]:
vocab_size = [] 

for line in train_dataset['pos']:
    vocab_size.extend(str(line).split())
vocab_size = len(set(vocab_size))

In [6]:
# tokenizing
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(oov_token='<oov>')
tokenizer.fit_on_texts(train_dataset['pos'])
word_index = tokenizer.word_index
vocabulary_inv = tokenizer.index_word

# padding
from tensorflow.keras.preprocessing.sequence import pad_sequences
train_seq = tokenizer.texts_to_sequences(train_dataset['pos'])
test_seq = tokenizer.texts_to_sequences(test_dataset['pos'])
train_pad = pad_sequences(train_seq, maxlen=40, padding='pre', truncating='pre')
test_pad = pad_sequences(test_seq, maxlen=40, padding='pre', truncating='pre')

In [7]:
embedding_dim = 200
filter_sizes = (2, 3, 4, 5)
num_filters = 100
dropout = 0.5
hidden_dims = 100

batch_size = 50
num_epochs = 10
min_word_count = 1
context = 10

In [8]:
from gensim.models import word2vec, fasttext

In [9]:
from gensim.models import Word2Vec, FastText

In [10]:
embedding_model = word2vec.Word2Vec.load("../DATA/ko.bin")

In [11]:
vocabulary_inv.update({0:'pad'})

In [12]:
same_variance = np.var(embedding_model.syn1neg)

  same_variance = np.var(embedding_model.syn1neg)


In [13]:
embedding_weights = {key: embedding_model[word] if word in embedding_model else np.random.uniform(-same_variance, same_variance, embedding_model.vector_size) for key, word in vocabulary_inv.items()}

  embedding_weights = {key: embedding_model[word] if word in embedding_model else np.random.uniform(-same_variance, same_variance, embedding_model.vector_size) for key, word in vocabulary_inv.items()}
  embedding_weights = {key: embedding_model[word] if word in embedding_model else np.random.uniform(-same_variance, same_variance, embedding_model.vector_size) for key, word in vocabulary_inv.items()}


In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Flatten, Dropout
from tensorflow.keras import layers, models

In [22]:
tf.random.set_seed(2021)
np.random.seed(2021)

In [23]:
# Convolutional block
input_shape=(40, )
conv_blocks = []

model_input = keras.layers.Input(shape=input_shape)

z = keras.layers.Embedding(len(word_index)+1, embedding_dim, input_length=len(train_dataset['label']), name="embedding")(model_input)
z = keras.layers.Dropout(dropout)(z)
z.shape

TensorShape([None, 40, 200])

In [24]:
for sz in filter_sizes:
    conv = keras.layers.Conv1D(filters=num_filters,
                         kernel_size=sz,
                         padding="Same",
                         activation="relu",
                         strides=1)(z)
    conv = keras.layers.MaxPooling1D(pool_size=2)(conv)
    conv = keras.layers.Flatten()(conv)
    conv_blocks.append(conv)
    
z = keras.layers.Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
z = keras.layers.Dropout(dropout)(z)
z = keras.layers.Dense(512, activation="relu")(z)
z = keras.layers.Dropout(dropout)(z)
# z = keras.layers.Dense(256, activation="relu")(z)
# z = keras.layers.Dropout(dropout)(z)
z = keras.layers.Dense(128, activation="relu")(z)
z = keras.layers.Dropout(dropout)(z)
model_output = keras.layers.Dense(1, activation="sigmoid")(z)

model = keras.Model(model_input, model_output)

In [25]:
model.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])

In [26]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 40)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 40, 200)      9284600     input_2[0][0]                    
__________________________________________________________________________________________________
dropout_4 (Dropout)             (None, 40, 200)      0           embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 40, 100)      40100       dropout_4[0][0]                  
____________________________________________________________________________________________

In [27]:
weights = np.array([v for v in embedding_weights.values()])
print("Initializing embedding layer with word2vec weights, shape", weights.shape)
embedding_layer = model.get_layer("embedding")
embedding_layer.set_weights([weights])

Initializing embedding layer with word2vec weights, shape (46423, 200)


In [28]:
model.fit(train_pad, train_dataset['label'], batch_size=500, epochs=23, validation_data=(test_pad, test_dataset['label']),verbose=2)

Epoch 1/23
300/300 - 74s - loss: 0.7034 - accuracy: 0.5684 - val_loss: 0.5719 - val_accuracy: 0.7329
Epoch 2/23
300/300 - 73s - loss: 0.5332 - accuracy: 0.7330 - val_loss: 0.4629 - val_accuracy: 0.7991
Epoch 3/23
300/300 - 73s - loss: 0.4631 - accuracy: 0.7833 - val_loss: 0.4349 - val_accuracy: 0.8204
Epoch 4/23
300/300 - 73s - loss: 0.4271 - accuracy: 0.8054 - val_loss: 0.4028 - val_accuracy: 0.8299
Epoch 5/23
300/300 - 73s - loss: 0.4076 - accuracy: 0.8186 - val_loss: 0.4016 - val_accuracy: 0.8346
Epoch 6/23
300/300 - 73s - loss: 0.3916 - accuracy: 0.8279 - val_loss: 0.3899 - val_accuracy: 0.8386
Epoch 7/23
300/300 - 73s - loss: 0.3784 - accuracy: 0.8362 - val_loss: 0.3762 - val_accuracy: 0.8419
Epoch 8/23
300/300 - 73s - loss: 0.3670 - accuracy: 0.8419 - val_loss: 0.3696 - val_accuracy: 0.8452
Epoch 9/23
300/300 - 73s - loss: 0.3569 - accuracy: 0.8475 - val_loss: 0.3635 - val_accuracy: 0.8460
Epoch 10/23
300/300 - 73s - loss: 0.3489 - accuracy: 0.8515 - val_loss: 0.3565 - val_accura

<tensorflow.python.keras.callbacks.History at 0x7fb84266d8b0>