In [1]:
import numpy as np
import os
import preprocessing 

import tensorflow as tf
from tensorflow import keras
import time
import re

In [2]:
import pandas as pd
train_dataset = pd.read_csv('../DATA/train_spacing.csv')
test_dataset = pd.read_csv('../DATA/test_spacing.csv')

In [3]:
from konlpy.tag import Mecab
mecab = Mecab()

In [4]:
%%time
def pos(x):
    try:
        text = ''
        for word, pos in mecab.pos(str(x)):
            if pos[0] not in ['J','I','E']:
                if type(re.search("\W+|[0-9]", word))!=re.Match: 
                    # and len(word)!=1:
                    text+=" "+word
        return text.strip()
    
    except:
        pass

train_dataset["pos"] = train_dataset["document"].apply(pos)
test_dataset["pos"] = test_dataset["document"].apply(pos)

CPU times: user 12.1 s, sys: 42.5 ms, total: 12.2 s
Wall time: 12.2 s


In [5]:
vocab_size = [] 

for line in train_dataset['pos']:
    vocab_size.extend(str(line).split())
vocab_size = len(set(vocab_size))

In [6]:
# tokenizing
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(oov_token='<oov>')
tokenizer.fit_on_texts(train_dataset['pos'])
word_index = tokenizer.word_index
vocabulary_inv = tokenizer.index_word

# padding
from tensorflow.keras.preprocessing.sequence import pad_sequences
train_seq = tokenizer.texts_to_sequences(train_dataset['pos'])
test_seq = tokenizer.texts_to_sequences(test_dataset['pos'])
train_pad = pad_sequences(train_seq, maxlen=40, padding='pre', truncating='pre')
test_pad = pad_sequences(test_seq, maxlen=40, padding='pre', truncating='pre')

In [7]:
embedding_dim = 200
filter_sizes = (2, 3, 4, 5)
num_filters = 100
dropout = 0.5
hidden_dims = 100

batch_size = 50
num_epochs = 10
min_word_count = 1
context = 10

In [8]:
from gensim.models import word2vec, fasttext

In [9]:
from gensim.models import Word2Vec, FastText

In [10]:
embedding_model = word2vec.Word2Vec.load("../DATA/ko.bin")

In [11]:
vocabulary_inv.update({0:'pad'})

In [12]:
same_variance = np.var(embedding_model.syn1neg)

  same_variance = np.var(embedding_model.syn1neg)


In [13]:
embedding_weights = {key: embedding_model[word] if word in embedding_model else np.random.uniform(-same_variance, same_variance, embedding_model.vector_size) for key, word in vocabulary_inv.items()}

  embedding_weights = {key: embedding_model[word] if word in embedding_model else np.random.uniform(-same_variance, same_variance, embedding_model.vector_size) for key, word in vocabulary_inv.items()}
  embedding_weights = {key: embedding_model[word] if word in embedding_model else np.random.uniform(-same_variance, same_variance, embedding_model.vector_size) for key, word in vocabulary_inv.items()}


In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Flatten, Dropout
from tensorflow.keras import layers, models

In [15]:
tf.random.set_seed(2021)
np.random.seed(2021)

In [16]:
# Convolutional block
input_shape=(40, )
conv_blocks = []

model_input = keras.layers.Input(shape=input_shape)

z = keras.layers.Embedding(len(word_index)+1, embedding_dim, input_length=len(train_dataset['label']), name="embedding")(model_input)
z = keras.layers.Dropout(dropout)(z)
z.shape

2021-08-02 02:34:24.642822: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


TensorShape([None, 40, 200])

In [17]:
for sz in filter_sizes:
    conv = keras.layers.Conv1D(filters=num_filters,
                         kernel_size=sz,
                         padding="Same",
                         activation="relu",
                         strides=1)(z)
    conv = keras.layers.MaxPooling1D(pool_size=2)(conv)
    conv = keras.layers.Flatten()(conv)
    conv_blocks.append(conv)
    
z = keras.layers.Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
z = keras.layers.Dropout(dropout)(z)
z = keras.layers.Dense(512, activation="relu")(z)
z = keras.layers.Dropout(dropout)(z)
z = keras.layers.Dense(256, activation="relu")(z)
z = keras.layers.Dropout(dropout)(z)
z = keras.layers.Dense(128, activation="relu")(z)
z = keras.layers.Dropout(dropout)(z)
model_output = keras.layers.Dense(1, activation="sigmoid")(z)

model = keras.Model(model_input, model_output)

In [18]:
model.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])

In [19]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 40)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 40, 200)      9284600     input_1[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 40, 200)      0           embedding[0][0]                  
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 40, 100)      40100       dropout[0][0]                    
______________________________________________________________________________________________

In [20]:
weights = np.array([v for v in embedding_weights.values()])
print("Initializing embedding layer with word2vec weights, shape", weights.shape)
embedding_layer = model.get_layer("embedding")
embedding_layer.set_weights([weights])

Initializing embedding layer with word2vec weights, shape (46423, 200)


In [None]:
model.fit(train_pad, train_dataset['label'], batch_size=500, epochs=22, validation_data=(test_pad, test_dataset['label']),verbose=2)

2021-08-02 02:34:32.193622: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/22
300/300 - 76s - loss: 0.7167 - accuracy: 0.5094 - val_loss: 0.6669 - val_accuracy: 0.5986
Epoch 2/22
300/300 - 75s - loss: 0.6155 - accuracy: 0.6492 - val_loss: 0.5332 - val_accuracy: 0.7788
Epoch 3/22
300/300 - 74s - loss: 0.5049 - accuracy: 0.7600 - val_loss: 0.4781 - val_accuracy: 0.8111
Epoch 4/22
