In [1]:
import numpy as np
import os
import preprocessing 

import tensorflow as tf
from tensorflow import keras
import time
import re

train = open('../DATA/cnnpaper/ratings_train.txt', 'r')
train = train.readlines()
test = open('../DATA/cnnpaper/ratings_test.txt', 'r')
test = test.readlines()

train = [line.strip().split('\t') for line in train]
test = [line.strip().split('\t') for line in test]

train_df = pd.DataFrame(train[1:], columns=train[0])
test_df = pd.DataFrame(test[1:], columns=test[0])

spacing = Spacing()
train_df['document'] = [spacing(str(sent)) for sent in train_df['document']]
test_df['document'] = [spacing(str(sent)) for sent in test_df['document']]

train_df.to_csv('train_spacing.csv')
test_df.to_csv('test_spacing.csv')

In [2]:
import pandas as pd
train_dataset = pd.read_csv('../DATA/train_spacing.csv')
test_dataset = pd.read_csv('../DATA/test_spacing.csv')

In [3]:
from konlpy.tag import Mecab
mecab = Mecab()

In [4]:
%%time
def pos(x):
    try:
        text = ''
        for word, pos in mecab.pos(str(x)):
            if pos[0] not in ['J','I','E']:
                if type(re.search("\W+|[0-9]", word))!=re.Match: 
                    # and len(word)!=1:
                    text+=" "+word
        return text.strip()
    
    except:
        pass

train_dataset["pos"] = train_dataset["document"].apply(pos)
test_dataset["pos"] = test_dataset["document"].apply(pos)

CPU times: user 12 s, sys: 37.4 ms, total: 12 s
Wall time: 12 s


In [5]:
vocab_size = [] 

for line in train_dataset['pos']:
    vocab_size.extend(str(line).split())
vocab_size = len(set(vocab_size))

In [6]:
# tokenizing
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(oov_token='<oov>')
tokenizer.fit_on_texts(train_dataset['pos'])
word_index = tokenizer.word_index
vocabulary_inv = tokenizer.index_word

# padding
from tensorflow.keras.preprocessing.sequence import pad_sequences
train_seq = tokenizer.texts_to_sequences(train_dataset['pos'])
test_seq = tokenizer.texts_to_sequences(test_dataset['pos'])
train_pad = pad_sequences(train_seq, maxlen=40, padding='pre', truncating='pre')
test_pad = pad_sequences(test_seq, maxlen=40, padding='pre', truncating='pre')

In [7]:
embedding_dim = 200
filter_sizes = (2, 3, 4, 5)
num_filters = 100
dropout = 0.5
hidden_dims = 100

batch_size = 50
num_epochs = 10
min_word_count = 1
context = 10

In [8]:
from gensim.models import word2vec, fasttext

In [9]:
from gensim.models import Word2Vec, FastText

In [10]:
embedding_model = word2vec.Word2Vec.load("../DATA/ko.bin")

In [11]:
vocabulary_inv.update({0:'pad'})

In [12]:
same_variance = np.var(embedding_model.syn1neg)

  same_variance = np.var(embedding_model.syn1neg)


In [13]:
embedding_weights = {key: embedding_model[word] if word in embedding_model else np.random.uniform(-same_variance, same_variance, embedding_model.vector_size) for key, word in vocabulary_inv.items()}

  embedding_weights = {key: embedding_model[word] if word in embedding_model else np.random.uniform(-same_variance, same_variance, embedding_model.vector_size) for key, word in vocabulary_inv.items()}
  embedding_weights = {key: embedding_model[word] if word in embedding_model else np.random.uniform(-same_variance, same_variance, embedding_model.vector_size) for key, word in vocabulary_inv.items()}


In [37]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Flatten, Dropout
from tensorflow.keras import layers, models

In [38]:
tf.random.set_seed(2021)
np.random.seed(2021)

In [51]:
# Convolutional block
input_shape=(40, )
conv_blocks = []

model_input = keras.layers.Input(shape=input_shape)

z = keras.layers.Embedding(len(word_index)+1, embedding_dim, input_length=len(train_dataset['label']), name="embedding")(model_input)
z = keras.layers.Dropout(dropout)(z)
z.shape

TensorShape([None, 40, 200])

In [52]:
for sz in filter_sizes:
    conv = keras.layers.Conv1D(filters=num_filters,
                         kernel_size=sz,
                         padding="Same",
                         activation="relu",
                         strides=1)(z)
    conv = keras.layers.MaxPooling1D(pool_size=2)(conv)
    conv = keras.layers.Flatten()(conv)
    conv_blocks.append(conv)
    
z = keras.layers.Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
z = keras.layers.Dropout(dropout)(z)
z = keras.layers.Dense(512, activation="relu")(z)
z = keras.layers.Dropout(dropout)(z)
z = keras.layers.Dense(256, activation="relu")(z)
z = keras.layers.Dropout(dropout)(z)
# z = keras.layers.Dense(128, activation="relu")(z)
# z = keras.layers.Dropout(dropout)(z)
model_output = keras.layers.Dense(1, activation="sigmoid")(z)

model = keras.Model(model_input, model_output)

In [53]:
model.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])

In [54]:
model.summary()

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, 40)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 40, 200)      9284600     input_6[0][0]                    
__________________________________________________________________________________________________
dropout_22 (Dropout)            (None, 40, 200)      0           embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_20 (Conv1D)              (None, 40, 100)      40100       dropout_22[0][0]                 
____________________________________________________________________________________________

In [55]:
weights = np.array([v for v in embedding_weights.values()])
print("Initializing embedding layer with word2vec weights, shape", weights.shape)
embedding_layer = model.get_layer("embedding")
embedding_layer.set_weights([weights])

Initializing embedding layer with word2vec weights, shape (46423, 200)


In [None]:
model.fit(train_pad, train_dataset['label'], batch_size=500, epochs=23, validation_data=(test_pad, test_dataset['label']),verbose=2)

Epoch 1/23
300/300 - 84s - loss: 0.7046 - accuracy: 0.5620 - val_loss: 0.5666 - val_accuracy: 0.7205
Epoch 2/23
300/300 - 81s - loss: 0.5448 - accuracy: 0.7236 - val_loss: 0.4799 - val_accuracy: 0.7969
Epoch 3/23
300/300 - 83s - loss: 0.4667 - accuracy: 0.7800 - val_loss: 0.4603 - val_accuracy: 0.8197
Epoch 4/23
300/300 - 83s - loss: 0.4289 - accuracy: 0.8041 - val_loss: 0.4247 - val_accuracy: 0.8282
Epoch 5/23
300/300 - 84s - loss: 0.4084 - accuracy: 0.8173 - val_loss: 0.4252 - val_accuracy: 0.8336
Epoch 6/23
300/300 - 84s - loss: 0.3914 - accuracy: 0.8268 - val_loss: 0.4066 - val_accuracy: 0.8370
Epoch 7/23
300/300 - 81s - loss: 0.3787 - accuracy: 0.8342 - val_loss: 0.4007 - val_accuracy: 0.8411
Epoch 8/23
300/300 - 83s - loss: 0.3674 - accuracy: 0.8405 - val_loss: 0.3940 - val_accuracy: 0.8454
Epoch 9/23
300/300 - 82s - loss: 0.3579 - accuracy: 0.8455 - val_loss: 0.3929 - val_accuracy: 0.8458
Epoch 10/23
300/300 - 81s - loss: 0.3483 - accuracy: 0.8504 - val_loss: 0.3727 - val_accura