## 기본 모듈 불러오기

In [5]:
import pandas as pd
import numpy as np
import re

## 데이터 불러오기

In [59]:
train = pd.read_csv("./open/train.csv")
test = pd.read_csv("./open/test.csv")

In [60]:
train = train.sort_values(["label"], ascending=[True])[70000:]

## tensorflow 모델 불러오기

In [61]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GRU, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

## train 데이터 만들기

In [62]:
from konlpy.tag import Mecab
mecab = Mecab()

In [63]:
def pos(x):
    try:
        text = ''
        for word, pos in mecab.pos(str(x)):
            if pos in ["NNG", "NNP", "VV", "VA", "VCP", "VCN", "MM", "MAG", "XPN", "SL", "SH"]:
                if type(re.search("\W+|[0-9]", word))!=re.Match:
                    text+=" "+word
        return text.strip()
    
    except:
        print(f"error {x}")

train["과제명"] = train["과제명"].apply(pos)
train["요약문_연구목표"] = train["요약문_연구목표"].apply(pos)
train["요약문_연구내용"] = train["요약문_연구내용"].apply(pos)
train["요약문_기대효과"] = train["요약문_기대효과"].apply(pos)
train["요약문_한글키워드"] = train["요약문_한글키워드"].apply(pos)

test["과제명"] = test["과제명"].apply(pos)
test["요약문_연구목표"] = test["요약문_연구목표"].apply(pos)
test["요약문_연구내용"] = test["요약문_연구내용"].apply(pos)
test["요약문_기대효과"] = test["요약문_기대효과"].apply(pos)
test["요약문_한글키워드"] = test["요약문_한글키워드"].apply(pos)

In [64]:
train["concat"] = train["과제명"]+" "+train["요약문_연구목표"]+" "+train["요약문_한글키워드"]
test["concat"] = test["과제명"]+" "+test["요약문_연구목표"]+" "+test["요약문_한글키워드"]

In [65]:
train_data = list(train["concat"])
test_data = list(test["concat"])

In [66]:
leng = [len(x) for x in train_data]

In [67]:
from collections import Counter
length = dict(Counter(leng))

In [68]:
length = sorted(length.items(), key=lambda x : x[0])

In [69]:
length

[(11, 13),
 (12, 13),
 (13, 10),
 (14, 17),
 (15, 9),
 (16, 32),
 (17, 26),
 (18, 27),
 (19, 39),
 (20, 413),
 (21, 35),
 (22, 45),
 (23, 44),
 (24, 68),
 (25, 82),
 (26, 71),
 (27, 52),
 (28, 89),
 (29, 111),
 (30, 76),
 (31, 80),
 (32, 94),
 (33, 70),
 (34, 65),
 (35, 82),
 (36, 42),
 (37, 48),
 (38, 42),
 (39, 34),
 (40, 29),
 (41, 25),
 (42, 29),
 (43, 28),
 (44, 8),
 (45, 30),
 (46, 7),
 (47, 18),
 (48, 14),
 (49, 13),
 (50, 25),
 (51, 32),
 (52, 11),
 (53, 17),
 (54, 32),
 (55, 33),
 (56, 22),
 (57, 18),
 (58, 27),
 (59, 22),
 (60, 31),
 (61, 47),
 (62, 55),
 (63, 124),
 (64, 178),
 (65, 77),
 (66, 117),
 (67, 133),
 (68, 93),
 (69, 116),
 (70, 88),
 (71, 113),
 (72, 107),
 (73, 129),
 (74, 150),
 (75, 119),
 (76, 138),
 (77, 171),
 (78, 166),
 (79, 166),
 (80, 161),
 (81, 183),
 (82, 159),
 (83, 186),
 (84, 164),
 (85, 252),
 (86, 230),
 (87, 206),
 (88, 231),
 (89, 211),
 (90, 221),
 (91, 209),
 (92, 202),
 (93, 231),
 (94, 226),
 (95, 249),
 (96, 246),
 (97, 267),
 (98, 245),


In [70]:
#텐서플로의 전처리 모듈을 활용해 토크나이징 객체를 만든 후 인덱스 벡터로 전환
tokenizer=Tokenizer()
tokenizer.fit_on_texts(train_data)

train_sequences=tokenizer.texts_to_sequences(train_data)
test_sequences=tokenizer.texts_to_sequences(test_data)
word_vocab=tokenizer.word_index

#패딩 처리
train_inputs=pad_sequences(train_sequences, maxlen=700, padding='post')
test_inputs=pad_sequences(test_sequences, maxlen=700, padding='post')

In [71]:
len(word_vocab)

88811

In [72]:
print(train_inputs.shape)
print(test_inputs.shape)

(104304, 700)
(43576, 700)


In [73]:
import numpy as np
labels = np.array(train['label'].append(train['label']).append(train['label']).append(train['label']).append(train['label']))

import pickle

with open("dataset.pkl", 'wb') as f:
    pickle.dump(train_inputs, f)
    pickle.dump(test_inputs, f)
    pickle.dump(labels, f)

import pickle
with open("dataset.pkl", 'rb') as f:
    train_inputs = pickle.load(f)
    test_inputs = pickle.load(f)
    labels = pickle.load(f)

## GRU 모델 학습 (optimizer = nadam / acivation = relu / softmax)

In [None]:
model = Sequential()
model.add(Embedding(len(word_vocab)+1, 256, input_length=700))
model.add(GRU(units=256, activation="relu", return_sequences=True))
model.add(Dropout(0.2))
model.add(GRU(units=128, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(46, activation='softmax'))
model.compile(optimizer="nadam", loss="sparse_categorical_crossentropy", metrics=["acc"])
history = model.fit(train_inputs, labels, epochs=50, batch_size=256, validation_split=0.2, verbose=2)

Epoch 1/50
326/326 - 1037s - loss: 0.8276 - acc: 0.8650 - val_loss: 8.3094 - val_acc: 0.0000e+00
Epoch 2/50
326/326 - 1030s - loss: nan - acc: 0.8697 - val_loss: nan - val_acc: 0.0000e+00
Epoch 3/50
326/326 - 1035s - loss: nan - acc: 0.8697 - val_loss: nan - val_acc: 0.0000e+00
Epoch 4/50
326/326 - 1037s - loss: nan - acc: 0.8697 - val_loss: nan - val_acc: 0.0000e+00
Epoch 5/50
326/326 - 1033s - loss: nan - acc: 0.8697 - val_loss: nan - val_acc: 0.0000e+00
Epoch 6/50
326/326 - 1028s - loss: nan - acc: 0.8697 - val_loss: nan - val_acc: 0.0000e+00
Epoch 7/50
326/326 - 1027s - loss: nan - acc: 0.8697 - val_loss: nan - val_acc: 0.0000e+00
Epoch 8/50
326/326 - 1032s - loss: nan - acc: 0.8697 - val_loss: nan - val_acc: 0.0000e+00
Epoch 9/50
326/326 - 1033s - loss: nan - acc: 0.8697 - val_loss: nan - val_acc: 0.0000e+00
Epoch 10/50
326/326 - 1039s - loss: nan - acc: 0.8697 - val_loss: nan - val_acc: 0.0000e+00
Epoch 11/50
326/326 - 1032s - loss: nan - acc: 0.8697 - val_loss: nan - val_acc: 0.

## 모델 저장

from keras.models import load_model

model.save('GRU_model_concat.h5')

## 모델 예측

In [None]:
answer = model.predict(test_inputs)

In [None]:
import tensorflow as tf

In [None]:
pred=tf.argmax(answer, axis=1)

In [None]:
sum(pred!=0)

In [None]:
sample_submission=pd.read_csv('./open/sample_submission.csv')
sample_submission['label']=pred
sample_submission.to_csv('GRU_concat_0drop.csv', index=False)