## 기본 모듈 불러오기

In [1]:
import pandas as pd
import numpy as np
import re

## 데이터 불러오기

In [2]:
# Read the train and test tables from the local ./open directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("./open/train.csv", "./open/test.csv")
)

## tensorflow 모델 불러오기

In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GRU, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

## train 데이터 만들기

In [4]:
from konlpy.tag import Mecab
# Create the MeCab tagger once at module level; pos() reads this global on every call.
mecab = Mecab()

In [6]:
# A token is dropped when it contains any of: an ASCII letter, an ASCII digit,
# an underscore, or a non-word character (punctuation, whitespace, symbols).
# The underscore is listed explicitly because `\w` would otherwise accept it.
_REJECTED_CHARS = re.compile(r"\W|[A-Za-z0-9_]")

# Only tokens whose tag starts with one of these letters are kept.
# NOTE(review): in the mecab-ko tag set these look like the noun, predicate,
# modifier, ending and affix families -- confirm against the tag table.
_KEPT_TAG_PREFIXES = ("N", "V", "M", "E", "X")


def pos(x):
    """Return the filtered morphemes of *x* joined by single spaces.

    *x* is converted with ``str()`` and tagged by the module-level ``mecab``
    object. A morpheme is kept when its tag starts with one of
    ``_KEPT_TAG_PREFIXES``, its length is not exactly one character, and it
    contains none of the characters matched by ``_REJECTED_CHARS``.

    Args:
        x: Any value; typically a cell of a text column.

    Returns:
        str: The kept morphemes separated by spaces. An empty string is
        returned when nothing is kept or when tagging fails, so callers always
        get a string (never ``None``) and can safely call ``len()`` on it or
        feed it to a tokenizer.
    """
    try:
        tagged = mecab.pos(str(x))
    except Exception:
        # Best effort: report the offending value and fall back to "no tokens"
        # instead of silently returning None.
        print(f"error {x}")
        return ""

    return " ".join(
        word
        for word, tag in tagged
        if tag.startswith(_KEPT_TAG_PREFIXES)
        and len(word) != 1
        and _REJECTED_CHARS.search(word) is None
    )

# Replace each free-text column, first in train and then in test, with the
# space-separated tokens returned by pos().
for frame in (train, test):
    for column in (
        "과제명",
        "요약문_연구목표",
        "요약문_연구내용",
        "요약문_기대효과",
        "요약문_한글키워드",
    ):
        frame[column] = frame[column].apply(pos)

In [47]:
# Stack the five text columns end-to-end into one long Series per split.
# pd.concat is used because Series.append no longer exists in pandas >= 2.0.
# With ignore_index=True, row i of the k-th column listed here ends up at
# position k * len(frame) + i, so the column order below is significant.
text_columns = [
    "과제명",
    "요약문_연구목표",
    "요약문_연구내용",
    "요약문_기대효과",
    "요약문_한글키워드",
]
train_data = pd.concat([train[column] for column in text_columns], ignore_index=True)
test_data = pd.concat([test[column] for column in text_columns], ignore_index=True)

In [48]:
# Materialise both stacked text collections as plain Python lists.
train_data, test_data = [*train_data], [*test_data]

In [49]:
# Length of every entry in train_data, in the same order.
leng = list(map(len, train_data))

In [50]:
from collections import Counter

# Histogram of the values in leng: {value: number of occurrences}.
length = {size: count for size, count in Counter(leng).items()}

In [51]:
# Turn the histogram into (value, count) pairs ordered by ascending value.
length = sorted(
    length.items(),
    key=lambda pair: pair[0],
)

In [56]:
# Build the vocabulary from the training texts only, then use that same
# tokenizer to turn both splits into sequences of integer word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data)
word_vocab = tokenizer.word_index

train_sequences, test_sequences = map(
    tokenizer.texts_to_sequences, (train_data, test_data)
)

# Bring every sequence to a fixed length of 1000, padding at the end ('post').
pad_kwargs = dict(maxlen=1000, padding='post')
train_inputs = pad_sequences(train_sequences, **pad_kwargs)
test_inputs = pad_sequences(test_sequences, **pad_kwargs)

In [57]:
# Number of entries in the tokenizer's word index built from train_data.
len(word_vocab)

82664

In [58]:
# Show the (rows, sequence length) shape of each padded input matrix.
for padded in (train_inputs, test_inputs):
    print(padded.shape)

(871520, 1000)
(217880, 1000)


In [59]:
import numpy as np

# The training texts are five columns stacked end-to-end, so the label vector
# is repeated five times to stay aligned with them. np.tile replaces the
# chained Series.append calls, which no longer exist in pandas >= 2.0.
labels = np.tile(train['label'].to_numpy(), 5)

import pickle

# Cache the prepared arrays as three consecutive pickles in a single file.
# Whoever reads the file back must load them in this same order.
with open("dataset.pkl", 'wb') as f:
    for payload in (train_inputs, test_inputs, labels):
        pickle.dump(payload, f)

import pickle

# Restore the three cached arrays in the order in which they were written.
# Only load pickle files you created yourself: unpickling can run arbitrary code.
with open("dataset.pkl", 'rb') as f:
    train_inputs, test_inputs, labels = [pickle.load(f) for _ in range(3)]

## GRU 모델 학습 (optimizer = nadam / activation = relu / softmax)

In [None]:
# Embedding -> two stacked GRUs (each followed by dropout) -> 46-way softmax.
# The first GRU returns the full sequence so the second GRU can consume it.
model = Sequential([
    Embedding(len(word_vocab)+1, 512, input_length=1000),  # +1: index 0 is the padding value
    GRU(units=512, activation="relu", return_sequences=True),
    Dropout(0.2),
    GRU(units=256, activation="relu"),
    Dropout(0.2),
    Dense(46, activation='softmax'),
])

# Integer class labels, hence the sparse variant of categorical cross-entropy.
model.compile(
    optimizer="nadam",
    loss="sparse_categorical_crossentropy",
    metrics=["acc"],
)

# NOTE(review): Keras takes the validation_split fraction from the END of the
# arrays before shuffling, so the hold-out set is not a random sample -- confirm
# this is what is wanted for data that was stacked column by column.
history = model.fit(
    train_inputs,
    labels,
    epochs=30,
    batch_size=256,
    validation_split=0.2,
    verbose=2,
)

Epoch 1/30


## 모델 저장

# NOTE(review): load_model is not used in the visible cells, and it is imported
# from the standalone `keras` package while the rest of the file imports from
# `tensorflow.keras` -- confirm which one is intended.
from keras.models import load_model

# Write the trained model to an HDF5 (.h5) file in the working directory.
model.save('GRU_model_concat.h5')

## 모델 예측

In [None]:
# Softmax output of the final Dense(46) layer: one row of 46 class scores per row of test_inputs.
answer = model.predict(test_inputs)

In [None]:
import tensorflow as tf

In [None]:
# Index of the highest-scoring class for each row of `answer`.
pred=tf.argmax(answer, axis=1)

In [None]:
# Inspect the first 500 predicted class indices.
pred[:500]

In [None]:
sample_submission = pd.read_csv('./open/sample_submission.csv')

# `answer` holds one probability row per (text column, test row) pair because
# the test texts were stacked column by column, so it is several times longer
# than the submission table. Assigning the raw per-text argmax would therefore
# fail with a length mismatch. Instead, average the class probabilities of all
# text columns belonging to the same test row, then take the argmax.
# NOTE(review): assumes sample_submission has one row per test.csv row, in the
# same order -- confirm.
n_rows = len(sample_submission)
probs = np.asarray(answer)
if n_rows == 0 or len(probs) % n_rows != 0:
    raise ValueError(
        f"{len(probs)} predictions cannot be grouped into {n_rows} submission rows"
    )

# Stacked position k * n_rows + i belongs to test row i, so this reshape yields
# (text column, test row, class). When the lengths already match, it reduces to
# a plain per-row argmax.
per_column = probs.reshape(-1, n_rows, probs.shape[-1])
sample_submission['label'] = per_column.mean(axis=0).argmax(axis=1)

sample_submission.to_csv('GRU_concat_column.csv', index=False)