In [41]:
!pip install soynlp



In [42]:
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from konlpy.tag import Okt, Mecab
from soynlp.word import WordExtractor

# 데이터
- 전처리 및 맞춤법 검사가 된 문장

In [110]:
data = pd.read_csv('/aiffel/train_10000_mk2.csv')
data.head()

Unnamed: 0,non_label_sentence,label_sentence,class,binary_class
0,해병대는 자기가 지원해서 가는 거잖아\n응응 시험 같은 것도 있지 않아\n어떻게 해...,1:해병대는 자기가 지원해서 가는 거잖아\n2:응응 시험 같은 것도 있지 않아\n1...,일반 대화,일반 대화
1,오늘 서울은 하루종일 꾸물하다 날씨가\n요즘 날씨가 계속 꾸물하고 비오고\n언제 선...,1:오늘 서울은 하루종일 꾸물하다 날씨가\n2:요즘 날씨가 계속 꾸물하고 비오고\n...,일반 대화,일반 대화
2,남자들은 전립선 비대증이 큰일이에요\n진작에 관리 안 한 걸 후회한다\n치료도 잘 ...,1:남자들은 전립선 비대증이 큰일이에요\n2:진작에 관리 안 한 걸 후회한다\n1:...,일반 대화,일반 대화
3,안녕\n나는 이 날씨에 서핑하러 간다\n와 이제 곧 겨울인데 서핑을 해\n당연히 제...,1:안녕\n1:나는 이 날씨에 서핑하러 간다\n2:와 이제 곧 겨울인데 서핑을 해\...,일반 대화,일반 대화
4,넌 몇살에 결혼 하고 싶어\n35살 난 최대한 즐기고 결혼 할 거야\n그럼 애기는 ...,1:넌 몇살에 결혼 하고 싶어\n2:35살 난 최대한 즐기고 결혼 할 거야\n1:그...,일반 대화,일반 대화


In [111]:
sentences = data.non_label_sentence
sentences = [sen for sen in sentences]

from soynlp.tokenizer import LTokenizer
vocab_size = 30000

word_extractor = WordExtractor(
    min_frequency=100, # example
    min_cohesion_forward=0.05,
    min_right_branching_entropy=0.0
)

word_extractor.train(sentences)
words = word_extractor.extract()

cohesion_score = {word:score.cohesion_forward for word, score in words.items()}
tokenizer = LTokenizer(scores=cohesion_score)

sentences = [tokenizer.tokenize(sen) for sen in sentences]
tokenizer_tf = Tokenizer(num_words=vocab_size)
tokenizer_tf.fit_on_texts(sentences)
word_dic = tokenizer_tf.word_index
sequences = tokenizer_tf.texts_to_sequences(sentences)
padded = pad_sequences(sequences)
np.shape(padded)

training was done. used memory 3.252 Gbory 3.144 Gb
all cohesion probabilities was computed. # words = 2803
all branching entropies was computed # words = 58000
all accessor variety was computed # words = 58000


(13870, 195)

In [112]:
len(word_dic)

126458

In [113]:
train_data = padded
train_label = data['class']
print(len(train_data), len(train_label))

13870 13870


In [114]:
labels = {'직장 내 괴롭힘 대화': 2, '기타 괴롭힘 대화': 3, '갈취 대화': 1, '협박 대화': 0, '일반 대화': 4}
train_label = train_label.apply(lambda x: labels[x])
train_label = pd.get_dummies(train_label)

from sklearn.model_selection import train_test_split

train_X, test_X, train_Y, test_Y = train_test_split(train_data, train_label, test_size=0.2, random_state=22)
valid_X, test_X, valid_Y, test_Y = train_test_split(test_X, test_Y, test_size=0.5, random_state=22)

print(len(train_X), len(valid_X), len(test_X))
print(len(train_Y), len(valid_Y), len(test_Y))

11096 1387 1387
11096 1387 1387


# 모델 
- LSTM 사용

In [115]:
word_vector_dim = 1024
labels_size = len(labels)
hidden_size = 128

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))
model.add(tf.keras.layers.LSTM(hidden_size, return_sequences=True))
model.add(tf.keras.layers.LSTM(hidden_size, return_sequences=True))
model.add(tf.keras.layers.LSTM(hidden_size//2))
model.add(tf.keras.layers.Dense(vocab_size, activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(labels_size, activation='softmax'))

model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 1024)        30720000  
_________________________________________________________________
lstm_18 (LSTM)               (None, None, 128)         590336    
_________________________________________________________________
lstm_19 (LSTM)               (None, None, 128)         131584    
_________________________________________________________________
lstm_20 (LSTM)               (None, 64)                49408     
_________________________________________________________________
dense_12 (Dense)             (None, 30000)             1950000   
_________________________________________________________________
dropout_6 (Dropout)          (None, 30000)             0         
_________________________________________________________________
dense_13 (Dense)             (None, 5)                

In [None]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
              
epochs=10

history = model.fit(train_X,
                    train_Y,
                    epochs=epochs,
                    batch_size=256,
                    validation_data=(valid_X, valid_Y),
                    verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10

In [None]:
test_file_path = '/aiffel/test.json'
with open(test_file_path, mode='rt', encoding='utf-8') as f:
    test_dataset = pd.read_json(f)
    
test_data = test_dataset.transpose()
test_data

In [None]:
sentences = [sen for sen in test_data['text']]
sentences = [tokenizer.tokenize(sen) for sen in sentences]
sequences = tokenizer_tf.texts_to_sequences(sentences)
padded = pad_sequences(sequences)
np.shape(padded)

In [None]:
pred = model.predict(padded)

from sklearn.metrics import classification_report

pred_label = []

for i in range(len(pred)):
    pred_label.append((str)(np.argmax(pred[i])))

pred_label[:10]

In [None]:
label_change = {'0':'00', '1':'01', '2':'02', '3':'03', '4':'04'}

sub_label = []

for pre in pred_label:
    sub_label.append(label_change[pre])

sub_label[:10]

In [None]:
submission = test_data.assign(CLASS=sub_label)
submission = submission.rename(columns={'CLASS':'class'})

In [None]:
submission.head()

In [None]:
submission.drop(['text'], axis=1, inplace=True)
submission = submission.transpose()
submission

In [None]:
import json

submission_file_path = '/aiffel/submission_LSTM_10000'
result = submission.to_json(submission_file_path)

with open(submission_file_path) as f:
    parsed = json.load(f)

with open(submission_file_path, 'w') as f:
    json.dump(parsed, f, indent=4)