In [90]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [95]:
data = pd.read_csv('/aiffel/aiffel/aiffelthon/train_normalization.csv')
data.rename(columns={'non_label_sentence':'sentence'}, inplace=True)
data.head()

Unnamed: 0,class,label_sentence,sentence,binary_class
0,직장 내 괴롭힘 대화,1:길동 씨 이번에 이것 좀 처리해요\n2:이거 제가 한 게 아닌데요\n1:팀에서 ...,길동 씨 이번에 이것 좀 처리해요\n이거 제가 한 게 아닌데요\n팀에서 내가 네가 ...,공격 대화
1,기타 괴롭힘 대화,1:야 너 앞니 왜 그렇게 튀어나왔냐\n2:태어날 때부터 그랬어 물어보지 마\n1:...,야 너 앞니 왜 그렇게 튀어나왔냐\n태어날 때부터 그랬어 물어보지 마\n아 그럼 태...,공격 대화
2,갈취 대화,1:원후야 학원 교재 사야 되는데 8만 원만 주면 안 돼\n2:미안 나 오늘은 진짜...,원후야 학원 교재 사야 되는데 8만 원만 주면 안 돼\n미안 나 오늘은 진짜 돈이 ...,공격 대화
3,갈취 대화,1:너 저번에 술 먹은 날 기억해\n2:아니 왜\n1:야 300만 원만 가져와\n2...,너 저번에 술 먹은 날 기억해\n아니 왜\n야 300만 원만 가져와\n뭔 소리야 ...,공격 대화
4,협박 대화,1:너 또 내말 무시하냐\n1:이 새끼 널 좆으로 보나 본데\n2:아냐 진짜 시간이...,너 또 내말 무시하냐\n이 새끼 널 좆으로 보나 본데\n아냐 진짜 시간이 없었어 미...,공격 대화


In [106]:
def random_combi(origin, aug1, aug2, aug3, normal, nor_num):
    """
    input : dataframe * 4, normal_conversation_num
    output : concated_dataframe
    """
    choice_num = int(len(origin)*0.33)
    nor_num = nor_num
    
    ori = origin.copy()
    aug1 = aug1.copy()
    aug2 = aug2.copy()
    aug3 = aug3.copy()
    nor = normal.copy()
    
    from random import sample
    sampled_aug1, sampled_aug2, sampled_aug3, sampled_nor = \
    aug1.sample(choice_num), aug2.sample(choice_num), aug3.sample(choice_num), nor.sample(nor_num)
    
    total = pd.concat([ori, sampled_aug1, sampled_aug2, sampled_aug3, sampled_nor], axis=0)
    
    return total

In [107]:
def generate_input(Input):
    """
    input : dataframe
    output : train_x, train_y
    """
    data = Input
    sentences = data['sentence']
    sentences = [sen for sen in sentences]

    from soynlp.tokenizer import LTokenizer
    vocab_size = 30000
    
    from soynlp.word import WordExtractor
    word_extractor = WordExtractor(
        min_frequency=100, 
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )

    word_extractor.train(sentences)
    words = word_extractor.extract()

    cohesion_score = {word:score.cohesion_forward for word, score in words.items()}
    tokenizer = LTokenizer(scores=cohesion_score)
    
    from tensorflow.keras.preprocessing.text import Tokenizer
    tokenizer_tf = Tokenizer(num_words=vocab_size)
    
    tokenizer_tf.fit_on_texts(sentences)
    sequences = tokenizer_tf.texts_to_sequences(sentences)
    
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    train_data = pad_sequences(sequences)
    
    train_label = data['class']
    labels = {'협박 대화': 0, '갈취 대화': 1, '직장 내 괴롭힘 대화': 2, '기타 괴롭힘 대화': 3, '일반 대화': 4}
    train_label = train_label.apply(lambda x: labels[x])
    train_label = pd.get_dummies(train_label)
    
    print('train_x_shape :',train_data.shape)
    print('train_y_length :',len(train_label))
    
    return train_data, train_label

In [108]:
total = random_combi(data, data, data, data, data, 1000)
x, y = generate_input(total)

training was done. used memory 0.563 Gbry 0.549 Gb
all cohesion probabilities was computed. # words = 1511
all branching entropies was computed # words = 71544
all accessor variety was computed # words = 71544
train_x_shape : (8701, 177)
train_y_length : 8701


In [109]:
for _ in range(5):
    total = random_combi(data, data, data, data, data, 1000)
    x, y = generate_input(total)

training was done. used memory 0.588 Gbry 0.569 Gb
all cohesion probabilities was computed. # words = 1515
all branching entropies was computed # words = 70478
all accessor variety was computed # words = 70478
train_x_shape : (8701, 177)
train_y_length : 8701
[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0    