# 의도 분류 모델 학습 데이터 생성하기

## 0. Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from konlpy.tag import Komoran
import matplotlib.pyplot as plt

## 1. 데이터 불러오기

In [2]:
# Load each raw corpus and drop every row that has a missing value right away,
# so all later cells only ever see complete rows.
movie = pd.read_csv("../../변형데이터/영화리뷰.csv").dropna()
purpose = pd.read_csv("../../변형데이터/용도별목적대화데이터.csv").dropna()
topic = pd.read_csv("../../변형데이터/주제별일상대화데이터.csv").dropna()
common_sense = pd.read_csv("../../변형데이터/일반상식.csv").dropna()

In [3]:
# Report the (rows, columns) shape of every corpus, one per line.
print(
    f"movie shape => {movie.shape}",
    f"purpose shape => {purpose.shape}",
    f"topic shape => {topic.shape}",
    f"common_sense shape => {common_sense.shape}",
    sep="\n",
)

movie shape => (199992, 3)
purpose shape => (480494, 1)
topic shape => (1445760, 1)
common_sense shape => (68538, 3)


In [4]:
# Inspect the column labels of the movie-review frame.
movie.columns

Index(['id', 'document', 'label'], dtype='object')

In [5]:
# Inspect the column labels of the purpose-dialogue frame.
purpose.columns

Index(['text'], dtype='object')

In [6]:
# Inspect the column labels of the topic-dialogue frame.
topic.columns

Index(['text'], dtype='object')

In [7]:
# Inspect the column labels of the common-sense Q&A frame.
common_sense.columns

Index(['intent', 'query', 'answer'], dtype='object')

In [8]:
# Pool every text column of the four corpora into one flat list of sentences.
# Order: movie reviews, purpose dialogues, topic dialogues, then the
# common-sense questions followed by their answers.
all_data = [
    *movie['document'],
    *purpose['text'],
    *topic['text'],
    *common_sense['query'],
    *common_sense['answer'],
]

In [9]:
# Total number of pooled sentences.
len(all_data)

2263322

In [10]:
# Build the merged corpus as a one-column frame and save it to disk
# (the row index is not written to the file).
total = pd.DataFrame(dict(text=all_data))
total.to_csv(path_or_buf="../../변형데이터/통합본데이터.csv", index=False)

## 2. 의도 분류 데이터 생성하기

0 -> 번호, 1 -> 장소, 2 -> 시간, 3 -> 기타  
기타는 추후에 다시 생성 예정

In [None]:
# One bucket of sentences per intent: number (label 0), place (label 1),
# time (label 2) and everything else (label 3).
# NOTE(review): `time` would shadow the stdlib module if it is ever imported.
number, place, time, etc = [], [], [], []

In [None]:
# Keyword groups that decide the intent of a sentence. A sentence is given the
# intent of the first group, checked in the order place -> number -> time,
# that has at least one keyword occurring in it.
PLACE_KEYWORDS = ('어디', '장소', '위치', '주소')
NUMBER_KEYWORDS = ('번호', '전화')
TIME_KEYWORDS = ('시작', '마감', '언제', '기간', '시간')


def _contains_any(text, keywords):
    """Return True if at least one of `keywords` is a substring of `text`."""
    return any(keyword in text for keyword in keywords)


# Each keyword has to be tested on its own: an expression such as
# `('어디' or '장소') in s` collapses to `'어디' in s`, because `or` returns its
# first truthy operand, so all keywords after the first would be ignored.
for sentence in all_data:
    if _contains_any(sentence, PLACE_KEYWORDS):
        place.append(sentence)
    elif _contains_any(sentence, NUMBER_KEYWORDS):
        number.append(sentence)
    elif _contains_any(sentence, TIME_KEYWORDS):
        time.append(sentence)
    else:
        etc.append(sentence)

In [None]:
# How many sentences were routed to the number intent.
len(number)

In [None]:
# One label 0 (number intent) per sentence in `number`.
number_label = [0] * len(number)
len(number_label)

In [None]:
# How many sentences were routed to the place intent.
len(place)

In [None]:
# One label 1 (place intent) per sentence in `place`.
place_label = [1] * len(place)
len(place_label)

In [None]:
# How many sentences were routed to the time intent.
len(time)

In [None]:
# One label 2 (time intent) per sentence in `time`.
time_label = [2] * len(time)
len(time_label)

In [None]:
#import random
#random.seed(42)
#etc_sample = random.sample(etc, 20000)

In [None]:
#etc_sample_label = []
#for _ in range(len(etc_sample)):
#    etc_sample_label.append(3)
#len(etc_sample_label)

In [None]:
# Stack the three labelled buckets into one training frame. Texts and labels
# are concatenated in the same order (number, place, time) so rows stay aligned.
train_df = pd.DataFrame(
    {
        'text': [*number, *place, *time],
        'label': [*number_label, *place_label, *time_label],
    }
)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Preview the last rows of the training frame.
train_df.tail()

In [None]:
# Show only the rows labelled 0 (number intent).
train_df[train_df['label']==0]

In [None]:
# Show only the rows labelled 1 (place intent).
train_df[train_df['label']==1]

In [None]:
# Show only the rows labelled 2 (time intent).
train_df[train_df['label']==2]

In [None]:
#train_df[train_df['label']==3]

In [None]:
# Replace the current index with a fresh 0..n-1 range, discarding the old one.
train_df = train_df.reset_index(drop=True)

In [None]:
# Check the last rows again after the index reset.
train_df.tail()

In [None]:
# Save the labelled training set next to the notebook, without the index column.
train_df.to_csv("train_data.csv", index=False)

## 3. 적절한 패딩 길이 구하기

In [None]:
# Reload the labelled training set from train_data.csv.
data = pd.read_csv('train_data.csv')

In [None]:
# (rows, columns) of the reloaded training set.
data.shape

In [None]:
# KoNLPy Komoran instance; its `pos()` method is used below for POS tagging.
tokenizer = Komoran()

In [None]:
# POS-tag every sentence and keep each morpheme as a "token/POS" string.
data_tokenized = [
    [f"{token}/{pos}" for token, pos in tokenizer.pos(text_)]
    for text_ in data['text']
]

# POS tags whose morphemes are dropped before measuring sentence length.
# NOTE(review): by their prefixes these look like particles (J*), symbols (S*),
# endings (E*) and suffixes (XS*) -- confirm against the Komoran tag table.
exclusion_tags = [
    'JKS', 'JKC', 'JKG', 'JKO', 'JKB', 'JKV', 'JKQ',
    'JX', 'JC',
    'SF', 'SP', 'SS', 'SE', 'SO',
    'EP', 'EF', 'EC', 'ETN', 'ETM',
    'XSN', 'XSV', 'XSA',
]


def f(x):
    """Return True if POS tag `x` is listed in `exclusion_tags`."""
    return x in exclusion_tags


def _content_tokens(tagged_tokens):
    """Return the tokens of `tagged_tokens` whose POS tag is not excluded.

    Args:
        tagged_tokens: Iterable of "token/POS" strings for one sentence.

    Returns:
        List of the token parts, in the original order.
    """
    kept = []
    for tagged in tagged_tokens:
        # Split on the LAST "/" only. The tag is always the final field, while
        # the token itself may contain "/" (for example a literal slash in the
        # text); splitting on every "/" would then read the wrong field as the
        # tag and append an empty token instead of filtering the morpheme.
        token, _, tag = tagged.rpartition('/')
        if not f(tag):
            kept.append(token)
    return kept


# Per sentence: the list of tokens that survive the POS filter.
data_list = [_content_tokens(sentence) for sentence in data_tokenized]

In [None]:
# Number of kept tokens per sentence, as an array for the statistics below.
num_tokens = np.array([len(tokens) for tokens in data_list])

# Mean, maximum and standard deviation of the token counts.
print(f"토큰 길이 평균: {num_tokens.mean()}")
print(f"토큰 길이 최대: {num_tokens.max()}")
print(f"토큰 길이 표준편차: {num_tokens.std()}")

In [None]:
# Histogram of the per-sentence token counts, to eyeball a padding length.
plt.title('all text length')
plt.hist(num_tokens, bins=100)  # 100 equal-width bins over the observed range
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
# Candidate padding length whose coverage is checked by the call below.
select_length = 25

def below_threshold_len(max_len, nested_list):
    """Print the share of samples whose length is at most `max_len`.

    Args:
        max_len: Largest length that still counts as covered.
        nested_list: Sized collection of samples; each sample must support
            `len()`.

    Returns:
        None. The ratio is only printed.

    An empty `nested_list` is reported with a ratio of 0.0 instead of raising
    ZeroDivisionError.
    """
    total = len(nested_list)
    cnt = sum(1 for s in nested_list if len(s) <= max_len)
    # Guard the division: with no samples there is nothing to cover.
    ratio = cnt / total if total else 0.0
    print('전체 샘플 중 길이가 %s 이하인 샘플의 비율: %s'%(max_len, ratio))
    
# Report how much of the filtered training data fits within `select_length` tokens.
below_threshold_len(select_length, data_list)