In [1]:
def read_data(filename):
    """Read a tab-separated text file and return its rows without the header.

    Parameters
    ----------
    filename : str or path-like
        Path to a tab-separated text file whose first line is a header row.
        The file is decoded as UTF-8.

    Returns
    -------
    list[list[str]]
        One list of column strings per line, with the first (header) line
        removed. An empty or header-only file yields an empty list.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # codec, which is not UTF-8 on every system and would break on the
    # Korean review text.
    with open(filename, 'r', encoding='utf-8') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
    # Drop the header row (id, document, label).
    return data[1:]

# Load both rating splits from the nsmc/ directory via read_data().
train_path = 'nsmc/ratings_train.txt'
test_path = 'nsmc/ratings_test.txt'
train_data, test_data = read_data(train_path), read_data(test_path)

In [2]:
# For each split, print the number of rows followed by the number of
# fields in its first row (train first, then test).
for split_rows in (train_data, test_data):
    print(len(split_rows))
    print(len(split_rows[0]))

150000
3
50000
3


In [3]:
# Display the first parsed training row (the list of tab-separated fields).
train_data[0]

['9976970', '아 더빙.. 진짜 짜증나네요 목소리', '0']

In [4]:
import konlpy

# Show the installed konlpy version for reproducibility.
konlpy.__version__

'0.5.1'

In [5]:
from konlpy.tag import Okt

# Create the Okt tagger once; tokenize() in a later cell uses this same
# module-level `okt` object.
okt = Okt()

# Smoke-test the tagger on a sample sentence and print the tagged result.
sample_sentence = u'이 밤 그날의 반딧불을 당신의 창 가까이 보낼게요'
sample_tags = okt.pos(sample_sentence)
print(sample_tags)

-------------------------------------------------------------------------------
Deprecated: convertStrings was not specified when starting the JVM. The default
behavior in JPype will be False starting in JPype 0.8. The recommended setting
for new code is convertStrings=False.  The legacy value of True was assumed for
please file a ticket with the developer.
-------------------------------------------------------------------------------

  """)


[('이', 'Noun'), ('밤', 'Noun'), ('그날', 'Noun'), ('의', 'Josa'), ('반딧불', 'Noun'), ('을', 'Josa'), ('당신', 'Noun'), ('의', 'Josa'), ('창', 'Noun'), ('가까이', 'Noun'), ('보낼게요', 'Verb')]


In [7]:
import json
import os
from pprint import pprint

def tokenize(doc):
    """Tag `doc` with the module-level `okt` tagger and join each result with '/'.

    Calls ``okt.pos(doc, norm=True, stem=False)`` and returns a list in which
    every item produced by the tagger has been joined into a single
    '/'-separated string (e.g. '더빙/Noun').
    """
    # norm=True asks for normalization; stem controls whether tokens are
    # reduced to their stem form (disabled here).
    tagged = okt.pos(doc, norm=True, stem=False)
    return list(map('/'.join, tagged))

# The cache-loading branch below is disabled, so both corpora are tokenized
# again and the JSON files are rewritten every time this cell runs.
# Note that JSON stores tuples as arrays, so loading these files back would
# yield [tokens, label] lists rather than (tokens, label) tuples.
# if os.path.isfile('nsmc/train_docs.json'):
#     with open('nsmc/train_docs.json') as f:
#         train_docs = json.load(f)
#     with open('nsmc/test_docs.json') as f:
#         test_docs = json.load(f)
# else:
# Pair the tokenized document (field 1) with its label (field 2).
train_docs = [(tokenize(record[1]), record[2]) for record in train_data]
test_docs = [(tokenize(record[1]), record[2]) for record in test_data]

# Save both tokenized corpora as JSON files.
json_targets = (
    ('nsmc/train_docs.json', train_docs),
    ('nsmc/test_docs.json', test_docs),
)
for json_path, docs in json_targets:
    with open(json_path, 'w', encoding="utf-8") as make_file:
        json.dump(docs, make_file, ensure_ascii=False, indent="\t")

# Use pprint for a more readable view of the first tokenized document.
pprint(train_docs[0])

(['아/Exclamation',
  '더빙/Noun',
  '../Punctuation',
  '진짜/Noun',
  '짜증나네요/Adjective',
  '목소리/Noun'],
 '0')


In [8]:
# Flatten the token lists of all training documents into one list,
# preserving document order, then report how many tokens there are.
tokens = []
for doc in train_docs:
    tokens.extend(doc[0])
print(len(tokens))

2159921


In [9]:
import nltk
# Wrap the flat training token list in an nltk.Text so its vocabulary
# can be queried in the following cells.
# NOTE(review): name='NMSC' looks like a transposition of "NSMC" (the data
# lives under nsmc/) — confirm before changing, since the name is part of
# the printed repr.
text = nltk.Text(tokens, name='NMSC')
print(text)

<Text: NMSC>


In [10]:
# Total number of tokens
token_total = len(text.tokens)
print(token_total)

# Number of distinct tokens
distinct_total = len(set(text.tokens))
print(distinct_total)

# The 10 most frequent tokens with their counts
top_ten = text.vocab().most_common(10)
pprint(top_ten)

2159921
104812
[('./Punctuation', 67778),
 ('영화/Noun', 50818),
 ('이/Josa', 38540),
 ('의/Josa', 30188),
 ('../Punctuation', 29055),
 ('가/Josa', 26627),
 ('에/Josa', 26468),
 ('을/Josa', 23118),
 ('.../Punctuation', 22795),
 ('도/Josa', 20037)]


In [18]:
# Vocabulary: the (up to) 9999 most frequent tokens in `text`, keeping only
# the token string from each (token, count) pair returned by most_common().
selected_words = [f[0] for f in text.vocab().most_common(9999)]

def term_frequency(doc, vocabulary=None):
    """Count how often each vocabulary word occurs in `doc`.

    Parameters
    ----------
    doc : list of str
        Tokens of a single document.
    vocabulary : sequence of str, optional
        Words to count, in output order. Defaults to the module-level
        `selected_words` list, which keeps existing one-argument calls working.

    Returns
    -------
    list of int
        One count per vocabulary word, aligned with `vocabulary`; words that
        do not occur in `doc` get 0.
    """
    from collections import Counter

    if vocabulary is None:
        vocabulary = selected_words
    # Count the document once instead of rescanning it with doc.count()
    # for every vocabulary word.
    counts = Counter(doc)
    return [counts[word] for word in vocabulary]

# Build per-document term-frequency vectors and the matching label lists.
# NOTE(review): train_x / test_x are reassigned by a later cell that encodes
# tokens as vocabulary indices; only train_y / test_y from here stay in use
# unchanged — confirm whether the term-frequency features are still needed.
train_x, train_y = [], []
for doc_tokens, label in train_docs:
    train_x.append(term_frequency(doc_tokens))
    train_y.append(label)

test_x, test_y = [], []
for doc_tokens, label in test_docs:
    test_x.append(term_frequency(doc_tokens))
    test_y.append(label)

In [25]:
def encode_docs(docs, vocabulary, unknown_index):
    """Replace every token of every document with its index in `vocabulary`.

    Parameters
    ----------
    docs : sequence
        Items whose first element (``doc[0]``) is a list of tokens.
    vocabulary : sequence of str
        Known words; a token's code is the position of its first occurrence.
    unknown_index : int
        Code used for tokens that are not in `vocabulary`.

    Returns
    -------
    list of list of int
        One list of codes per document, in the original token order.
    """
    # Build the lookup table once so each token costs a dict lookup rather
    # than a linear list.index() scan. setdefault keeps the first position
    # of a word, matching list.index() semantics.
    word_to_index = {}
    for position, word in enumerate(vocabulary):
        word_to_index.setdefault(word, position)
    return [
        [word_to_index.get(word, unknown_index) for word in doc[0]]
        for doc in docs
    ]


selected_words = [f[0] for f in text.vocab().most_common(9999)]

# Out-of-vocabulary tokens get the index just past the last vocabulary
# entry (9999 when the vocabulary is full), which is the slot where a later
# cell appends the '***/' placeholder.
unknown_index = len(selected_words)

#train data
train_x = encode_docs(train_docs, selected_words, unknown_index)

#test data
test_x = encode_docs(test_docs, selected_words, unknown_index)

In [29]:
# Words that did not make it into the top-ranked vocabulary are all
# represented by the '***/' placeholder. The membership check keeps this
# cell idempotent, so re-running it does not append a second placeholder.
if '***/' not in selected_words:
    selected_words.append('***/')

In [30]:
# Index 9998 is the last slot that most_common(9999) can fill, i.e. the
# least frequent word kept in the vocabulary.
selected_words[9998]

'반가운/Adjective'

In [31]:
# Index 9999 should hold the '***/' placeholder appended in an earlier cell;
# it is the same index used for out-of-vocabulary tokens in train_x / test_x.
selected_words[9999]

'***/'

In [32]:
import pickle

# Persist the vocabulary, encoded inputs and labels with pickle.
# The files carry a .txt extension but contain binary pickle data.
artifacts = {
    "nsmc_selected_words.txt": selected_words,
    "nsmc_train_x.txt": train_x,
    "nsmc_test_x.txt": test_x,
    "nsmc_train_y.txt": train_y,
    "nsmc_test_y.txt": test_y,
}
for file_name, payload in artifacts.items():
    with open(file_name, "wb") as fp:
        pickle.dump(payload, fp)