In [1]:
import gensim



# Step 1: Prepare the corpus for training

In [2]:
# Step 1. Train a word2vec model with gensim on the given data

# Load the data used for training -- data preparation
class TextIterator(object):
    """Re-iterable corpus reader yielding one token list per line of a text file.

    Args:
        fname: Path of the text file to read.
        encoding: Text encoding handed to ``open``. The default ``None`` keeps
            the platform default encoding (the previous behaviour); pass e.g.
            ``'utf-8'`` to decode the corpus the same way on every machine.
    """

    def __init__(self, fname, encoding=None):
        self.fname = fname
        self.encoding = encoding

    def __iter__(self):
        """Open the file afresh and yield ``line.split()`` for every line.

        Because the file is reopened on each call, the object can be iterated
        several times. A blank line yields an empty list.
        """
        # The with-block closes the file as soon as the pass is finished,
        # instead of leaving the handle open until it is garbage-collected.
        with open(self.fname, encoding=self.encoding) as corpus:
            for line in corpus:
                yield line.split()

# Path of the corpus file, resolved relative to the current working directory.
filename = 'newskor.txt'
# Corpus wrapper: iterating it reads the file and yields one token list per
# line; it reopens the file on every iteration, so it can be consumed repeatedly.
sentences = TextIterator(filename)

# Steps 2–3: Train or load the Word2Vec model

In [16]:
# --- Run mode ---------------------------------------------------------------
# True  -> train a new model
# False -> load an already trained model
train = True

# --- Word2Vec hyperparameters -----------------------------------------------
SG = 1          # training algorithm: 1 = skip-gram, anything else = CBOW
SIZE = 300      # length of each word vector
WINDOW = 5      # context window size
MIN_COUNT = 10  # words that occur fewer times than this are ignored
WORKERS = 20    # number of workers (cpu cores)

In [17]:
if not train:
    # Reuse the model file written by an earlier training run.
    model = gensim.models.Word2Vec.load('newskor.model')
else:
    # Create an untrained model configured from the hyperparameter constants.
    model = gensim.models.Word2Vec(
        sg=SG,
        vector_size=SIZE,
        window=WINDOW,
        min_count=MIN_COUNT,
        workers=WORKERS,
    )
    # First pass over the corpus: prepare the model vocabulary.
    model.build_vocab(sentences)
    # Training passes; total_examples is taken from the model's own corpus_count.
    model.train(sentences, epochs=5, total_examples=model.corpus_count)
    # Save the trained model so that a later run can load it with train = False.
    model.save('newskor.model')

In [11]:
# Display the vocabulary held by the model (the index -> word list).
# NOTE(review): as the cell's last expression this echoes the whole list, which
# can be very long — consider slicing it (e.g. [:50]) before sharing the notebook.
model.wv.index_to_key # vocabulary words

['하',
 '이',
 '.',
 '는',
 '을',
 'ㄴ',
 '다',
 '의',
 '에',
 '를',
 '은',
 '어',
 '있',
 '고',
 '으로',
 '가',
 '였',
 'ㄹ',
 '되',
 ',',
 '에서',
 '었',
 ')',
 '(',
 '로',
 '것',
 '도',
 '등',
 '과',
 '들',
 '지',
 '와',
 '여',
 '일',
 '기',
 '·',
 'ㄴ다',
 '적',
 '수',
 '아',
 '%',
 '게',
 '원',
 '년',
 '2',
 '았',
 '3',
 '1',
 '다고',
 '“',
 '”',
 '월',
 '위하',
 '대하',
 '말하',
 '시장',
 '면',
 '업체',
 '따르',
 '하고',
 '않',
 '만',
 '까지',
 '‘',
 '’',
 '밝히',
 '명',
 '및',
 '부터',
 '다가',
 '미국',
 '며',
 '이라고',
 '4',
 '개',
 '대',
 '나',
 '오',
 '대표',
 '국내',
 '한국',
 '5',
 '다는',
 '던',
 '서비스',
 '습니다',
 '개발',
 '계획',
 '인',
 '주',
 '통하',
 '제품',
 '없',
 '또',
 '화',
 '정부',
 '면서',
 '최근',
 '한',
 '으며',
 '6',
 '지난해',
 '중',
 '그',
 '사업',
 '올해',
 '중국',
 '보이',
 '크',
 '받',
 '일본',
 '이번',
 '10',
 '보다',
 '내',
 '기업',
 '경우',
 '보',
 '대통령',
 '같',
 '-',
 '관련',
 '기술',
 '에게',
 '지만',
 '때문',
 '전',
 '전망',
 'LG',
 '라고',
 '7',
 '달러',
 '현재',
 '예정',
 '지나',
 '어서',
 'ㄴ다는',
 '특히',
 '이상',
 '8',
 '북한',
 '사',
 '시스템',
 '문제',
 '김',
 '삼성전자',
 '겠',
 '늘',
 '간',
 '나서',
 '관계자',
 '잇',
 '9',
 '아니',


In [21]:
## Inspect the embedding learned for a single word
word = '버스'
embedding = model.wv[word]  # looked up once, reused by both prints below
print(embedding)
print('size of vector: ', len(embedding))

[-1.28682405e-01  5.66599131e-01  5.75886369e-01  2.05671415e-01
 -1.72221541e-01  3.49116772e-01 -3.24239224e-01  7.51419187e-01
 -6.87707663e-02  3.15315202e-02  2.46695161e-01 -6.65922165e-02
 -3.45187187e-01 -2.83971041e-01 -1.36189714e-01 -1.99062064e-01
 -3.15497130e-01  1.30809888e-01  4.94904025e-03 -3.91583219e-02
  7.37383589e-02  2.59026163e-03  1.92904159e-01 -6.42431453e-02
  1.27675921e-01 -2.45442927e-01 -1.81874096e-01  1.80150792e-01
 -7.89898783e-02  1.73874304e-01  1.37507200e-01 -2.25318745e-01
  5.39747834e-01 -3.03388089e-01 -3.09321105e-01  1.16513215e-01
  5.43617122e-02 -4.63663757e-01  5.56160696e-02  2.39711702e-01
  1.05050929e-01 -1.72165722e-01  1.26839712e-01  5.35689354e-01
  6.05753779e-01  7.69956827e-01 -4.84324954e-02 -3.56478523e-03
 -2.95956852e-03 -3.81394811e-02  1.91248447e-01  1.67245209e-01
  5.35007529e-02  1.44819096e-01  2.94060051e-01  8.69548880e-03
  1.89116925e-01  1.04932394e-02  2.53554583e-02 -2.55335987e-01
 -5.11766076e-01  4.68417

# Step 4: Get word similarity

In [22]:
# Similarity between two words typed in by the user.
# Example inputs: word1 = '한국', word2 = '북한'
print ("Caculate the similarity between word 1 and word2")
# Strip stray whitespace so an accidental leading/trailing space is not
# reported as an out-of-vocabulary word.
word1 = input("word1: ").strip()
word2 = input("word2: ").strip()

# Check that both words are in the vocabulary before querying the model.
# key_to_index is the word -> index dict, so each membership test is a hash
# lookup instead of a linear scan over the index_to_key list.
vocab = model.wv.key_to_index
no_problem = True

for queried in (word1, word2):
    if queried not in vocab:
        print ('the word ' + queried + ' is not in the vocabulary')
        no_problem = False

if no_problem:
    similarity = model.wv.similarity(word1, word2)
    print ('the similarity between ' + word1 + ' and ' + word2 + ' : ', similarity)

Caculate the similarity between word 1 and word2
word1: 한국
word2: 북한
the similarity between 한국 and 북한 :  0.28036028


# Step 5: Find the mismatched word

In [23]:
# Pick the word that does not belong with the others in a user-supplied list.
# Example input: 소프트웨어 네트워크 프로그램 가방
print("Find mismatched word in the words")
text = input("text(words): ")
words = text.split()

# Check that the words are in the vocabulary before querying the model.
# key_to_index is the word -> index dict, so each membership test is a hash
# lookup instead of a linear scan over the index_to_key list.
vocab = model.wv.key_to_index
no_problem = True

if not words:
    # Nothing was entered: there is nothing to compare, so skip the query
    # rather than handing an empty list to doesnt_match().
    print('no words were entered')
    no_problem = False

# Report every unknown word (not only the first one), matching the behaviour
# of the other vocabulary checks in this notebook.
for word in words:
    if word not in vocab:
        print('the word ' + word + ' is not in the vocabulary')
        no_problem = False

if no_problem:
    mismatched = model.wv.doesnt_match(words)
    print ('the mismatch word between ' + text +' is', mismatched)

Find mismatched word in the words
text(words): 소프트웨어 네트워크 프로그램 가방
the mismatch word between 소프트웨어 네트워크 프로그램 가방 is 가방


# Step 6: Find the top-N most similar words

In [24]:
# List the words most similar to a word typed in by the user.
print("Print the most similar words")
# Strip stray whitespace so an accidental leading/trailing space is not
# reported as an out-of-vocabulary word.
word = input("word: ").strip()

# key_to_index is the word -> index dict, so the membership test is a hash
# lookup instead of a linear scan over the index_to_key list.
vocab = model.wv.key_to_index
no_problem = word in vocab

if no_problem:
    # Prints the list of (word, similarity) pairs returned by the model.
    print(model.wv.most_similar(positive=[word]))
else:
    print ('the word ' + word + ' is not in the vocabulary')

Print the most similar words
word: 인간
[('동물', 0.617953896522522), ('배아', 0.6104809641838074), ('생쥐', 0.5970973968505859), ('존엄성', 0.5852876901626587), ('본성', 0.5809779167175293), ('인류', 0.5760883092880249), ('생명체', 0.5717364549636841), ('유기체', 0.5703120827674866), ('핵이식', 0.5635316371917725), ('욕망', 0.561828076839447)]


# Step 7: Vector calculation

In [25]:
# Word-vector arithmetic: find the words closest to [ a - b + c ].
# Example inputs: a = '한국', b = '아시아', c = '유럽'
print('Find the most similar word with the result of [ a - b + c ]')
# Strip stray whitespace so an accidental leading/trailing space is not
# reported as an out-of-vocabulary word.
word_a = input("a: ").strip()
word_b = input("b: ").strip()
word_c = input("c: ").strip()

# Check that all three words are in the vocabulary before querying the model.
# key_to_index is the word -> index dict, so each membership test is a hash
# lookup instead of a linear scan over the index_to_key list.
vocab = model.wv.key_to_index
no_problem = True

for operand in (word_a, word_b, word_c):
    if operand not in vocab:
        print ('the word ' + operand + ' is not in the vocabulary')
        no_problem = False

if no_problem:
    # a and c are added, b is subtracted; five candidates are requested.
    mostsimilar = model.wv.most_similar(positive=[word_a, word_c], negative=[word_b], topn=5)
    # Show the first three candidates. Slicing (instead of indexing [0], [1],
    # [2]) keeps this from raising IndexError if fewer results come back.
    top_words = [candidate for candidate, _score in mostsimilar[:3]]
    print ('most similar word of ' + word_a + ' - ' + word_b + ' + ' + word_c + ' is', *top_words)

Find the most similar word with the result of [ a - b + c ]
a: 독도
b: 한국
c: 일본
most similar word of 독도 - 한국 + 일본 is 다케시마 영유권 울릉도
