In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import time
import re
from konlpy.tag import Okt

In [2]:
# Load the news-article data from the CSV file (path is relative to the
# notebook's working directory; the file is read as UTF-8).
data = pd.read_csv(
    filepath_or_buffer='news_data.csv',
    encoding='utf-8',
)

In [3]:
# Stopwords: calendar/clock unit words (year, month, day, hour, minute,
# second) plus the generic word "news".
stopwords = ["년", "월", "일", "시", "분", "초", "뉴스"]
# Set copy used only for membership tests; `stopwords` itself stays a list so
# any other cell that uses it is unaffected.
stopword_set = frozenset(stopwords)

# Tokenize every document with the Okt morphological analyzer.
okt = Okt()

tokenized_data = []
for sentence in tqdm(data['cleaned_document']):
    # Missing or non-text cells (e.g. NaN produced by an empty CSV field) and
    # blank strings cannot yield tokens; record an empty token list instead of
    # crashing so tokenized_data stays aligned row-for-row with `data`.
    if not isinstance(sentence, str) or not sentence.strip():
        tokenized_data.append([])
        continue

    tokenized_sentence = okt.morphs(sentence, stem=True)  # split into stemmed morphemes

    # Keep tokens longer than one character that are not stopwords.
    # NOTE: every stopword except "뉴스" is a single character, so the length
    # filter alone already removes those; the stopword check matters only for
    # multi-character entries.
    stopwords_removed_sentence = [
        word
        for word in tokenized_sentence
        if len(word) > 1 and word not in stopword_set
    ]
    tokenized_data.append(stopwords_removed_sentence)

100%|██████████████████████████████████████████████████████████████████████████████| 1595/1595 [01:03<00:00, 25.04it/s]


In [8]:
# gensim 라이브러리의 Word2Vec 이용하기
from gensim.models import Word2Vec

In [50]:
# Train a Word2Vec model with the CBOW architecture (sg=0 is gensim's default,
# written out here so the contrast with the skip-gram model is explicit).
#
# Reproducibility: gensim trains with several worker threads by default, and
# OS thread scheduling makes the learned vectors (and therefore the
# nearest-neighbour lists) differ from run to run. Pinning workers=1 together
# with a fixed seed removes that jitter. For identical results across
# interpreter launches, PYTHONHASHSEED must also be fixed before the kernel
# starts (see the gensim Word2Vec documentation for `seed`).
model = Word2Vec(
    tokenized_data,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=5,      # ignore words with total frequency below this
    sg=0,             # 0 = CBOW
    seed=1,           # gensim's default seed, made explicit
    workers=1,        # single thread for deterministic training
)

In [64]:
# Show the 10 words in `model` most similar to '컴퓨터' ("computer"),
# as (word, similarity) pairs.
model.wv.most_similar(positive='컴퓨터', topn=10)

[('기계', 0.951115608215332),
 ('굴착기', 0.9456576704978943),
 ('직할', 0.9444350600242615),
 ('이색', 0.9401832818984985),
 ('거대', 0.938080370426178),
 ('마크', 0.9369220733642578),
 ('생물다양성', 0.9356628060340881),
 ('스타일', 0.9352216124534607),
 ('간의', 0.934786856174469),
 ('문화유산', 0.9328493475914001)]

In [54]:
# Train a Word2Vec model with the skip-gram architecture (sg=1).
#
# Reproducibility: gensim trains with several worker threads by default, and
# OS thread scheduling makes the learned vectors (and therefore the
# nearest-neighbour lists) differ from run to run. Pinning workers=1 together
# with a fixed seed removes that jitter. For identical results across
# interpreter launches, PYTHONHASHSEED must also be fixed before the kernel
# starts (see the gensim Word2Vec documentation for `seed`).
model_skip = Word2Vec(
    tokenized_data,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=5,      # ignore words with total frequency below this
    sg=1,             # 1 = skip-gram
    seed=1,           # gensim's default seed, made explicit
    workers=1,        # single thread for deterministic training
)

In [63]:
# Show the 10 words in `model_skip` most similar to '컴퓨터' ("computer"),
# as (word, similarity) pairs.
model_skip.wv.most_similar(positive='컴퓨터', topn=10)

[('비톤', 0.9409067034721375),
 ('고성능', 0.8958309888839722),
 ('프로세서', 0.884769856929779),
 ('인텔', 0.8739143013954163),
 ('레이', 0.8694416284561157),
 ('유니콘', 0.8649833798408508),
 ('왕실', 0.8608004450798035),
 ('철학', 0.8603827953338623),
 ('숙련', 0.8600847125053406),
 ('범용', 0.8588161468505859)]