# Study script of Word2Vec models

In [3]:
import numpy as np
import pandas as pd

## Load Data

In [6]:
from pymongo import MongoClient

In [7]:
client = MongoClient('localhost', 27017)
db = client.get_database('somanews')
articles = db.get_collection('articles')

In [9]:
train = pd.DataFrame(list(articles.find()))
client.close()

In [10]:
train = train.drop(['_id', 'author', 'description', 'link', 'imageURL', 'providerNewsID'], axis=1)
train.head()

Unnamed: 0,category,content,provider,publishedAt,title
0,사회 > 카드뉴스,과연 진실은... \n[ 조선일보 카드뉴스가 더 보고 싶다면 ?] \n\n[ 조선일...,chosun,2016-09-21 19:28:00,[카드뉴스] 대통령은 왜 흙을 밟지 않았을까요?
1,사회 > 카드뉴스,'세기의 연인'이라 불렸던 커플\n\n[ 조선일보 카드뉴스가 더 보고 싶다면 ?] ...,chosun,2016-09-21 19:10:00,[카드뉴스] '브란젤리나'의 시작과 끝
2,사회 > 카드뉴스,당신의 목소리는 그 누구의 목소리보다 귀하게 쓰였습니다. 기사보기 : https:...,chosun,2016-09-21 19:03:00,[카드뉴스] 목소리를 가장 귀하게 쓴 한 성우 지망생
3,문화 > 오늘의 운세,36년생 정도를 지키면 무난한 날. 48년생 결정할 일 결정하도록. 60년생 가뭄에...,chosun,2016-09-21 18:00:00,[오늘의 운세] 9월 22일 목요일(음력 8월 22일 丁未)
4,스포츠ㆍ연예 > 스포츠 > 종합,,chosun,2016-09-21 03:00:00,[오늘의 경기] 2016년 9월 21일


In [11]:
train = train[pd.to_datetime(train.publishedAt).dt.year == 2016]
train.publishedAt.head()

0   2016-09-21 19:28:00
1   2016-09-21 19:10:00
2   2016-09-21 19:03:00
3   2016-09-21 18:00:00
4   2016-09-21 03:00:00
Name: publishedAt, dtype: datetime64[ns]

## Preprocessing
1. Remove stopwords (regex, hanja)
2. POS Tagging with KoNLPy, Mecab
3. NNP to English

In [54]:
import datetime
from konlpy.tag import Twitter
from konlpy.corpus import kobill
import hanja
import re

In [14]:
dstart = datetime.date(2016,8,29)
dend = dstart + datetime.timedelta(weeks=1)
train = train[(train["publishedAt"] > dstart) & (train["publishedAt"] < dend)]
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3068 entries, 2471 to 20909
Data columns (total 5 columns):
category       3068 non-null object
content        3068 non-null object
provider       3068 non-null object
publishedAt    3068 non-null datetime64[ns]
title          3068 non-null object
dtypes: datetime64[ns](1), object(4)
memory usage: 143.8+ KB


In [68]:
twitter = Twitter()
print twitter.nouns(u'네, 안녕하세요')

[u'\ub124']


In [69]:
pos = lambda d: ['/'.join(p) for p in twitter.nouns(d)]

In [70]:
docs_ko = [kobill.open(i).read() for i in kobill.fileids()]
texts_ko = [pos(doc) for doc in docs_ko]

## Training
Word Embedding : Word2Vec

In [71]:
from gensim.models import word2vec
from konlpy.utils import pprint

In [72]:
wv_model_ko = word2vec.Word2Vec(texts_ko)
wv_model_ko.init_sims(replace=True)
wv_model_ko.save('ko_word2vec.model')

In [73]:
pprint(wv_model_ko.most_similar(pos(u'정부')))
pprint(wv_model_ko.most_similar(pos(u'초등학교')))

[(경/우, 0.9996417164802551),
 (제, 0.9996381998062134),
 (등, 0.9996241331100464),
 (결/혼, 0.9996216893196106),
 (파/견, 0.9996136426925659),
 (년, 0.9996079206466675),
 (이/상, 0.9996066093444824),
 (신, 0.9995951056480408),
 (의, 0.9995935559272766),
 (관/련, 0.9995920062065125)]
[(제, 0.9996270537376404),
 (파/견, 0.999594509601593),
 (결/혼, 0.9995840787887573),
 (및, 0.9995807409286499),
 (경/우, 0.9995760917663574),
 (항, 0.9995741248130798),
 (등, 0.9995730519294739),
 (예/고, 0.9995695948600769),
 (안, 0.9995682239532471),
 (의, 0.9995660781860352)]
