# Baseline script of SomaNews Clustering

In [1]:
import numpy as np
import pandas as pd

## Load Data
Load Data from database

In [2]:
from pymongo import MongoClient

In [3]:
client = MongoClient('localhost', 27017)
db = client.get_database('somanews')
articles = db.get_collection('articles')

In [4]:
train = pd.DataFrame(list(articles.find()))
client.close()

In [5]:
train.head(2)

Unnamed: 0,_id,author,category,content,description,imageURL,link,provider,providerNewsID,publishedAt,title
0,57e2c716149c2181df5b8b95,인턴 변호재 디자이너 김은경,사회 > 카드뉴스,과연 진실은... \n[ 조선일보 카드뉴스가 더 보고 싶다면 ?] \n\n[ 조선일...,,,http://news.chosun.com/site/data/html_dir/2016...,chosun,2016092103107,2016-09-21 19:28:00,[카드뉴스] 대통령은 왜 흙을 밟지 않았을까요?
1,57e2c716149c2181df5b8b96,오로라 기자 디자이너 김은경,사회 > 카드뉴스,'세기의 연인'이라 불렸던 커플\n\n[ 조선일보 카드뉴스가 더 보고 싶다면 ?] ...,,,http://news.chosun.com/site/data/html_dir/2016...,chosun,2016092103074,2016-09-21 19:10:00,[카드뉴스] '브란젤리나'의 시작과 끝


## Preprocessing
1. Remove stopwords (regex, hanja)
2. POS Tagging with KoNLPy, Mecab

In [6]:
from konlpy.tag import Mecab
import hanja
import re

In [7]:
mecab = Mecab()
print(mecab.pos(u'네, 안녕하세요'))

[('네', 'IC'), (',', 'SC'), ('안녕', 'NNG'), ('하', 'XSV'), ('세요', 'EP+EF')]


In [8]:
hanja.translate(u'大韓民國은 民主共和國이다.', 'substitution')

'대한민국은 민주공화국이다.'

In [9]:
def text_cleaning(text):
    text = re.sub('[^가-힝0-9a-zA-Z\\s]', ',', text)
    text = text.replace("카드뉴스", '')
    text = hanja.translate(text, 'substitution')
    return text

In [10]:
def tokenize(data):
    return [' '.join(str(e) for e in mecab.pos(data))]

In [11]:
train['title'] = train['title'].astype('U')
train['title'] = train['title'].apply(lambda text: text_cleaning(text))
title = [tokenize(each[1]['title']) for each in train.iterrows()]
title

[["(',', 'SC') (',', 'SC') ('대통령', 'NNG') ('은', 'JX') ('왜', 'MAG') ('흙', 'NNG') ('을', 'JKO') ('밟', 'VV') ('지', 'EC') ('않', 'VX') ('았', 'EP') ('을까요', 'EF') (',', 'SC')"],
 ["(',', 'SC') (',', 'SY') (',', 'SC') ('브란', 'NNP') ('젤리', 'NNG') ('나', 'JC') (',', 'SC') ('의', 'JKG') ('시작', 'NNG') ('과', 'JC') ('끝', 'NNG')"],
 ["(',', 'SC') (',', 'SC') ('목소리', 'NNG') ('를', 'JKO') ('가장', 'MAG') ('귀하', 'VA') ('게', 'EC') ('쓴', 'VV+ETM') ('한', 'MM') ('성우', 'NNG') ('지망생', 'NNG')"],
 ["(',', 'SC') ('오늘', 'NNG') ('의', 'JKG') ('운세', 'NNG') (',', 'SC') ('9', 'SN') ('월', 'NNBC') ('22', 'SN') ('일', 'NNBC') ('목요일', 'NNG') (',', 'SC') ('음력', 'NNG') ('8', 'SN') ('월', 'NNBC') ('22', 'SN') ('일', 'NNBC') (',', 'SC') (',', 'SC') (',', 'SC')"],
 ["(',', 'SC') ('오늘', 'NNG') ('의', 'JKG') ('경기', 'NNG') (',', 'SC') ('2016', 'SN') ('년', 'NNB') ('9', 'SN') ('월', 'NNBC') ('21', 'SN') ('일', 'NNBC')"],
 ["('리콜', 'NNG') ('대상', 'NNG') ('인데', 'VCP+EC') ('도', 'JX') ('버젓이', 'MAG') (',', 'SC') ('결함', 'NNG') ('차량', 'NNG') ('72', 'S

## Training
1. Tf-idf and cosine similarity
2. K-Means Algorithm
3. Topic modeling
 - Latent Dirichlet allocation (LDA)
 - Latent semantic indexing (LSI)
 - Hierarchical Dirichlet process (HDP)
4. Word embedding
 - word2vec

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
# from gensim import models

In [13]:
vectorizer = TfidfVectorizer(lowercase=False)
x_list = vectorizer.fit_transform(title)

TypeError: expected string or bytes-like object

In [None]:
dist = 1 - cosine_similarity(x_list)
print(dist)

In [15]:
km = KMeans(n_clusters=8)
km.fit(x_list)

## Scoring

## Results