# Script for counting words

In [1]:
import numpy as np
import pandas as pd

## Load Data

In [2]:
from pymongo import MongoClient

In [3]:
# Connection string for the news database. It can be overridden through the
# SOMANEWS_MONGO_URI environment variable so that credentials do not have to
# be edited into the notebook.
# NOTE(review): the fallback below still embeds a username and password in
# source; consider rotating them and dropping the default.
import os

mongo_uri = os.environ.get(
    'SOMANEWS_MONGO_URI',
    'mongodb://ssomanews:ssomanews1029@ds033987-a0.mlab.com:33987/somanews',
)
# The port is already part of the URI, so it is not repeated as a separate
# argument.
client = MongoClient(mongo_uri)
db = client.get_database('somanews')
articles = db.get_collection('articles')

In [4]:
# Pull every document of the collection into a DataFrame. The client is
# closed in a `finally` clause so the connection is released even when the
# query or the DataFrame construction fails.
try:
    train = pd.DataFrame(list(articles.find()))
finally:
    client.close()

## Preprocessing
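1. Clean the text: convert hanja with the `hanja` package, strip characters other than Hangul, digits, Latin letters and whitespace with a regex, and remove stopwords
2. POS tagging with KoNLPy (Mecab)
3. Convert proper nouns (NNP) to English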

In [5]:
from konlpy.tag import Mecab
import hanja
import cnouns
import re

In [6]:
# Shared Mecab tagger instance, reused by the cells below.
mecab = Mecab()


def pos(d):
    """Return the Mecab POS tagging of `d` as a list of strings.

    Each item produced by ``mecab.pos(d)`` (presumably a (surface, tag)
    pair) is joined with '/'.
    """
    tagged = mecab.pos(d)
    return ['/'.join(item) for item in tagged]

In [7]:
def text_cleaning(text):
    """Normalize one article body before noun extraction.

    Steps, in order:
    1. Pass the text through ``hanja.translate`` in 'substitution' mode.
    2. Replace every character that is not a Hangul syllable, an ASCII
       digit, an ASCII letter or whitespace with a space.
    3. Replace the fixed boilerplate terms with a space.

    Args:
        text: Raw article text.

    Returns:
        The cleaned text.
    """
    text = hanja.translate(text, 'substitution')
    # The Hangul Syllables block ends at U+D7A3 ('힣'); ending the range at
    # '힝' would also strip the last six syllables (힞-힣).
    text = re.sub(r'[^가-힣0-9a-zA-Z\s]', ' ', text)
    for term in (u"카드뉴스", u"조선일보", u"오늘"):
        text = text.replace(term, ' ')
    return text

In [8]:
# Clean every article body in place, then extract the nouns of each article.
train['content'] = train['content'].apply(text_cleaning)
content = [mecab.nouns(body) for body in train['content']]

## Counting words in each cluster

In [9]:
from collections import Counter
from konlpy.utils import pprint

In [10]:
# Build one text per cluster by concatenating the cleaned articles of that
# cluster. Articles are joined with a space so that the last word of one
# article and the first word of the next are not fused into a single token
# before noun extraction.
contents = []
for name, group in train.groupby('cluster'):
    content = list(group.content)
    content_flat = ' '.join(str(text) for text in content)
    contents.append(content_flat)

In [11]:
# For every per-cluster text, extract its nouns with Mecab and print the five
# most frequent ones together with their counts.
for each in contents:
    nouns = mecab.nouns(each)
    count = Counter(nouns)  # noun -> number of occurrences
    print(count.most_common(5))

[('년', 106), ('것', 80), ('명', 73), ('만', 61), ('세', 61)]
[('노트', 156), ('갤럭시', 146), ('배터리', 134), ('삼성전자', 130), ('일', 108)]
[('원', 61), ('만', 59), ('세트', 45), ('등', 41), ('찰보리', 39)]
[('만', 60), ('등', 55), ('호텔', 41), ('분양', 41), ('수', 40)]
[('경기', 24), ('월', 24), ('스포츠', 22), ('시', 12), ('등', 9)]
[('판사', 138), ('부장', 117), ('김', 94), ('대표', 85), ('검찰', 50)]
[('것', 54), ('운동', 47), ('수', 43), ('팬', 41), ('우리', 32)]
[('일', 74), ('추석', 72), ('등', 70), ('원', 44), ('만', 42)]
[('한진', 479), ('해운', 466), ('관리', 149), ('것', 145), ('법정', 138)]
[('대통령', 147), ('탄핵', 140), ('호세프', 116), ('브라질', 108), ('년', 99)]


## Visualization - Graph

In [12]:
import matplotlib.pyplot as plt