# Script for extracting keywords

In [1]:
import numpy as np
import pandas as pd

## Load Data

In [2]:
from pymongo import MongoClient

In [3]:
import os

# NOTE(security): the fallback URI below embeds a username and password in
# source. Prefer exporting SOMANEWS_MONGO_URI and removing/rotating the
# hard-coded credential; it is kept only so the notebook still runs as-is.
mongo_uri = os.environ.get(
    'SOMANEWS_MONGO_URI',
    'mongodb://ssomanews:ssomanews1029@ds033987-a0.mlab.com:33987/somanews',
)

# The port is already part of the URI, so no separate port argument is passed.
client = MongoClient(mongo_uri)
db = client.get_database('somanews')
articles = db.get_collection('articles')

In [4]:
# Load every document of the `articles` collection into a DataFrame.
# try/finally guarantees the client is closed even when the query or the
# DataFrame construction raises, so the connection is not leaked.
try:
    train = pd.DataFrame(list(articles.find()))
finally:
    client.close()

## Preprocessing
1. Remove stopwords (regex, hanja)
2. POS Tagging with KoNLPy, Mecab
3. NNP to English

In [5]:
from konlpy.tag import Mecab
import hanja
import cnouns
import re

In [6]:
# Single MeCab tagger instance shared by the cells below.
mecab = Mecab()


def pos(d):
    """Tag ``d`` with MeCab and join each returned item with '/'.

    Presumably each item is a (surface, tag) pair, giving strings such as
    'word/TAG' -- confirm against the KoNLPy documentation.
    """
    tagged = mecab.pos(d)
    return ['/'.join(pair) for pair in tagged]

In [7]:
def text_cleaning(text):
    """Normalise raw article text before noun extraction.

    Steps, in order:
      1. ``hanja.translate(text, 'substitution')`` is applied (presumably
         replacing Chinese characters with their Hangul reading -- confirm
         against the ``hanja`` package documentation).
      2. Every character that is not a Hangul syllable, an ASCII digit, an
         ASCII letter or whitespace is replaced by a single space.
      3. A fixed list of boilerplate words is replaced by a space.

    Args:
        text: article body as a string.

    Returns:
        The cleaned string.
    """
    text = hanja.translate(text, 'substitution')
    # The precomposed Hangul syllable block ends at U+D7A3 ('힣'). Ending the
    # range at '힝' (U+D79D) would strip the valid syllables U+D79E..U+D7A3.
    text = re.sub(r'[^가-힣0-9a-zA-Z\s]', ' ', text)
    for boilerplate in (u"카드뉴스", u"조선일보", u"오늘"):
        text = text.replace(boilerplate, ' ')
    return text

In [8]:
# Clean every article body in place, then extract the nouns of each body
# with MeCab; `content` holds one noun list per row of `train`.
train['content'] = train['content'].apply(text_cleaning)
content = [mecab.nouns(body) for body in train['content']]

## TF-IDF Vectorizer

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [10]:
# One TF-IDF vectorizer instance, re-fitted per article in the keyword loop
# below. lowercase=False leaves token case untouched.
vectorizer = TfidfVectorizer(lowercase=False)

## NMF vs LDA
- 수행시간 : 0.005 vs 0.520
- 정확도 1 : [체코 도시 중세 동유럽 양식] vs [체코 도시 중세 관광 광장]
- 정확도 2 : [해운 현대 한진 상선 금융] vs [상선 현대 한진 해운 금융]

In [11]:
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation

In [12]:
def top_words(model, feature_names, n_top_words):
    """Return the highest-weighted feature names of every topic in ``model``.

    Args:
        model: fitted decomposition object exposing ``components_``, an
            iterable of per-topic weight arrays.
        feature_names: sequence mapping a feature index to its name.
        n_top_words: number of names to keep per topic.

    Returns:
        A list with one string per topic; each string holds that topic's
        ``n_top_words`` names, highest weight first, separated by spaces.
        The list is empty when the model has no topics.
    """
    keywords = []
    for topic in model.components_:
        # argsort is ascending, so reverse it to rank by descending weight.
        best_first = topic.argsort()[::-1][:n_top_words]
        keywords.append(" ".join(feature_names[i] for i in best_first))
    return keywords

In [13]:
# Extract up to five keywords per article: each noun of the article is fed to
# the vectorizer as its own document, and a single-component NMF ranks the
# resulting vocabulary. Articles without usable nouns get the 'NULL' marker.
keywords = []
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
get_feature_names = (getattr(vectorizer, 'get_feature_names_out', None)
                     or vectorizer.get_feature_names)
for nouns in tqdm(content):
    if not nouns:
        keywords.append('NULL')
        continue
    try:
        x_list = vectorizer.fit_transform(nouns)
    except ValueError:
        # Raised when the vocabulary ends up empty, e.g. every noun is
        # dropped by the vectorizer's tokenizer; treat it like "no nouns".
        keywords.append('NULL')
        continue
    nmf = NMF(n_components=1).fit(x_list)
    feature_names = get_feature_names()
    keywords.append(top_words(nmf, feature_names, 5))

100%|██████████| 179/179 [00:01<00:00, 98.72it/s]


## Add keywords to DataFrame

In [14]:
# Store the per-article keyword entries (one per element of `content`, in the
# same order) as a new `keywords` column.
train['keywords'] = keywords