# Script for extract keywords

In [1]:
import numpy as np
import pandas as pd

## Load Data from MongoDB

In [2]:
from pymongo import MongoClient

In [3]:
client = MongoClient('mongodb://localhost:27017/somanews')
client.somanews.authenticate('ssomanews', 'ssomanews1029')
db = client.get_database('somanews')
crawled_collection = db.get_collection('crawledArticles')

In [4]:
train = pd.DataFrame(list(crawled_collection.find()))

## Preprocessing
1. Remove stopwords (regex, hanja)
2. POS Tagging with KoNLPy, Mecab
3. NNP to English

In [5]:
from konlpy.tag import Mecab
import cnouns

In [6]:
mecab = Mecab()

In [7]:
train['content'] = train['content'].apply(lambda text: cnouns.text_cleaning_without_special_ch(text))
content = [mecab.nouns(each[1]['content']) for each in train.iterrows()]

## TF-IDF Vectorizer

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [9]:
vectorizer = TfidfVectorizer(lowercase=False)

## NMF vs LDA
- 수행시간 : 0.005 vs 0.520
- 정확도 1 : [체코 도시 중세 동유럽 양식] vs [체코 도시 중세 관광 광장]
- 정확도 2 : [해운 현대 한진 상선 금융] vs [상선 현대 한진 해운 금융]

In [10]:
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation

In [11]:
def top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        keywords = [" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])]
    return keywords

In [12]:
keywords = []
for i in tqdm(range(len(content))):
    if not content[i]: 
        keywords.append('NULL')
        continue
    x_list = vectorizer.fit_transform(content[i])
    nmf = NMF(n_components=1).fit(x_list)
    feature_names = vectorizer.get_feature_names()
    keywords.append(top_words(nmf, feature_names, 5))

  3%|▎         | 4973/149055 [00:51<28:09, 85.29it/s]


ValueError: empty vocabulary; perhaps the documents only contain stop words

## Save keywords to MongoDB

In [None]:
train['keywords'] = keywords
# crawled_collectionled_collection.insert_many(train.to_dict(orient='records'))
# client.close()