In [45]:
!pip install feedparser
!pip install newspaper3k
!pip install konlpy



In [0]:
import feedparser
from newspaper import Article
from konlpy.tag import Okt
from collections import Counter
from operator import eq
import numpy as np

In [0]:
urls = (
    'http://rss.etnews.com/Section901.xml',
    None
)

# 'http://rss.etnews.com/Section902.xml',
# 'http://rss.etnews.com/Section903.xml',

In [0]:
def get_tags(text, ntags=50):
    num_unique_words = 0
    num_most_freq = 0
    splitter = Okt()
    nouns = splitter.nouns(text)
    count = Counter(nouns)
    return_list = []

    for n, c in count.most_common(ntags):
        temp = {'tag': n, 'count': c}
        return_list.append(temp)
        num_unique_words = num_unique_words + 1
        if num_unique_words == 1:
            num_most_freq = c

    return num_unique_words, num_most_freq, return_list

In [0]:
def TF(request, most_freq, tag):
	return 0.1 + 0.9*Howmanywords(request, tag)/most_freq

In [0]:
def Howmanywords(request, tag):
	nWords = 0
	for n in tag:
		noun = n['tag']
		count = n['count']
		if eq(noun, request):
			nWords = count
	return nWords

In [0]:
def crawl_article(url, language='ko'):
    print('[Crawl Article] ', url)
    var_article = Article(url, language=language)
    var_article.download()
    var_article.parse()
    
    return var_article.title, var_article.text

In [0]:
def crawl_rss(urls):
    arr_rss = []
    for url in urls:
        print('[Crawl RSS ', url)
        parse_rss = feedparser.parse(url)
        for p in parse_rss.entries:
            arr_rss.append({'title': p.title, 'link': p.link})

    return arr_rss

In [0]:
def main():
    splitter = Okt()
    article_list = crawl_rss(urls)
    print(article_list)

    for article in article_list:
        _, text = crawl_article(article['link'])
        article['text'] = text
    print(article_list)

    print('[Parsing Title]')
    noun_title = [splitter.nouns(article['title']) for article in article_list]
    print(noun_title)

    print('[Parsing Test]')
    noun_text = []
    for article in article_list:
        num_unique_words, num_most_freq, tags = get_tags(article['text'])
        noun_text.append({'num_unique_words': num_unique_words, 'num_most_freq': num_most_freq, 'tags': tags})
        print(noun_text)

    query = input()
    print('[ Parsing Query ]')
    noun_query = splitter.nouns(query)
    print(noun_query)
    tf_idf_title = []
    tf_idf_mean = []
    tf_idf_query = []
    tf_idf_mean_query = []
	
	# fishing article
    for i, noun in enumerate(noun_title):
        tfs = [TF(req, noun_text[i]['num_most_freq'], noun_text[i]['tags']) for req in noun]
        _tfidf = [tfs[j] for j, n in enumerate(noun)]
        tf_idf_title.append(_tfidf)
        tf_idf_mean.append(np.mean(_tfidf))
		
	# SEARCH ENGINE from RSS with given Query
    for i in range(len(article_list)):
        tfs_query = [TF(req, noun_text[i]['num_most_freq'], noun_text[i]['tags']) for req in noun_query]
        _tfidf_query = [tfs_query[j] for j, n in enumerate(noun_query)]
        tf_idf_query.append(_tfidf_query)
        tf_idf_mean_query.append(np.mean(_tfidf_query))
        
    for i,e in enumerate(tf_idf_mean):
        print("TF-IDF (fishing article): ", e, ", title: ", article_list[i]['title'])
    for i,e in enumerate(tf_idf_mean_query):
        print("TF-IDF-Query: ", e, ", title: ", article_list[i]['title'])
        

In [60]:
if __name__ == '__main__':
    main()

[Crawl RSS  http://rss.etnews.com/Section901.xml
[Crawl RSS  None
[{'title': "국내 최대 아마 e스포츠 대회 'KeG', 올해 예선 없이 본선", 'link': 'https://www.etnews.com/20200528000090'}, {'title': "[마켓트렌드]유통업계, '재미+즐거움' 잡은 콜라보 마케팅 활발", 'link': 'https://www.etnews.com/20200528000088'}, {'title': "'스파이더맨3'에 현대차 미래차 기술 등장한다…현대차·소니와 '맞손'", 'link': 'https://www.etnews.com/20200528000064'}, {'title': '[ICT산업 미래전략포럼]글로벌 기업 관건은 수요·파트너·인재', 'link': 'https://www.etnews.com/20200527000155'}, {'title': '[포스트 코로나]민관, \'건설기계 수출\' 힘모은다…성윤모 "코로나 이후 큰 시장 열린다"', 'link': 'https://www.etnews.com/20200527000140'}, {'title': '[포스트 코로나] 대학 실험실 창업 기업 제품으로 바이러스 퇴치한다', 'link': 'https://www.etnews.com/20200527000123'}, {'title': "교육부 나이스 발주 연기 업계 '일파만파'", 'link': 'https://www.etnews.com/20200527000240'}, {'title': 'SK브로드밴드, 언택트시대 클라우드PC 새 먹거리로', 'link': 'https://www.etnews.com/20200527000250'}, {'title': "문체부, IPTV 유통 한국영화에 '워터마크' 도입 추진", 'link': 'https://www.etnews.com/20200527000229'}, {'title': "'4세대 나이스' 개통 1년 연기...미래교육·기술 담아 새판