In [None]:
# IR Components
# 1. Crawler + Indexer -> Crawler / Indexer
# 2. Doc Analyzer -> (Improved) BoW
# 3. Query -> same processing as step 2 -> DTM(TDM)
# 4. Ranking Model

In [None]:
# 1-1. (Focused) Crawler (BFS) - 네이버 뉴스
from requests import get
from requests.compat import urljoin
from bs4 import BeautifulSoup

# Imported here so this cell runs on a fresh kernel: `re` is otherwise only
# imported in a later cell.
import os
import re

urls = ['https://news.naver.com']
visited = list()
seen = set(urls)  # every URL ever queued -> O(1) duplicate check

path = './naver/'
os.makedirs(path, exist_ok=True)  # open(..., 'w') below fails if the folder is missing

while urls:
    seed = urls.pop(0)  # FIFO queue -> breadth-first traversal
    visited.append(seed)

    dom = BeautifulSoup(get(seed).text, 'html.parser')
    body = dom.select_one('#articleBodyContents')

    if body:  # article page: store its body text
        cid = re.search(r'rankingSectionId=(\d+)', seed)
        aid = re.search(r'aid=(\d+)', seed)
        if cid is None or aid is None:
            # The URL lacks the ids the file name is built from; skip it
            # instead of crashing on None.group().
            continue

        file = '{0}-{1}.txt'.format(cid.group(1), aid.group(1))
        with open(path + file, 'w', encoding='utf-8') as f:
            f.write(body.text)
    else:  # listing page: queue the ranking links it contains
        for a in dom.select('div[id^=ranking_] li > a'):
            href = a.get('href')
            if not href:  # anchor without a target
                continue
            link = urljoin(seed, href)
            if link not in seen:
                seen.add(link)
                urls.append(link)

In [None]:
import re
from string import punctuation
# Any single ASCII punctuation character (every character of string.punctuation, escaped).
pattern1 = re.compile(r'[{}]'.format(re.escape(punctuation)))
# E-mail-like token: word characters/dots, an '@', then dot-separated runs of word characters.
pattern2 = re.compile(r'\b(\w|[.])+@(?:[.]?\w+)+\b')
# http:// or https:// followed by dot-separated runs of word characters (no path or query part).
# NOTE(review): cleaning() does not apply this pattern — confirm whether that is intended.
pattern3 = re.compile(r'\bhttps?://\w+(?:[.]?\w+)+\b')
# Any character that is NOT an ASCII letter/digit, a Hangul syllable (가-힣),
# a Hangul jamo in the ranges ㄱ-ㅎ / ㅏ-ㅣ, or a space.
pattern4 = re.compile(r'[^A-Za-z0-9가-힣ㄱ-ㅎㅏ-ㅣ ]')
# A \b-delimited token that starts with a lowercase ASCII letter and continues
# with one or more ASCII letters/digits.
pattern5 = re.compile(r'\b[a-z][A-Za-z0-9]+\b')
# Two or more consecutive whitespace characters.
pattern6 = re.compile(r'\s{2,}')

In [None]:
# 1-2. Indexer
from os import listdir
from konlpy.tag import Okt

# Single shared Okt instance; tokenizer4 and tokenizer5 call it for morphemes and nouns.
okt = Okt()

def fileids(path = './naver/'):
    """Return the paths of all ``.txt`` files directly inside ``path``.

    Parameters
    ----------
    path : str
        Directory to list. It may be given with or without a trailing
        separator.

    Returns
    -------
    list of str
        ``path`` joined with each matching file name, sorted so that the
        position of a file in the list (used as its document number by the
        indexing loop) is the same on every run.
    """
    from os.path import join  # local import keeps the function self-contained

    return sorted(join(path, name)
                  for name in listdir(path)
                  if name.endswith('.txt'))

def cleaning(doc):
    """Normalise raw text by blanking out unwanted spans.

    Each pattern is applied in turn and every match is replaced by a single
    space: pattern2, then pattern4, pattern5, pattern1 and finally pattern6
    (which collapses the whitespace runs left behind). The result is stripped
    of leading and trailing whitespace.
    """
    text = doc
    for pattern in (pattern2, pattern4, pattern5, pattern1, pattern6):
        text = pattern.sub(' ', text)
    return text.strip()

def tokenizer1(doc): # eojeol (whitespace-delimited words)
    """Split ``doc`` on runs of whitespace and return the list of words."""
    words = doc.split()
    return words

def tokenizer2(tokens, n=2): # word (eojeol) n-grams
    """Build word n-grams from a list of tokens.

    Parameters
    ----------
    tokens : list of str
        Tokens in their original order (e.g. the output of tokenizer1).
    n : int
        Number of consecutive tokens per n-gram.

    Returns
    -------
    list of str
        One entry per window of ``n`` consecutive tokens, joined by a single
        space so that each n-gram is a hashable string usable as a dictionary
        key. Empty when there are fewer than ``n`` tokens.
    """
    ngram = list()
    for i in range(len(tokens) - (n-1)):
        # The original called append() with no argument, which raises TypeError.
        ngram.append(' '.join(tokens[i:i+n]))
    return ngram

def tokenizer3(doc, n=2): # character (syllable) n-grams
    """Return every window of ``n`` consecutive items of ``doc``.

    Windows are taken with slicing, left to right; the result is empty when
    ``doc`` is shorter than ``n``.
    """
    window_count = len(doc) - n + 1
    return [doc[start:start + n] for start in range(window_count)]

def tokenizer4(doc): # morphemes
    """Return the items of ``okt.morphs(doc)`` whose length is from 2 to 7."""
    kept = []
    for morph in okt.morphs(doc):
        if 1 < len(morph) < 8:
            kept.append(morph)
    return kept

def tokenizer5(doc): # nouns
    """Return the items of ``okt.nouns(doc)`` whose length is from 2 to 7."""
    kept = []
    for noun in okt.nouns(doc):
        if 1 < len(noun) < 8:
            kept.append(noun)
    return kept

In [153]:
def get_tokens(file):
    """Read one document and count every term produced by the tokenizers.

    The text is cleaned with cleaning() and then tokenised four ways: words
    (tokenizer1), word n-grams (tokenizer2), character n-grams (tokenizer3)
    and morphemes (tokenizer4). All terms share one counter.

    Parameters
    ----------
    file : str
        Path of the text file to read; decoded as UTF-8, the encoding the
        crawler cell writes with.

    Returns
    -------
    collections.defaultdict
        Maps each term to the number of times it was produced.
    """
    # Imported here so the function does not rely on another cell having
    # brought defaultdict into scope.
    from collections import defaultdict

    print(file)  # progress trace: one line per processed document
    terms = defaultdict(int)

    # Explicit encoding: the platform default is locale-dependent and may not
    # be UTF-8.
    with open(file, 'r', encoding='utf-8') as f:
        news = cleaning(f.read())

    words = tokenizer1(news)
    for _ in words:
        terms[_] += 1
    # Word n-grams must come from the real word sequence. Building them from
    # the de-duplicated keys of `terms` pairs words that were never adjacent.
    for _ in tokenizer2(words):
        terms[_] += 1
    for _ in tokenizer3(news):
        terms[_] += 1
    for _ in tokenizer4(news):
        terms[_] += 1

    return terms

def indexer(file):
    """Build the local lexicon of a single document.

    Every distinct term returned by get_tokens(file) is mapped to 1; the
    counts themselves are discarded.
    """
    return dict.fromkeys(get_tokens(file), 1)

def mergeLexicon(Lexicon, i, LocalLexicon, invertedIndex):
    """Merge one document's terms into the global linked-list inverted index.

    Parameters
    ----------
    Lexicon : dict
        Global lexicon mapping term -> position in ``invertedIndex`` of the
        term's most recent entry. Updated in place.
    i : int
        Number of the document being merged.
    LocalLexicon : dict
        Terms of document ``i``; only the keys are used.
    invertedIndex : list
        Entries ``(document number, next position)``; a next position of -1
        ends a term's chain. Extended in place.

    Returns
    -------
    tuple
        ``(Lexicon, invertedIndex)``, the same objects that were passed in.
    """
    for k in LocalLexicon:
        # Link the new entry to the term's previous head, or to -1 when the
        # term is seen for the first time.
        termInfo = (i, Lexicon.get(k, -1))
        # The new entry goes at the end, so its position is the current
        # length. Searching with list.index() scans the whole list and
        # returns the first *equal* tuple, which may belong to another term.
        Lexicon[k] = len(invertedIndex)
        invertedIndex.append(termInfo)

    return Lexicon, invertedIndex

def sortedLexicon(Lexicon, invertedIndex):
    """Flatten each term's linked chain into one contiguous run of documents.

    For every term, the chain starting at ``Lexicon[term]`` is followed
    through ``invertedIndex`` until a next position of -1 or lower is reached.
    The document numbers met on the way are appended to one flat list, and the
    term's lexicon entry is overwritten with ``(df, start)``: the length of
    its run and the offset where the run begins.

    Returns ``(Lexicon, flat list)``; ``Lexicon`` is modified in place.
    """
    postings = []

    for term in Lexicon:
        start = len(postings)
        cursor = Lexicon[term]

        while cursor > -1:
            entry = invertedIndex[cursor]
            postings.append(entry[0])  # document number
            cursor = entry[1]          # position of the next entry in the chain

        # df equals the number of entries just appended for this term.
        Lexicon[term] = (len(postings) - start, start)

    return Lexicon, postings
            
# Linked list (dynamic list)
# termInfo = (document number, next position)
# Lexicon[k] -> (position)(document number, next position) -> (position)(document number, next position: -1)

# [37:39]
# Lexicon : 'ABC' -> ? (line:36)
#                    0 (line:39)

#                    (i, -1) (line:37)
# invertedIndex : 0:(i, -1) (line:38)
# [32:35]
#                    (i, Lexicon['ABC']:0) (line:33)
# invertedIndex : 0:(i, -1)
#                 1:(i,  0) (line:34)
# Lexicon : 'ABC' -> 1 (line:35)

# Lexicon, invertedIndex, sortedIndex
# Lexicon[k]                  invertedIndex           sortedIndex
# "ABC" -> 190031           190031:(60, 50)               60
#                               50:(2, -1)                 2
#       -> (2, 0)
#          df, pos [0:df+1]
#                   60, 2

In [None]:
# Build the inverted index over every crawled document.
Collection = fileids()
Lexicon = dict()
invertedIndex = list()

for i, doc in enumerate(Collection): # the whole document collection
    localLexicon = indexer(doc) # per-document processing -> term: 1
    Lexicon, invertedIndex = mergeLexicon(Lexicon,
                i, localLexicon, invertedIndex) # merge the document (inverted index structure)

# NOTE: from here on `invertedIndex` is rebound to the flat list of document
# numbers returned by sortedLexicon, and each Lexicon value becomes (df, start).
Lexicon, invertedIndex = sortedLexicon(Lexicon, invertedIndex)

./naver/102-0003023007.txt
./naver/100-0003551166.txt
./naver/101-0004729825.txt
./naver/104-0002507337.txt
./naver/104-0003023003.txt
./naver/101-0003022999.txt
./naver/100-0004391547.txt
./naver/105-0001707186.txt
./naver/105-0002195462.txt
./naver/103-0003023010.txt
./naver/105-0000564412.txt
./naver/101-0003551211.txt
./naver/105-0001707231.txt
./naver/103-0001472759.txt
./naver/104-0003551212.txt
./naver/104-0003551206.txt
./naver/103-0000832054.txt
./naver/100-0011788427.txt
./naver/102-0011788086.txt
./naver/104-0000890782.txt
./naver/103-0004627628.txt
./naver/100-0004391479.txt


In [None]:
# Preview the first 10 terms: their (df, start) entry and the run of document
# numbers it points at, which spans start .. start + df.
for k in list(Lexicon)[:10]:
    termInfo = Lexicon[k]
    docList = invertedIndex[termInfo[1]:termInfo[1] + termInfo[0]]
    print(k, termInfo, docList, len(docList))

In [None]:
# Build the linked-list index that the lookup cell reads.
# indexer() handles ONE file and returns ONE dict, so it cannot take the whole
# file list or be unpacked into two values; the documents are iterated here.
#   voca[term]   -> position in `posting` of the term's most recent entry
#   posting[pos] -> (term, document number, next position); -1 ends the chain
# The lookup cell reads info[1] as the document and info[-1] as the next position.
voca, posting = dict(), list()
for docno, file in enumerate(fileids()):
    for term in indexer(file):
        posting.append((term, docno, voca.get(term, -1)))
        voca[term] = len(posting) - 1

In [None]:
# Expand the query into words, word n-grams, character n-grams, morphemes and nouns.
query = '대형 초밥 체인점 한국어 메뉴에만 냉수 180엔 부과재일 교포 A씨 트위터에 해당 사건 공유해 일파만파'
qterms = tokenizer1(query)
qterms = (qterms
          + tokenizer2(qterms)   # built from the word list only
          + tokenizer3(query)
          + tokenizer4(query)
          + tokenizer5(query))

In [None]:
# Look every query term up in the index and follow its chain of entries.
result = []
for t in qterms:
    hits = []  # info[1] of every entry on this term's chain
    if t in voca:
        print(t)
        pos = voca[t]
        while pos > -1:
            info = posting[pos]
            hits.append(info[1])
            pos = info[-1]  # last field holds the position of the next entry
    result.extend(hits)
    temp = [str(_) for _ in hits]
    print(','.join(temp))
    print()

In [None]:
# Distinct values gathered in `result` by the lookup loop. Going through a set
# removes duplicates, so the order of the displayed list is not guaranteed.
list(set(result))