![image](https://user-images.githubusercontent.com/40786348/58559511-3c32a700-825e-11e9-9d04-a37c35f5f029.png)

## preprocessing

In [290]:
from os import listdir
def fileids(path, ext='txt'):
    """Return the paths of the files in *path* whose name ends in ``.<ext>``.

    Args:
        path: Directory to scan. A trailing separator is optional.
        ext: Extension to keep, given without the leading dot.

    Returns:
        A list of ``path`` joined with every matching file name, sorted by
        file name.
    """
    from os.path import join

    suffix = '.' + ext
    # listdir() returns names in arbitrary order; sorting keeps positional
    # indexing into the result stable from one call to the next.
    return [join(path, name) for name in sorted(listdir(path))
            if name.endswith(suffix)]

In [291]:
def ngram(term, n=2):
    """Return every contiguous length-*n* slice of *term*, left to right.

    A *term* shorter than *n* yields an empty list.
    """
    window_count = len(term) - n + 1
    grams = []
    for start in range(window_count):
        grams.append(term[start:start + n])
    return grams

In [292]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from konlpy.tag import Komoran

ma = Komoran()

# Walk every text file under ./data/ and derive each tokenization variant.
# The names are rebound on every iteration, so only the last file's results
# remain after the loop.
for file in fileids('./data/'):
    with open(file, encoding='utf-8') as fp:
        content = fp.read()

    # Raw whitespace-delimited tokens
    tokens1 = content.split()

    # Tokens with punctuation split off by NLTK
    tokens2 = word_tokenize(content)

    # (morpheme, POS tag) pairs from Komoran, one call per NLTK token
    tokens3 = []
    for word in tokens2:
        tokens3.extend(ma.pos(word))

    # Morphemes only
    tokens4 = [pair[0] for pair in tokens3]

    # Morphemes whose tag starts with 'N' (nouns)
    tokens5 = [pair[0] for pair in tokens3 if pair[1].startswith('N')]

    # Character bigrams of every morpheme
    tokens6 = []
    for morph in tokens4:
        tokens6.extend(ngram(morph))

In [35]:
def filecontent(file):
    """Read *file* as UTF-8 text and return its entire contents as one string."""
    with open(file, encoding='utf-8') as handle:
        return handle.read()

In [293]:
from string import punctuation
import re

# Registry of the compiled cleaning patterns, keyed by what each one removes.
pattern = dict()

# Punctuation: any single character from string.punctuation
pattern1 = re.compile(r'[{0}]'.format(re.escape(punctuation)))
pattern['punc'] = pattern1

# Stopwords: runs of 7 or more ASCII letters/digits
pattern2 = re.compile(r'[A-Za-z0-9]{7,}')
pattern['stop'] = pattern2

# E-mail addresses. The dot is escaped: a bare '.' matches any character
# (including a space), which let the match run on into the following words.
pattern3 = re.compile(r'\w{2,}@(\.?\w{2,})+')
pattern['email'] = pattern3

# Domain names: at least two labels joined by literal dots. With an optional,
# unescaped dot any ordinary word of four or more characters also matched.
pattern4 = re.compile(r'\w{2,}(\.\w{2,})+')
pattern['url'] = pattern4

# Anything that is neither a Hangul syllable nor a digit
pattern5 = re.compile(r'[^가-힣0-9]+')
pattern['nonkorean'] = pattern5

# Runs of two or more whitespace characters
pattern6 = re.compile(r"\s{2,}")
pattern['whitespace'] = pattern6

In [294]:
# Apply the six cleaning patterns to the corpus in numeric order, replacing
# every match with a single space.
# NOTE(review): pattern1 strips punctuation (which includes '@' and '.')
# before pattern3/pattern4 run — confirm this order is intended.
for cleaner in (pattern1, pattern2, pattern3, pattern4, pattern5, pattern6):
    corpus = cleaner.sub(' ', corpus)

## Collection 생성

In [295]:
# Total number of articles found under the data path
len(fileids('./data/'))

375

In [296]:
# Build the collection: document file name -> list of character bigrams.
# A plain dict is used: defaultdict() without a factory behaves like a dict
# anyway, and this keeps the cell independent of a later import.
collection = {}

# List the directory once instead of re-listing it on every iteration.
for doc_path in fileids('./data/'):
    doc_id = doc_path.split('/')[-1]
    doc_content = filecontent(doc_path)

    # NOTE(review): 'punc' runs first and string.punctuation contains '@',
    # so the 'email' pattern has no '@' left to match — confirm the order.
    for key in ('punc', 'stop', 'email'):
        doc_content = pattern[key].sub(' ', doc_content)

    tokens = doc_content.split()  # raw whitespace-delimited tokens
    tokens = [gram for token in tokens for gram in ngram(token)]

    collection[doc_id] = tokens

## DTM, TDM 생성

In [297]:
from collections import defaultdict
from math import log2
from math import sqrt
from konlpy.corpus import kobill
from konlpy.tag import Komoran

# DTM (document-term matrix): document name -> {term: raw frequency}
DTM = defaultdict(lambda: defaultdict(int))
for doc_name, doc_tokens in collection.items():
    for token in doc_tokens:
        DTM[doc_name][token] += 1

# TDM (term-document matrix): the same counts, keyed by term first
TDM = defaultdict(lambda: defaultdict(int))
for doc_name, term_counts in DTM.items():
    for token, count in term_counts.items():
        TDM[token][doc_name] = count

# TWM: term -> {document name: TF-IDF weight}
# DVL: document name -> Euclidean length of its weight vector
TWM = defaultdict(lambda: defaultdict(float))
DVL = defaultdict(float)

N = len(DTM)
for doc_name, term_counts in DTM.items():
    peak = max(term_counts.values())
    for token, freq in term_counts.items():
        # TF is normalized by the document's most frequent term;
        # IDF is log2(N / number of documents containing the term).
        weight = (freq / peak) * log2(N / len(TDM[token]))
        TWM[token][doc_name] = weight
        DVL[doc_name] += weight ** 2

# Turn the accumulated sums of squares into vector lengths
for doc_name in DVL:
    DVL[doc_name] = sqrt(DVL[doc_name])

## query 

In [298]:
query = '서울시에 거래되는 아파트 전세값은?.'

TQM = defaultdict(int)    # query term -> raw frequency
QWM = defaultdict(float)  # query term -> TF-IDF weight

# Preprocess with bigrams only (documents and query must go through the
# same preprocessing).
for word in query.split():
    for token in ngram(word):
        TQM[token] += 1

alpha = 0.5
maxTF = max(TQM.values())
# The loop variable must be `freq`: a misspelled name here makes the TF line
# read whatever `freq` an earlier cell left behind.
for term, freq in TQM.items():
    # Smoothed TF: alpha + (1 - alpha) * normalized frequency
    TF = alpha + (1 - alpha) * (freq / maxTF)
    # .get() avoids inserting unseen query terms into the TWM defaultdict;
    # a term found in no document falls back to DF = 1.
    DF = len(TWM.get(term, ())) or 1
    IDF = log2(N / DF)
    QWM[term] = TF * IDF

In [299]:
# Score candidate documents against the query, sort them, and print the
# top results.
candidateList = defaultdict(float)
for term, weight1 in QWM.items():
    # .get() so query terms unknown to the index are not inserted into TWM
    for doc, weight2 in TWM.get(term, {}).items():
        candidateList[doc] += weight1 * weight2

# Normalize each inner product by the document vector length
for doc, sim in candidateList.items():
    norm = DVL[doc]
    # A zero-length document vector would raise ZeroDivisionError
    candidateList[doc] = sim / norm if norm else 0.0

K = 5
for doc, sim in sorted(candidateList.items(), key=lambda x: x[1], reverse=True)[:K]:
    print('문서이름:{0} / 유사도:{1:.4f}'.format(doc, sim))
    # Only the first 40 characters are used as a preview; the context
    # manager closes the file handle.
    with open('./data/' + doc, encoding='utf-8') as fp:
        preview = fp.read(40)
    print(sent_tokenize(preview))
    print()

문서이름:경제_0509n0160.txt / 유사도:1.0429
['1·2기 신도시 주민들 집단반발얼마 전까지 윗마을로 사람들이 빠져나가더니']

문서이름:사회_20190510221352333.txt / 유사도:0.9118
['대한애국당 오후 7시께 서울시와 경찰 저지에도 천막 설치 강행']

문서이름:경제_0509n1656.txt / 유사도:0.8892
['창릉 신도시 발표에 일산파주 주민들 분노의 소리 커져온라인 카페·김현미']

문서이름:사회_0428n0761.txt / 유사도:0.7951
['【진주뉴시스】 차용현 기자  지난 17일 경남 진주시 한 아파트에서 방화']

문서이름:경제_0510n0295.txt / 유사도:0.6794
['고양 창릉 지정으로 일산 반발 커삼송 개발로 타격 받은 적 있어지난해 정']



In [304]:
# Euclidean distance between every document vector and the query vector.
candidateList = defaultdict(float)
for term, docList in TWM.items():
    # .get() rather than QWM[term]: indexing the defaultdict would insert
    # every vocabulary term into QWM.
    weight2 = QWM.get(term, 0.0)
    for doc, weight1 in docList.items():
        candidateList[doc] += (weight1 - weight2) ** 2

# A query term missing from a document counts as weight 0 on the document
# side, so it still contributes (0 - weight2)^2 to that document's distance.
for term, weight2 in QWM.items():
    if not weight2:
        continue
    holders = TWM.get(term, {})
    for doc in DTM:
        if doc not in holders:
            candidateList[doc] += weight2 ** 2

for doc, sim in candidateList.items():
    candidateList[doc] = sqrt(sim)

# Per document: whitespace-token count of the raw file, number of distinct
# indexed terms, and total number of indexed terms.
for doc in DTM:
    with open('./data/' + doc, encoding='utf-8') as fp:
        raw_tokens = fp.read().split()
    print(doc, len(raw_tokens), len(DTM[doc]), sum(DTM[doc].values()))
candidateList

In [303]:
from nltk.tokenize import sent_tokenize

K = 5
# Print the K documents with the smallest distance, each followed by its
# first three sentences.
for doc, sim in sorted(candidateList.items(), key=lambda x: x[1])[:K]:
    print('문서이름:{0} / 거리:{1:.4f}'.format(doc, sim))
    # The context manager closes the file handle
    with open('./data/' + doc, encoding='utf-8') as fp:
        text = fp.read()
    print(sent_tokenize(text)[:3])
    print()