1. Indexer
    - Crawler (Focused) => Repository(Collection)
    - Document Analyzer => HTML, Tokenizing, Normalizing
                           Stemming(BPE), N-gram, MA, POS
                           Stopwords, RE, Phrases
                        => Preprocessing
    
    - Features => Lexicon (1)
    - Document(Query) Representation => BoW
    - Document-Term Mat., Term-Document Mat.(핵심)
        => Inverted Document Indexing (역문헌구조)
        
<br>    
    
2. Relevance(Ranking)

    크게 통계 / 확률 방법으로 나뉜다 
    <br>
    우리는 통계 방법론중 하나인 빈도를 기준으로 할 예정
    <br>
    - weighting(TF-IDF), Similarity(Euclidean, Cosine:0-1)
    - Sorting (코사인은 0-1 사이기 때문에 크면 클수록 좋다)
    
<br>
    
3. Results
    - 끝 (Top K)

## Lexicon

In [1]:
from konlpy.corpus import kobill

def getLexicon1(docs=None):
    """Build the lexicon (unique whitespace-separated terms) using a list.

    This is the naive baseline for the timing comparison with
    ``getLexicon2``: each token is checked against the list built so far
    with a linear ``in`` scan.

    Args:
        docs: Optional iterable of document strings. When omitted, every
            file of the KoNLPy ``kobill`` corpus is read.

    Returns:
        list: Unique terms in order of first appearance.
    """
    if docs is None:
        docs = []
        for file_id in kobill.fileids():
            # Close each corpus file instead of leaking the handle.
            with kobill.open(file_id) as corpus_file:
                docs.append(corpus_file.read())

    lexicon = []
    for doc in docs:
        for term in doc.split():
            # Linear membership test kept on purpose: this is the slow
            # variant that the set-based version is measured against.
            if term not in lexicon:
                lexicon.append(term)
    return lexicon


def getLexicon2(docs=None):
    """Build the lexicon (unique whitespace-separated terms) using a set.

    Args:
        docs: Optional iterable of document strings. When omitted, every
            file of the KoNLPy ``kobill`` corpus is read.

    Returns:
        list: Unique terms. The order is arbitrary because the terms are
        deduplicated through a ``set``.
    """
    if docs is None:
        docs = []
        for file_id in kobill.fileids():
            # Close each corpus file instead of leaking the handle.
            with kobill.open(file_id) as corpus_file:
                docs.append(corpus_file.read())

    # Feed tokens straight into the set; no intermediate list of every
    # token in the corpus is needed.
    unique_terms = set()
    for doc in docs:
        unique_terms.update(doc.split())
    return list(unique_terms)
            
    
lexicon = getLexicon2() # the timeit runs below show that the set-based version is faster
            

In [2]:
%timeit getLexicon1()

130 ms ± 5.92 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [3]:
%timeit getLexicon2() # set

6.81 ms ± 390 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [4]:
#       1 2 3 ... 2638 => bag of words
    
# doc1  1 0 1 ... 1  => list(0 * 2638)   
# doc2
# ...
# doc10

    
def documentRepresentation1(docs=None, vocabulary=None):
    """Represent each document as a dense binary bag-of-words list.

    Every document becomes a list as long as the vocabulary, holding 1 at
    the position of each term that occurs in the document and 0 elsewhere.

    Args:
        docs: Optional iterable of document strings. When omitted, every
            file of the KoNLPy ``kobill`` corpus is read.
        vocabulary: Optional list of terms that defines the positions.
            When omitted, the module-level ``lexicon`` is used.

    Returns:
        list: One list of 0/1 flags per document, in input order.

    Raises:
        ValueError: If a document contains a term missing from the
            vocabulary (raised by ``list.index``).
    """
    if vocabulary is None:
        vocabulary = lexicon
    if docs is None:
        docs = []
        for file_id in kobill.fileids():
            # Close each corpus file instead of leaking the handle.
            with kobill.open(file_id) as corpus_file:
                docs.append(corpus_file.read())

    width = len(vocabulary)
    documentList = []
    for doc in docs:
        bow = [0] * width
        for term in doc.split():
            # list.index is a linear scan; it is kept because the later
            # cells time this variant against the dict-based ones.
            bow[vocabulary.index(term)] = 1
        documentList.append(bow)
    return documentList

In [5]:
%timeit documentRepresentation1()

310 ms ± 29.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
def documentRepresentationByDict(docs=None, vocabulary=None):
    """Represent each document as a sparse binary bag-of-words dict.

    Instead of a dense list, only the terms that occur are stored, as
    ``{vocabulary position: 1}``; the zero entries are dropped, which
    saves space.

    Args:
        docs: Optional iterable of document strings. When omitted, every
            file of the KoNLPy ``kobill`` corpus is read.
        vocabulary: Optional list of terms that defines the positions.
            When omitted, the module-level ``lexicon`` is used.

    Returns:
        list: One ``dict`` per document, in input order.

    Raises:
        ValueError: If a document contains a term missing from the
            vocabulary (raised by ``list.index``).
    """
    if vocabulary is None:
        vocabulary = lexicon
    if docs is None:
        docs = []
        for file_id in kobill.fileids():
            # Close each corpus file instead of leaking the handle.
            with kobill.open(file_id) as corpus_file:
                docs.append(corpus_file.read())

    documentList = []
    for doc in docs:
        bow = {}
        for term in doc.split():
            # Still keyed by vocabulary position, so the linear
            # list.index lookup remains part of what is being timed.
            bow[vocabulary.index(term)] = 1
        documentList.append(bow)
    return documentList

In [7]:
%timeit documentRepresentationByDict()

299 ms ± 29.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
def documentRepresentationByDict1(docs=None):
    """Represent each document as a sparse binary dict keyed by term.

    The term itself is the key, so no lookup in the lexicon is needed.

    Args:
        docs: Optional iterable of document strings. When omitted, every
            file of the KoNLPy ``kobill`` corpus is read.

    Returns:
        list: One ``{term: 1}`` dict per document, in input order.
    """
    if docs is None:
        docs = []
        for file_id in kobill.fileids():
            # Close each corpus file instead of leaking the handle.
            with kobill.open(file_id) as corpus_file:
                docs.append(corpus_file.read())

    # dict.fromkeys marks every distinct term with 1 in one call.
    return [dict.fromkeys(doc.split(), 1) for doc in docs]

In [9]:
%timeit documentRepresentationByDict1()

5.46 ms ± 252 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [10]:
# A plain dict raises KeyError for a missing key, so a term seen for the
# first time has to start from an explicit default of 0.
def documentRepresentation2(docs=None):
    """Represent each document as a dict of term frequencies.

    Args:
        docs: Optional iterable of document strings. When omitted, every
            file of the KoNLPy ``kobill`` corpus is read.

    Returns:
        list: One ``{term: count}`` dict per document, in input order.
    """
    if docs is None:
        docs = []
        for file_id in kobill.fileids():
            # Close each corpus file instead of leaking the handle.
            with kobill.open(file_id) as corpus_file:
                docs.append(corpus_file.read())

    documentList = []
    for doc in docs:
        bow = {}
        for term in doc.split():
            # get() supplies 0 for unseen terms in a single lookup.
            bow[term] = bow.get(term, 0) + 1
        documentList.append(bow)
    return documentList

In [11]:
%timeit documentRepresentation2()


6.41 ms ± 557 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [12]:
from collections import defaultdict
                                
def documentRepresentationByDefaultDict(docs=None):
    """Represent each document as a ``defaultdict`` of term frequencies.

    Args:
        docs: Optional iterable of document strings. When omitted, every
            file of the KoNLPy ``kobill`` corpus is read.

    Returns:
        list: One ``defaultdict(int)`` mapping term to count per document,
        in input order.
    """
    if docs is None:
        docs = []
        for file_id in kobill.fileids():
            # Close each corpus file instead of leaking the handle.
            with kobill.open(file_id) as corpus_file:
                docs.append(corpus_file.read())

    documentList = []
    for doc in docs:
        # defaultdict needs a factory (here int, which yields 0); unlike a
        # plain dict, touching a missing key does not raise KeyError.
        bow = defaultdict(int)
        for term in doc.split():
            bow[term] += 1
        documentList.append(bow)
    return documentList

In [13]:
%timeit documentRepresentationByDefaultDict()

6.54 ms ± 747 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:
# Also keeps the document id (file name) that each term count belongs to.
def documentRepresentationByDefaultDict1(docs=None):
    """Count term frequencies per document, keyed by document id.

    The outer keys of the result form the collection (the document ids);
    the union of the inner keys over all documents is the lexicon.

    Args:
        docs: Optional mapping of document id to document text. When
            omitted, every file of the KoNLPy ``kobill`` corpus is read and
            the file id is used as the document id.

    Returns:
        defaultdict: ``{doc id: defaultdict(int) of term -> count}``. A
        document without any term gets no entry.
    """
    if docs is None:
        docs = {}
        for file_id in kobill.fileids():
            # Close each corpus file instead of leaking the handle.
            with kobill.open(file_id) as corpus_file:
                docs[file_id] = corpus_file.read()

    documentList = defaultdict(lambda: defaultdict(int))
    for doc_id, text in docs.items():
        for term in text.split():
            # The nested defaultdict creates the per-document counter on
            # first use, so no explicit initialisation is needed.
            documentList[doc_id][term] += 1
    return documentList

In [15]:
%timeit documentRepresentationByDefaultDict1()

6.49 ms ± 413 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [16]:
# Per-document term counts keyed by corpus file id; read by the Boolean
# search functions defined below.
docList = documentRepresentationByDefaultDict1()

In [54]:
### Boolean search => set theory
# Whitespace-separated query terms; the search functions split this string
# and intersect the documents matched by each term.
query = "국회 의원 국민"

def booleanResult(query_text=None, documents=None):
    """Boolean AND search by scanning every document for every query term.

    Args:
        query_text: Optional whitespace-separated query string. When
            omitted, the module-level ``query`` is used.
        documents: Optional mapping of document id to a mapping of
            term -> count. When omitted, the module-level ``docList`` is
            used.

    Returns:
        list: Ids of the documents that contain every query term. The
        order is unspecified when the query has more than one term (the
        intersection goes through a ``set``). An empty query returns an
        empty list.
    """
    if query_text is None:
        query_text = query
    if documents is None:
        documents = docList

    postings = []
    for term in query_text.split():
        # Full scan over the collection for each term (no inverted index).
        postings.append(
            [doc_id for doc_id, term_counts in documents.items()
             if term in term_counts]
        )

    # Without this guard an empty query would pop from an empty list.
    if not postings:
        return []

    matched = postings.pop()
    while postings:
        matched = list(set(matched).intersection(postings.pop()))
    return matched

In [21]:
%timeit booleanResult()

10.4 µs ± 565 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [70]:
def booleanResult2(query_text=None, documents=None):
    """Boolean AND search that builds a term-document mapping on each call.

    The document -> term counts are first inverted into
    term -> document -> frequency (the TDM), then the document ids of the
    query terms are intersected.

    Args:
        query_text: Optional whitespace-separated query string. When
            omitted, the module-level ``query`` is used.
        documents: Optional mapping of document id to a mapping of
            term -> count. When omitted, the module-level ``docList`` is
            used.

    Returns:
        list: Ids of the documents that contain every query term. The
        order is unspecified when the query has more than one term. An
        empty query returns an empty list.
    """
    if query_text is None:
        query_text = query
    if documents is None:
        documents = docList

    term_index = defaultdict(lambda: defaultdict(int))
    for doc_id, term_counts in documents.items():
        for term, freq in term_counts.items():
            term_index[term][doc_id] = freq

    # get() avoids creating index entries for terms that never occurred.
    postings = [list(term_index.get(term, ())) for term in query_text.split()]

    # Without this guard an empty query would pop from an empty list.
    if not postings:
        return []

    matched = postings.pop()
    while postings:
        matched = list(set(matched).intersection(postings.pop()))
    return matched

In [55]:
# Invert docList (doc id -> term -> freq) into a term-document mapping
# (term -> doc id -> freq). It is built once at module level so that the
# search function below does not rebuild it on every call.
TDM = defaultdict(lambda:defaultdict(int))
for idx, termList in docList.items():
    for term, freq in termList.items():
        TDM[term][idx] = freq
        
def booleanResult1(query_text=None, index=None):
    """Boolean AND search against a prebuilt term-document mapping.

    Args:
        query_text: Optional whitespace-separated query string. When
            omitted, the module-level ``query`` is used.
        index: Optional mapping of term to a mapping of
            document id -> frequency. When omitted, the module-level
            ``TDM`` is used.

    Returns:
        list: Ids of the documents that contain every query term. The
        order is unspecified when the query has more than one term (the
        intersection goes through a ``set``). An empty query returns an
        empty list.
    """
    if query_text is None:
        query_text = query
    if index is None:
        index = TDM

    # Use get() rather than index[term]: subscripting a defaultdict inserts
    # an empty entry for every unseen term, so searching would silently
    # grow the index.
    postings = [list(index.get(term, ())) for term in query_text.split()]

    # Without this guard an empty query would pop from an empty list.
    if not postings:
        return []

    matched = postings.pop()
    while postings:
        matched = list(set(matched).intersection(postings.pop()))
    return matched

In [56]:
%timeit booleanResult1() # TDM이 더 빠른걸 확인할 수 있다.

4.02 µs ± 447 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
