# Chapter 8 - Information Retrieval
정보검색 - 정보접속

## 1 정보검색 소개
Introducing information retrieval

### 01 불용어 제거
Stop-Word

In [1]:
import nltk
from nltk.corpus import stopwords
# Preview the first ten entries of NLTK's English stop-word list.
stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your']

In [2]:
# Fraction of the given words that are not stop words
def not_stopwords(text):
    """Return the fraction of tokens in *text* that are not English stop words.

    Tokens are lower-cased before being compared with NLTK's English
    stop-word list.

    Args:
        text: A sized iterable of word strings (it must support ``len()``
            and iteration).

    Returns:
        A float between 0.0 and 1.0; ``0.0`` when *text* is empty.
    """
    total = len(text)
    if total == 0:
        # Avoid ZeroDivisionError on an empty token sequence.
        return 0.0
    # A set gives O(1) membership tests; the plain list would be scanned
    # linearly once per token.
    stop_set = set(nltk.corpus.stopwords.words('english'))
    content_count = sum(1 for w in text if w.lower() not in stop_set)
    return content_count / total

# Share of non-stop-word tokens across all words of the Reuters corpus.
not_stopwords(nltk.corpus.reuters.words())

0.735240435097661

In [3]:
import nltk ,string
from collections import Counter
from nltk.corpus import stopwords

def get_tokens():
    """Read ``./data/stopwords.txt`` and return its lower-cased NLTK tokens.

    NOTE(review): ``str.translate`` is handed ``string.punctuation`` itself as
    the lookup table, so no punctuation is stripped; instead code points 0-31
    are replaced by the punctuation character at that index (newline, code
    point 10, becomes ``'+'``).  Code that consumes these tokens may depend
    on that, so the call is kept exactly as it was.
    """
    with open('./data/stopwords.txt') as handle:
        raw_text = handle.read()
    remapped = raw_text.lower().translate(string.punctuation)
    return nltk.word_tokenize(remapped)

#if __name__ == "__main__":
# get_tokens() passes the text through translate(string.punctuation), which
# turns newlines into '+'; the first token is therefore expected to hold the
# file's entries joined by '+', and is split back apart here.
tokens = get_tokens()
tokens = tokens[0].split('+')
len(tokens)  # value is discarded; the count is printed below

print("tokens :", len(tokens), tokens[:10])

tokens : 63 ['', 'a', 'about', 'above', 'across', 'after', 'again', 'against', 'all', 'almost']


In [4]:
# Tally how often each entry of the custom stop-word list occurs and report
# the number of distinct entries.
count1 = Counter(tokens)
print("before: len(count1) ",len(count1))

before: len(count1)  62


In [5]:
# Keep only the entries that NLTK does not already list as English stop
# words.  The NLTK list is loaded once into a set instead of being re-read
# and scanned linearly for every token.
english_stopwords = set(stopwords.words('english'))
filtered1 = [w for w in tokens if w not in english_stopwords]
print("filtered1 :",filtered1[:5])

filtered1 : ['', 'across', 'almost', 'alone', 'along']


In [6]:
# Re-count using only the filtered entries and report how many distinct
# entries remain.
count1 = Counter(filtered1)
print("after: len(count1) :",len(count1))

after: len(count1) : 39


In [7]:
# Show the ten most frequent entries of the filtered list.
count1.most_common(10)

[('', 2),
 ('across', 1),
 ('almost', 1),
 ('alone', 1),
 ('along', 1),
 ('already', 1),
 ('also', 1),
 ('although', 1),
 ('always', 1),
 ('among', 1)]

In [8]:
# nltk.pos_tag(filtered1)

### 02 벡터 공간 모델을 사용한 정보 검색
TF-IDF: Term Frequency (용어 빈도) – Inverse Document Frequency (역문서 빈도)

https://gist.github.com/himzzz/4105717

In [9]:
# 1 문서의 토큰화
# 2 벡터 공간 모델의 계산
# 3 각 문서에 대한 TF-IDF를 계산

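$TF(t,d) = 0.5 + \frac{0.5 \cdot f(t,d)}{\max\{f(w,d) : w \in d\}}$

(This is the augmented term frequency. The `tf` function defined below uses the simpler form $TF(t,d) = f(t,d) / |d|$, where $|d|$ is the number of tokens in $d$.)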

Markdown math (TeX) notation reference:
https://en.wikipedia.org/wiki/Help:Displaying_a_formula#Formatting_using_TeX

In [10]:
# 문서에서 각 용어에 대한 TF-IDF를 계산하는 코드
import re, nltk, math
from nltk.tokenize import RegexpTokenizer
from nltk import bigrams, trigrams

# Stop words used to filter tokens in the cells below.  This rebinds the name
# `stopwords` to a plain list, shadowing the corpus reader imported earlier.
# NOTE(review): the Portuguese list is loaded although the sample documents
# further down are English — confirm this is intended.
stopwords = nltk.corpus.stopwords.words('portuguese')
# Raw string: "\w" inside a normal string literal is an invalid escape
# sequence.  A token is a run of word characters and/or the ’ apostrophe.
tokenizer = RegexpTokenizer(r"[\w’]+", flags=re.UNICODE)

In [11]:
def freq(word, doc):
    """Return how many times *word* occurs in the token sequence *doc*."""
    return doc.count(word)

def word_count(doc):
    """Return the number of tokens in *doc*."""
    return len(doc)

def tf(word, doc):
    """Return the term frequency of *word* in *doc*.

    Args:
        word: The token to look up.
        doc: A sequence of tokens supporting ``count()`` and ``len()``.

    Returns:
        The number of occurrences of *word* divided by the total number of
        tokens in *doc*, as a float; ``0.0`` when *doc* is empty.
    """
    total = word_count(doc)
    if total == 0:
        # An empty document has no terms; avoid ZeroDivisionError.
        return 0.0
    return freq(word, doc) / float(total)

In [12]:
def num_docs_containing(word, list_of_docs):
    """Return one plus the number of documents in which *word* occurs.

    A document counts when ``freq(word, document)`` is positive.  The extra 1
    is always added, so the result is never zero.
    """
    hits = sum(1 for document in list_of_docs if freq(word, document) > 0)
    return 1 + hits

In [13]:
def idf(word, list_of_docs):
    """Return the inverse document frequency of *word*.

    Computed as the natural logarithm of the number of documents divided by
    ``num_docs_containing(word, list_of_docs)``.
    """
    corpus_size = len(list_of_docs)
    containing = float(num_docs_containing(word, list_of_docs))
    return math.log(corpus_size / containing)

def tf_idf(word, doc, list_of_docs):
    """Return ``tf(word, doc)`` multiplied by ``idf(word, list_of_docs)``."""
    term_frequency = tf(word, doc)
    inverse_doc_frequency = idf(word, list_of_docs)
    return term_frequency * inverse_doc_frequency

In [14]:
# Compute the frequency of every term (unigrams, bigrams and trigrams) in
# each sample document.
vocabulary, all_tips = [], []
docs = {}
for tip in (['documment 1', 'documment 2']):
    raw_tokens = tokenizer.tokenize(tip)
    # The n-grams are built from the raw tokens, i.e. before the unigrams
    # are lower-cased, length-filtered and stop-word-filtered.
    bi_tokens = [' '.join(gram).lower() for gram in bigrams(raw_tokens)]
    tri_tokens = [' '.join(gram).lower() for gram in trigrams(raw_tokens)]
    tokens = [token.lower() for token in raw_tokens if len(token) > 2]
    tokens = [token for token in tokens if token not in stopwords]
    bi_tokens = [token for token in bi_tokens if token not in stopwords]
    tri_tokens = [token for token in tri_tokens if token not in stopwords]
    final_tokens = tokens + bi_tokens + tri_tokens
    # The token list is the same for every term, so it is stored once here
    # rather than being reassigned on each pass of the loop below.
    entry = {'freq': {}, 'tf': {}, 'idf': {},
             'tf-idf': {}, 'tokens': final_tokens}
    # Visit each distinct token once (first-seen order); repeating the work
    # for duplicates would only overwrite identical values.
    for token in dict.fromkeys(final_tokens):
        entry['freq'][token] = freq(token, final_tokens)  # raw count in this document
        entry['tf'][token] = tf(token, final_tokens)      # normalized term frequency
    docs[tip] = entry
    vocabulary.append(final_tokens)

In [15]:
# Fill in the IDF and TF-IDF score of every token of every document.
for doc, entry in docs.items():
    doc_tokens = entry['tokens']
    for token in entry['tf']:
        entry['idf'][token] = idf(token, vocabulary)  # inverse document frequency
        entry['tf-idf'][token] = tf_idf(token, doc_tokens, vocabulary)  # tf-idf score

In [16]:
# Record, for each token, its highest TF-IDF score across all documents,
# printing every document's name and per-token scores along the way.
words = {}
for doc, entry in docs.items():
    scores = entry['tf-idf']
    for token, score in scores.items():
        if token not in words or score > words[token]:
            words[token] = score
    print (doc)
    for token, score in scores.items():
        print (token, score)

documment 1
documment -0.20273255405408222
documment 1 0.0
documment 2
documment -0.20273255405408222
documment 2 0.0


In [17]:
# List the tokens from the highest to the lowest recorded TF-IDF score.
ranked = sorted(words.items(), key=lambda pair: pair[1], reverse=True)
for token, score in ranked:
    print ("%f <= %s" % (score, token))

0.000000 <= documment 1
0.000000 <= documment 2
-0.202733 <= documment


## 2 벡터 공간 스코링 및 질의 연산자의 상호작용
Vector space scoring

## 3 텍스트의 요약
Text summarization : 긴 텍스트의 요약을 생성한다

Luhn의 The Automatic Creation of Literature Abstracts (1958) : Naive Summarizer

In [18]:
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest
class Summarize_Frequency:
    """Text summarizer that works from word frequencies."""

    def __init__(self, cut_min=0.2, cut_max=0.8):
        """Store the frequency cut-offs and build the set of ignored tokens.

        Args:
            cut_min: Lower cut-off for normalized word frequencies.
            cut_max: Upper cut-off for normalized word frequencies.

        The ignored tokens are NLTK's English stop words plus every character
        of ``string.punctuation``.
        """
        self._cut_min = cut_min
        self._cut_max = cut_max
        ignored = set(stopwords.words('english'))
        ignored.update(punctuation)
        self._stopwords = ignored

In [19]:
    # Compute word frequencies. input: word_sent // output: freq, where freq[w] is the normalized frequency of word w
    def _compute_frequencies(self, word_sent): 
        freq = defaultdict(int)
        for s in word_sent:
            for word in s:
                if word not in self._stopwords:
                    freq[word] += 1
        # 빈도 정규화 및 필터링
        m = float(max(freq.values()))
        for w in freq.keys():
            freq[w] = freq[w]/m
            if freq[w] >= self._cut_max or freq[w] <= self._cut_min:
                del freq[w]
        return freq

In [20]:
    # Return a list of (up to) n sentences
    def summarize(self, text, n):
        """Return up to *n* sentences of *text*, highest-scoring first.

        The text is split into sentences, each sentence is lower-cased and
        tokenized, and a sentence's score is the sum of the stored
        frequencies of its words.  Sentences containing no scored word are
        never ranked, so fewer than *n* sentences may come back.  The
        frequency table is kept on ``self._freq``.

        Raises:
            AssertionError: If *n* exceeds the number of sentences.
        """
        sents = sent_tokenize(text)
        assert n <= len(sents)
        word_sent = [word_tokenize(sentence.lower()) for sentence in sents]
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for position, words in enumerate(word_sent):
            weights = [self._freq[w] for w in words if w in self._freq]
            if weights:
                ranking[position] = sum(weights)
        chosen = self._rank(ranking, n)
        return [sents[j] for j in chosen]

In [21]:
    # Return the n highest-ranked sentences (as sentence indices)
    def _rank(self, ranking, n):
        return nlargest(n, ranking, key=ranking.get)

## 4 질의 응답 시스템
Question-answering system

In [22]:
import nltk ,string
from nltk import *
# question = input()
# Hard-coded sample question text.
question = """tf–idf, short for term frequency–inverse document frequency is 
a numerical statistic that is intended"""
question = question.lower()
# Rebinds the name `stopwords` to a plain list of English stop words.
stopwords = nltk.corpus.stopwords.words('english')
cont = nltk.word_tokenize(question)
# The set difference removes stop words, but it also de-duplicates the tokens
# and discards their order; punctuation tokens are not removed.
analysis_keywords = list( set(cont) - set(stopwords) ) # tokens left after stop-word removal
analysis_keywords

['numerical',
 'short',
 ',',
 'intended',
 'statistic',
 'frequency–inverse',
 'frequency',
 'document',
 'tf–idf',
 'term']