# Intro to Information Retrieval

## some examples

### the most basic info retrieval algorithm

In [3]:
# Flat list of animal keywords to search through; several keywords appear more than once.
docs_animals = ['cat', 'dog', 'zebra', 'monkey', 'cow', 'dog', 'zebra', 'cat', 'cat']

In [2]:
def quick_search(doc, query):
    """Return every entry of ``doc`` that is equal to ``query``.

    Args:
        doc: Iterable of keywords to scan.
        query: Keyword to look for.

    Returns:
        A list with one entry per match, in the order the matches occur
        in ``doc``. The list is empty when nothing matches.
    """
    hits = []
    for entry in doc:
        # Plain equality: no partial or case-insensitive matching.
        if entry == query:
            hits.append(entry)
    return hits

In [4]:
print(quick_search(docs_animals, query='cat'))

['cat', 'cat', 'cat']


In [5]:
print(quick_search(docs_animals, query='dog'))

['dog', 'dog']


## What if our documents contain more than one word?

In [6]:
# Small corpus: each inner list is one document, given as its list of words.
# A document is identified by its position in the outer list.
documents = [
    ['word1', 'word2', 'word3'],  # position 0
    ['word1', 'word2'],  # position 1
    ['word1', 'word3'],  # position 2
    ['word1']  # position 3
]

In [7]:
def quick_search_multi_words(docs, query):
    """Return the documents that contain ``query``.

    Args:
        docs: Iterable of documents; each document must support the
            ``in`` membership test (e.g. a list of words).
        query: Word to look for.

    Returns:
        A list of the matching documents, in their original order.
    """
    matching = []
    for candidate in docs:
        # Linear scan: every document is checked for membership.
        if query in candidate:
            matching.append(candidate)
    return matching

In [8]:
print(quick_search_multi_words(documents, query='word1'))

[['word1', 'word2', 'word3'], ['word1', 'word2'], ['word1', 'word3'], ['word1']]


In [9]:
print(quick_search_multi_words(documents, query='word3'))

[['word1', 'word2', 'word3'], ['word1', 'word3']]


## Inverted Index

In [10]:
# Hand-built inverted index for `documents`: each word maps to the
# 0-based positions of the documents that contain it.
inverted_index = {
    'word1': [0, 1, 2, 3],
    'word2': [0, 1],
    'word3': [0, 2]
}

In [11]:
print(inverted_index['word1'])

[0, 1, 2, 3]


In [13]:
print(inverted_index['word3'])

[0, 2]


In [14]:
def indexed_search(docs, index, query):
    """Look up ``query`` in an inverted index and return the matching documents.

    Args:
        docs: Sequence of documents, addressed by integer position.
        index: Mapping from a term to the positions in ``docs`` of the
            documents that contain the term.
        query: Term to look up.

    Returns:
        The documents listed for ``query``, in the order the index lists
        them. A term that is not in ``index`` matches nothing and yields
        an empty list instead of raising ``KeyError``.
    """
    try:
        posting_list = index[query]
    except KeyError:
        # Unknown term: a search miss, not an error.
        return []
    return [docs[doc_id] for doc_id in posting_list]

In [15]:
print(indexed_search(docs=documents, index=inverted_index, query='word2'))

[['word1', 'word2', 'word3'], ['word1', 'word2']]


## Query Processing

In [18]:
print(inverted_index)
print(documents)

{'word1': [0, 1, 2, 3], 'word2': [0, 1], 'word3': [0, 2]}
[['word1', 'word2', 'word3'], ['word1', 'word2'], ['word1', 'word3'], ['word1']]


In [20]:
def and_or_search(docs, index, queries, mode):
    """Combine the posting lists of several terms with boolean AND / OR.

    Args:
        docs: Sequence of documents, addressed by integer position.
        index: Mapping from a term to the positions in ``docs`` of the
            documents that contain the term.
        queries: Terms to combine.
        mode: ``'and'`` keeps documents containing every term;
            ``'or'`` keeps documents containing at least one term.

    Returns:
        The matching documents, ordered by ascending position in ``docs``.
        An empty ``queries`` yields an empty list. A term that is not in
        ``index`` is treated as having an empty posting list.

    Raises:
        ValueError: If ``mode`` is neither ``'and'`` nor ``'or'``.
    """
    # Validate once up front; otherwise a typo in `mode` would silently
    # return the first term's documents.
    if mode not in ('and', 'or'):
        raise ValueError("mode must be 'and' or 'or', got {!r}".format(mode))

    # Unknown terms contribute an empty set instead of raising KeyError.
    posting_sets = [
        set(index[term]) if term in index else set()
        for term in queries
    ]
    if not posting_sets:
        return []

    combine = set.intersection if mode == 'and' else set.union
    doc_ids = combine(*posting_sets)

    # Sets are unordered; sort so the result order is well defined.
    return [docs[doc_id] for doc_id in sorted(doc_ids)]

In [21]:
print(and_or_search(documents, inverted_index, ['word1', 'word3'], mode='and'))

[['word1', 'word2', 'word3'], ['word1', 'word3']]


In [22]:
print(and_or_search(documents, inverted_index, ['word1', 'word3'], mode='or'))

[['word1', 'word2', 'word3'], ['word1', 'word2'], ['word1', 'word3'], ['word1']]


# Indexing

### collect documents:

In [24]:
# Raw text of the example document that the indexing steps below work on.
document = "hey there, this is a text to work with. is he okay?"

### tokenize:

In [25]:
# Tokens of `document`, written out by hand: punctuation is dropped and
# the repeated 'is' is kept, so duplicates are still present here.
tokens = ['hey', 'there', 'this', 'is', 'a', 'text', 'to', 'work', 'with', 'is', 'he', 'okay']

In [27]:
# Types are the distinct tokens. set() removes the duplicates but has no
# defined order, so the listing shown can differ from one run to the next.
types = list(set(tokens))
types

['a', 'he', 'work', 'there', 'is', 'text', 'with', 'okay', 'hey', 'this', 'to']

### normalize:

In [28]:
# Terms kept after normalization, chosen by hand: a subset of the types above.
terms = ['he', 'work', 'text', 'okay', 'hey', 'this']

### index:

In [31]:
# Hand-built index over `terms`: every term maps to [0] -- presumably the
# position of the single example document (only one document exists here).
index = {
    'he':[0],
    'work':[0],
    'text':[0],
    'okay':[0],
    'hey':[0],
    'this':[0]
}

## Tokenization:

In [33]:
document

'hey there, this is a text to work with. is he okay?'

In [34]:
document.split()

['hey', 'there,', 'this', 'is', 'a', 'text', 'to', 'work', 'with.', 'is', 'he', 'okay?']

## Normalization: