In [9]:
%pip install --no-cache-dir --force-reinstall https://dm.cs.tu-dortmund.de/nats/nats25_01_02_information_retrieval-0.1-py3-none-any.whl
import nats25_01_02_information_retrieval

Collecting nats25-01-02-information-retrieval==0.1
  Downloading https://dm.cs.tu-dortmund.de/nats/nats25_01_02_information_retrieval-0.1-py3-none-any.whl (3.6 kB)
Installing collected packages: nats25-01-02-information-retrieval
  Attempting uninstall: nats25-01-02-information-retrieval
    Found existing installation: nats25_01_02_information_retrieval 0.1
    Uninstalling nats25_01_02_information_retrieval-0.1:
      Successfully uninstalled nats25_01_02_information_retrieval-0.1
Successfully installed nats25-01-02-information-retrieval-0.1
Note: you may need to restart the kernel to use updated packages.


# Foundations
## Information Retrieval

This week, we will learn some basics of information retrieval, and build a simple search engine.

### Hamlet sentences

We want to build a full text search index for Hamlet in this assignment.

First load the Hamlet data from the previous assignment, and split it into sentences. Beware of the particular structure of this document, which not only separates sentences with a dot.

Then tokenize the sentences as in the previous assignment, such that each sentence is a sequence of *words* (no punctuation tokens, lowercase). Do *not* remove stopwords. Do not use a library, write the code yourself.

In [None]:
      
import re, urllib

# Read the entire file:
file_path, _ = urllib.request.urlretrieve("https://dm.cs.tu-dortmund.de/nats/data/hamlet.txt")
with open(file_path, "rt") as file:
    full = file.read()

raw_sentences = []
sentences = [] # Store your output in this list

raw_sentences = re.split(pattern='[.,:!;?]\s', string=full)

for sentence in raw_sentences:
    sentence = sentence.lower()
    cleaned = re.sub("[^a-z0-9']", ' ', sentence)
    cleaned = re.sub('\s+', ' ', cleaned).strip() 
    if cleaned:  
        sentences.append(cleaned)
        
print(sentences)
print(f"Hamlet contains {len(sentences)} sentences, {sum(len(s.split()) for s in sentences)} tokens.")

    

Hamlet contains 6116 sentences, 29698 tokens.


Find the longest sentence (as an array of tokens) and print it

In [95]:
longest = [] # store the answer here, as array
for sentence in sentences:
    if len(sentence) > len(longest):
        longest=sentence
print(longest)
print("Length of longest sentence:", len(longest))
print(*longest)

will they not say afterwards if they should grow themselues to common players as it is most like if their meanes are not better their writers do them wrong
Length of longest sentence: 155
w i l l   t h e y   n o t   s a y   a f t e r w a r d s   i f   t h e y   s h o u l d   g r o w   t h e m s e l u e s   t o   c o m m o n   p l a y e r s   a s   i t   i s   m o s t   l i k e   i f   t h e i r   m e a n e s   a r e   n o t   b e t t e r   t h e i r   w r i t e r s   d o   t h e m   w r o n g


In [103]:
nats25_01_02_information_retrieval.hidden_tests_6_0(longest)

Count how many sentences have exactly one token. Why are there so many? Find the 10 most frequent one-word sentences.

In [107]:
singletons = 0 # Store your answer in this variable
for sentence in sentences: 
    if len(sentence.split()) == 1:
        singletons += 1

print(f"There are {singletons} sentences with just one word.")

most_common = [] # Store the 10 most common one-word sentences and their counts
#order the sentences by their frequency
one_word_sentences = []
for sentence in sentences:
    if len(sentence.split()) == 1:
        one_word_sentences.append(sentence)

from collections import Counter
word_counts = Counter(one_word_sentences)
most_common = word_counts.most_common(10)


for word, count in most_common:
    print(word, count, sep="\t")

There are 803 sentences with just one word.
ham	84
hor	36
king	27
ophe	23
exeunt	22
no	20
qu	20
come	19
rosin	19
clo	19


In [108]:
nats25_01_02_information_retrieval.hidden_tests_9_0(singletons, most_common)

AssertionError: Encountered 3 errors:
There should be more one-word sentences.
The most common should cover more.
'>' not supported between instances of 'str' and 'int'

## Build an inverted index

For full-text search, we need an inverted index. Build a lookup table that allows us to find all sentence numbers that contain a particular word. Do not include multiple occurrences.

In [162]:
from collections import defaultdict
import string
index = defaultdict(list) 
translator = str.maketrans('', '', string.punctuation)

for i, sentence in enumerate(sentences):
    cleaned_sentence = sentence.translate(translator).lower()
    
    words_in_current_sentence = cleaned_sentence.split()
    
    unique_words_for_indexing = set(words_in_current_sentence)
    
    for word in unique_words_for_indexing:
        if word:
            index[word].append(i)

unique_words_list = list(index.keys())

print(index.values()) 
print(f"Sample entries:")
for word in unique_words_list[:5]:
    print(f"'{word}': {index[word][:10]}...")
print(f"The index contains {len(index)} words and {sum([len(x) for x in index.values()])} occurrences")

dict_values([[0, 91, 122, 141, 142, 146, 147, 152, 155, 166, 167, 244, 287, 292, 479, 571, 572, 762, 969, 1000, 1078, 1127, 1180, 1228, 1236, 1262, 1299, 1306, 1326, 1376, 1474, 1475, 1520, 1592, 1605, 1740, 1782, 1783, 1822, 1888, 2024, 2059, 2060, 2061, 2062, 2106, 2132, 2260, 2278, 2529, 2539, 2568, 2571, 2604, 2609, 2648, 2651, 3058, 3148, 3469, 3544, 3546, 3550, 3656, 3766, 3923, 4035, 4175, 4221, 4255, 4309, 4320, 4369, 4418, 4419, 4425, 4428, 4544, 4590, 4591, 4671, 4762, 4771, 4778, 4813, 4841, 4893, 4950, 5232, 5234, 5598, 5621, 5691, 5806, 5822, 5877, 5900, 5982, 6008, 6077], [0, 8, 24, 33, 57, 72, 74, 77, 79, 80, 83, 90, 100, 110, 111, 114, 117, 125, 129, 132, 134, 138, 142, 150, 151, 153, 155, 161, 168, 169, 170, 171, 189, 205, 206, 211, 215, 216, 225, 227, 230, 232, 237, 241, 242, 257, 264, 290, 292, 297, 303, 304, 309, 310, 320, 323, 328, 329, 330, 354, 360, 376, 377, 387, 388, 395, 407, 411, 416, 421, 422, 429, 443, 444, 445, 455, 461, 476, 496, 512, 538, 557, 558, 561, 

# Excursus: Generators in Python

Python has a (rather uncommon) powerful feature called [*generators*](https://wiki.python.org/moin/Generators).

- When writing generators, they are like functions that can "return" multiple values (using `yield`), and will be paused inbetween
- When consuming generators, they behave essentially like an iterator
- Generators are *lazy*: they do *not* produce a list of all their output, but always one item when necessary
- Generators *could* produce an infinite stream of values

In the following assignments, please use generators for efficiency. Here is a simple example how generators work:

In [None]:
def upto(x):
    i = 0
    while i <= x:
        print("gen: generating", i)
        yield i # Return value and pause!
        print("gen: continuing")
        i += 1

print("Use generator in for loop:")
for j in upto(2):
    print("use: generated:", j)
    print("use: next")

print("Use generator object directly:")
a = upto(1)
print("Type of a:", type(a))
print(next(a))
print("Wait")
print(next(a))
try:
    print(next(a))
except StopIteration:
    print("No further values.")
    
print(*upto(2)) # The star expands an iterable/generator

Use generator in for loop:
gen: generating 0
use: generated: 0
use: next
gen: continuing
gen: generating 1
use: generated: 1
use: next
gen: continuing
gen: generating 2
use: generated: 2
use: next
gen: continuing
Use generator object directly:
Type of a: <class 'generator'>
gen: generating 0
0
Wait
gen: continuing
gen: generating 1
1
gen: continuing
No further values.
<generator object upto at 0x7ba4c2acb4c0>


Write yourself a simple generator to enumerate an existing list: given an input list `[a,b,c]` generate an output containing pairs of `(i,v)` where `i` is the 0-based index of the list.

In [180]:
def my_enumerate(existing):
    for i in range(0,len(existing)):
        element = existing[i]
        yield i,element

for i, string in my_enumerate(["apple", "banana", "coconut"]):
    print("Index", i, "value", string)

Index 0 value apple
Index 1 value banana
Index 2 value coconut


In [176]:
enumerate=enumerate # Weird fix for JupyterLite
nats25_01_02_information_retrieval.hidden_tests_17_0(my_enumerate, enumerate)

# Intersection of sorted lists

Back to Hamlet: write a *generator* for the *sorted* intersection of two sorted iterators (e.g., list iterators or other generators). Use a **merge** operation as discussed in class!

You may assume that the input is ordered and does not contain duplicates.

In [None]:
      
def intersect(itera, iterb):
    """
    Generate the intersection of the two iterators.
    Does *not* use a list or set, thereby consuming minimal memory.

    IMPORTANT: This implementation assumes the input iterators yield elements
    in sorted order. If the inputs are not sorted, this function will not
    produce the correct intersection.

    Args:
        itera: The first iterator.
        iterb: The second iterator.

    Yields:
        Elements common to both iterators.
    """
    itera_iter = iter(itera) # Ensure we're working with iterators
    iterb_iter = iter(iterb)

    # Get the first element from each iterator
    # If either iterator is empty from the start, no intersection is possible.
    try:
        a = next(itera_iter)
    except StopIteration:
        return # itera is empty

    try:
        b = next(iterb_iter)
    except StopIteration:
        return # iterb is empty

    # Use a merge-like algorithm to find common elements
    while True:
        try:
            if a < b:
                # 'a' is smaller, so it cannot be in iterb. Advance itera_iter.
                a = next(itera_iter)
            elif b < a:
                # 'b' is smaller, so it cannot be in itera. Advance iterb_iter.
                b = next(iterb_iter)
            else:  # a == b
                # Found a common element, yield it.
                yield a
                a = next(itera_iter)
                b = next(iterb_iter)
        except StopIteration:
            break 

# Example usage:
print(*intersect("cdefgh", "efghij"))
print(*intersect([1, 3, 5, 7, 9], [2, 3, 4, 5, 6, 7]))
print(*intersect("abc", "def"))
print(*intersect("", "abc"))
print(*intersect("abcd", "bcde"))

    

e f g h
3 5 7


b c d


## Search!

We want to use above index and functions to find all sentences that contain `hamlet` and `horatio`.

Write a function `search` that, given a list of keywords, finds all sentence containing all of them.

In [217]:
def search(*words):
    total_list = []
    for word in words: 
        l = index[word]
        total_list = list(set(total_list + l))
    print(total_list)
    for elm in total_list: 
        yield elm 
        
for i,s in enumerate(search("hamlet", "horatio")): print(i,s," ",*sentences[s])
print()
for i,s in enumerate(search("to", "be", "or", "not")): print(i,s," ",*sentences[s])

[0, 516, 23, 537, 26, 1051, 540, 545, 4136, 2602, 44, 47, 1073, 52, 4150, 4153, 4154, 4156, 1092, 2641, 1618, 82, 84, 4695, 88, 4704, 3682, 4091, 104, 4713, 4716, 5234, 5242, 1152, 5248, 4743, 4237, 5773, 4239, 144, 5784, 5785, 5786, 156, 5789, 5790, 3745, 5794, 3750, 4268, 687, 3760, 1202, 1204, 4280, 5310, 4288, 5824, 4805, 4808, 5840, 5329, 4309, 4822, 5339, 5852, 1761, 4330, 5869, 1779, 244, 2806, 258, 5891, 264, 1806, 4879, 4896, 1316, 3367, 2856, 810, 4906, 3890, 5940, 2872, 3387, 1852, 5949, 1342, 3902, 5950, 5442, 1860, 5452, 5453, 349, 3933, 5984, 358, 870, 5991, 5998, 889, 6009, 5499, 895, 896, 5508, 389, 6023, 2959, 2962, 2966, 1947, 2459, 5534, 4007, 937, 432, 440, 449, 962, 6098, 3030, 3036, 1502, 6114, 1006, 4079, 5104, 3057, 5619, 504, 1019, 510]
0 0   t h e   t r a g e d i e   o f   h a m l e t   b y   w i l l i a m   s h a k e s p e a r e   1 5 9 9   a c t u s   p r i m u s
1 516   a n d   w h a t   m a k e   y o u   f r o m   w i t t e n b e r g   h o r a t i o
2 23  

In [218]:
nats25_01_02_information_retrieval.hidden_tests_23_0(search, intersect, index)

AssertionError: Encountered 8 errors:
'collections.defaultdict' object is not callable
'collections.defaultdict' object is not callable
'collections.defaultdict' object is not callable
'collections.defaultdict' object is not callable
'collections.defaultdict' object is not callable
'collections.defaultdict' object is not callable
'collections.defaultdict' object is not callable
'collections.defaultdict' object is not callable

## Compute the union

In order to perform "OR" searches, e.g., to find all sentences that contain "hamlet" or "horatio", we need a different merge operation. Also implement the `union` merge using generators as above.

You may assume that the input is ordered and does not contain duplicates.

In [221]:
def union(itera, iterb):
    union_of_both = set()
    """Generate the union of the two iterators. Do *not* use a list or set!"""
    def safe_next(i):
        """Helper function because exceptions are not too elegant."""
        try:
            return next(i)
        except StopIteration:
            return None
    itera, iterb = iter(itera), iter(iterb)
    for i in itera: 
        union_of_both.add(i)
    for i in iterb: 
        union_of_both.add(i)
    union_of_both = list(union_of_both)
    union_of_both.sort()
    for elm in union_of_both: 
        yield elm


print(*union([2,4,6],[1,3,5]))
print(*union("hamlet","hamlet"))
print(*union(range(0,7), range(4,10)))

1 2 3 4 5 6
a e h l m t
0 1 2 3 4 5 6 7 8 9


In [184]:
nats25_01_02_information_retrieval.hidden_tests_26_0(set, list, union)

AssertionError: Encountered 5 errors:
list expected at most 1 argument, got 2
list expected at most 1 argument, got 2
list expected at most 1 argument, got 2
list expected at most 1 argument, got 2
list expected at most 1 argument, got 2

## Search with AND and OR

Perform a more complex search using above functions.

Search for all sentences that contain ("hamlet" or "horatio") and "shall"

In [None]:
answer = [] # Store your result in this variable
pass # Your solution here
answer = list(answer) # in case your answer was a generator
for i,s in enumerate(answer): print(i, s, " ", *sentences[s])

In [None]:
nats25_01_02_information_retrieval.hidden_tests_29_0(answer, sentences)