## Building the FM-index 

In [1]:
from seal import FMIndex
from transformers import AutoTokenizer

# Toy data for the demo. Each record pairs a document label with its text so
# the two parallel lists derived below cannot drift out of alignment.
# (The "<title> @@ <body>" layout of the text is presumably the convention the
# index expects — TODO confirm.)
_SAMPLE_DOCS = (
    ("doc1", "Doc 1 @@ This is a sample document"),
    ("doc2", "Doc 2 @@ This is another sample document"),
    ("doc3", "Doc 3 @@ And here you find the final one"),
)
corpus = [text for _, text in _SAMPLE_DOCS]
labels = [label for label, _ in _SAMPLE_DOCS]

# Tokenizer loaded from the 'facebook/bart-large' checkpoint; preprocess()
# below reads this module-level name to turn each document into token ids.
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large')
def preprocess(doc):
    """Encode one document into the token-id list that is fed to the FM-index.

    A single space is prepended to the text before tokenizing (presumably so
    the first word is tokenized like a mid-sentence word — TODO confirm), the
    tokenizer is asked not to add special tokens, and the tokenizer's EOS id
    is appended as the last element.

    Args:
        doc: raw document text.

    Returns:
        The list of token ids, ending with ``tokenizer.eos_token_id``.
    """
    # `tokenizer` is the module-level tokenizer in scope when this is called.
    encoding = tokenizer(' ' + doc, add_special_tokens=False)
    token_ids = encoding['input_ids']
    token_ids.append(tokenizer.eos_token_id)
    return token_ids

import os

# Tokenize every document; preprocess() returns a list of token ids that ends
# with the tokenizer's EOS id.
corpus_tokenized = [preprocess(doc) for doc in corpus]
print(corpus_tokenized)

# Labels are attached positionally, so there must be exactly one per document.
assert len(labels) == len(corpus_tokenized), "need exactly one label per document"

# Build the index in memory and attach the per-document labels.
index = FMIndex()
index.initialize(corpus_tokenized, in_memory=True)
index.labels = labels

# Single source of truth for the on-disk location (used by both save and load).
index_path = 'res/sample/sample_corpus.fm_index'
# Create the target directory up front so the cell also runs from a fresh
# checkout where res/sample/ does not exist yet.
os.makedirs(os.path.dirname(index_path), exist_ok=True)
index.save(index_path)
# writes res/sample/sample_corpus.fm_index.fmi
# writes res/sample/sample_corpus.fm_index.oth

# Round-trip check: reload from disk and display what was persisted.
index = FMIndex.load(index_path)
print(index.beginnings)
print(index.occurring)
print(index.occurring_distinct)
print(index.occurring_counts)
print(index.labels)

[[19761, 112, 49314, 152, 16, 10, 7728, 3780, 2], [19761, 132, 49314, 152, 16, 277, 7728, 3780, 2], [19761, 155, 49314, 178, 259, 47, 465, 5, 507, 65, 2]]
[0, 9, 18, 29]
[65, 2, 259, 3780, 132, 5, 10, 16, 465, 277, 152, 155, 49314, 47, 112, 19761, 7728, 178, 507]
[2, 5, 10, 16, 47, 65, 112, 132, 152, 155, 259, 277, 465, 507, 3780, 7728, 19761, 49314]
[3, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 3, 3]
['doc1', 'doc2', 'doc3']


## Decoding with the FM-index

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from seal import fm_index_generate, FMIndex

# The tokenizer and the seq2seq model are loaded from the same identifier.
# NOTE: this rebinds the module-level `tokenizer` that the earlier
# `preprocess` helper reads, so calling `preprocess` after this cell would use
# this tokenizer instead of the one it was defined next to.
checkpoint = 'pegasus_paraphrase'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Build a one-document corpus from a single long string.
passage = """
They also were found to have perfectly coiffed hair, and wore what appeared to be Dior makeup. 
“We were shocked to discover the unicorns,” said anthropologist Daniel St. Maurice. “They were 
like nothing we had ever seen before. We had heard legends of the unicorns, but never thought 
they actually existed.” When the scientists first arrived in the valley, the unicorns were 
surprised and startled by the presence of humans, but were also excited. The unicorns welcomed 
the researchers and explained that they had been waiting for them for a very long time. “The 
unicorns said that they had been waiting for us for a very long time,” said Dr. St. Maurice. 
“They said they had always known that humans would eventually discover them, but that they had 
also always known that humans would be too stupid to realize the unicorns had been waiting for 
them.”
"""
# split() + join collapses every whitespace run (including the line breaks)
# into a single space and leaves no leading/trailing whitespace.
corpus = " ".join(passage.split())

# Same encoding recipe as for the sample corpus: leading space, no special
# tokens, EOS id appended at the end.
# NOTE: encoding the whole passage at once is presumably what triggers the
# "sequence length is longer than the specified maximum" warning in the cell
# output; in this block the ids are only indexed, not passed to the model.
encoded = tokenizer(' ' + corpus, add_special_tokens=False)
corpus = encoded['input_ids'] + [tokenizer.eos_token_id]

index = FMIndex()
index.initialize([corpus], in_memory=True)

# Constrained generation: decode with the model while restricting the output
# through the FM-index built above.
# NOTE(review): the trailing `'` and `”` in the query look like paste
# artifacts; they are kept byte-for-byte so the model input is unchanged.
query_text = """
The unicorns greeted the scientists, explaining that they had been expecting the encounter for
a while.'
”"""
query = " ".join(query_text.split())

# Batch of one, with the same leading-space convention used for the corpus.
encoded_query = tokenizer([' ' + query], return_tensors='pt')
out = fm_index_generate(
    model,
    index,
    **encoded_query,
    keep_history=False,
    transformers_output=True,
    always_allow_eos=True,
    max_length=100,
)

decoded = tokenizer.decode(out[0], skip_special_tokens=True)
print(decoded.strip())
# prints: The unicorns welcomed the researchers and explained that they had been waiting for them for a very long time.


Token indices sequence length is longer than the specified maximum sequence length for this model (170 > 60). Running this sequence through the model will result in indexing errors


The unicorns welcomed the researchers and explained that they had been waiting for them for a very long time.


## Retrieval


In [3]:
from seal import SEALSearcher

# Locations of the prebuilt FM-index and of the model checkpoint.
fm_index_path = './ckpt/NQ/NQ.fm_index'
checkpoint_path = './ckpt/NQ/SEAL.NQ.pt'
searcher = SEALSearcher.load(fm_index_path, checkpoint_path)
# Presumably what makes each result expose the `.keys` read below — TODO confirm.
searcher.include_keys = True

query = "can you eat soup with a fork"

results = searcher.search(query, k=3)
for rank, doc in enumerate(results):
    # Tab-separated header line: rank, score, docid, then the pieces of doc.text().
    print(rank, doc.score, doc.docid, *doc.text(), sep='\t')
    print("Matched:")
    # Keep the five highest-scoring keys; each key unpacks as (ngram, freq, score).
    matched = sorted(doc.keys, key=lambda key: key[2], reverse=True)[:5]
    for ngram, freq, score in matched:
        # One decimal place, left-padded with zeros to a width of five characters.
        score_col = "{:.1f}".format(score).zfill(5)
        print(score_col, freq, repr(ngram), sep='\t')

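# Example output (abridged). NOTE(review): the rank and score shown here do not
# match the recorded cell output, which lists docid 13796077 at rank 1 with
# score 457.44 -- presumably this excerpt comes from a different run or
# checkpoint; refresh it or treat it as illustrative only.
# 0	375.03041350768547	13796077	Chopsticks	are similar, finer points can differ from region to region. 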
# In Cambodia, a fork and spoon are the typical utensils used in Cambodian dining and etiquette. Spoons are 
# used to scoop up food or water and the fork is there to help guide the food onto the spoon. Chopsticks 
# are normally used in noodle dishes such as the Kuy Tiev and soup dishes. When eating soup the chopsticks 
# will typically be paired with the spoon, where the chopsticks will pick up the food and the spoon will be 
# used to drink the broth. Forks are never to touch the mouth,
# Matched:
# 161.3	10	' eating soup'
# 059.5	9390	' fork'
# ...



0	510.01721395707966	6957412	Table manners	should not chew or bite food from the fork. The knife should be held with the base into the palm of the hand, not like a pen with the base resting between the thumb and forefinger. The knife must never enter the mouth or be licked. When eating soup, the spoon is held in the right hand and the bowl tipped away from the diner, scooping the soup in outward movements. The soup spoon should never be put into the mouth, and soup should be sipped from the side of the spoon, not the end. Food should always be chewed
Matched:
161.3	10	' eating soup'
126.3	23	'</s> Table manners @@'
059.5	9390	' fork'
052.5	373	' spoon,'
020.3	236262	' food'
1	457.43813062425585	13796077	Chopsticks	are similar, finer points can differ from region to region. In Cambodia, a fork and spoon are the typical utensils used in Cambodian dining and etiquette. Spoons are used to scoop up food or water and the fork is there to help guide the food onto the spoon. Chopsticks are nor