In [None]:
import logging
import os
from collections import defaultdict

from haystack import Answer
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import EmbeddingRetriever, FARMReader, PreProcessor
from haystack.pipelines import MostSimilarDocumentsPipeline
from haystack.pipelines.standard_pipelines import TextIndexingPipeline
from haystack.utils import print_answers
from rich.console import Console
from rich.layout import Layout
from rich.style import Style
from rich.text import Text

# --- Configuration -----------------------------------------------------------
# Preprocessor used further down to clean the query document and split it by word.
cleaner = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    split_by="word",
    split_respect_sentence_boundary=False,
)
logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

# NOTE(review): 384 is presumably the embedding size of the retriever model
# configured below — confirm if the model is ever swapped.
document_store = InMemoryDocumentStore(embedding_dim=384)
doc_dir = "data/similarity_search"

# --- Index the comparison corpus ---------------------------------------------
# Only regular files are indexed: os.listdir() also returns sub-directories
# (for example Jupyter's ".ipynb_checkpoints"), which would otherwise be handed
# to the indexing pipeline as if they were text files. Sorting makes the
# indexing order reproducible, since os.listdir() order is unspecified.
files_to_index = sorted(
    path
    for path in (os.path.join(doc_dir, f) for f in os.listdir(doc_dir))
    if os.path.isfile(path)
)
indexing_pipeline = TextIndexingPipeline(document_store)
indexing_pipeline.run_batch(file_paths=files_to_index)
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/all-MiniLM-L12-v2",
    use_gpu=True,
)

# --- Index the query document and find its nearest neighbours -----------------
query_index_pipline = TextIndexingPipeline(document_store)
doc = query_index_pipline.run("data/similarity/doc1.txt")
doc_id = doc.get("documents")[0].id
# Embeddings are updated only after the query document is in the store, so the
# query document is covered by the same update as the corpus.
document_store.update_embeddings(retriever)
msd_pipeline = MostSimilarDocumentsPipeline(document_store)
msd_result = msd_pipeline.run(document_ids=[doc_id])
# First document produced by running the preprocessor over the query document.
words = cleaner.process(doc.get("documents"))[0]

# --- Extractive QA over the most similar documents ----------------------------
# NOTE(review): "recieved" and "sountrack" are misspelled; the strings are kept
# verbatim so that results stay comparable with earlier runs.
queries = ["What is the score?","Who is the artist?","How is it recieved?","Name of the sountrack?"]
reader = FARMReader("deepset/roberta-base-squad2", use_gpu=True)
# msd_result[0] is presumably the list of similar documents for the single id
# passed above — confirm against the pipeline's return shape.
results = reader.predict_batch(queries=queries, documents=msd_result[0], top_k=3)
print_answers(results)

# Recording console: everything printed through it is written to HTML at the end.
console = Console(highlight=False, record=True)

# Palette used to tell the query/answer pairs of one document apart.
css_highlight_colors = [
    "#FF7F00",  # Orange
    "#FFFF22",  # Yellow
    "#F0C674",  # Sand
    "#8DD7CF",  # Sea Green
    "#FBBDFE",  # Pink
    "#E0FFDA",  # Pale Green
    "#9B59B6",  # Purple
    "#FFC107",  # Amber
]

# The grouping loop below indexes `ans[0]` on every element of a query's entry,
# so each query maps to a list of per-document answer lists.
answers_list: list[list[list[Answer]]] = results.get("answers")
query_answers: list[tuple[str, list[list[Answer]]]] = list(zip(queries, answers_list))
input_content = document_store.get_document_by_id(doc_id).content

# Group the best answer of every (query, document) pair by the id of the
# document it was extracted from.
doc_dics: defaultdict[str, list[dict]] = defaultdict(list)
for query, answers in query_answers:
    for ans in answers:
        # Skip pairs for which no answer was produced instead of raising
        # IndexError on `ans[0]`.
        if not ans:
            continue
        top_answer = ans[0]
        # An answer without a source document cannot be attached to a document.
        if not top_answer.document_ids:
            continue
        doc_dics[top_answer.document_ids[0]].append(dict(query=query, answer=top_answer))

# Render one layout per matched document: the input text on top, the queries
# and answer contexts side by side in the middle, and the matched document with
# highlighted answer spans at the bottom.
for d in doc_dics:
    question_texts = Text()
    answer_context = Text(end="\n\n")

    input_text = Text(input_content)
    compare_doc = Text(document_store.get_document_by_id(d).content)
    for i, qa in enumerate(doc_dics[d]):
        answer: Answer = qa["answer"]
        # Wrap around the palette so more pairs than colors cannot raise IndexError.
        color = css_highlight_colors[i % len(css_highlight_colors)]
        question_texts.append("query: ")
        question_texts.append(qa["query"], Style(color=color))
        question_texts.append("\n\n")
        answer_context.append(f" [score {answer.score}] ")
        answer_context.append(answer.context, Style(color=color))
        answer_context.append("\n\n")
        # Offsets may be missing; treat that as "nothing to highlight".
        for span in answer.offsets_in_document or []:
            compare_doc.stylize(color, span.start, span.end)

    layout = Layout()
    layout.split_column(Layout(name="input"), Layout(name="compare"), Layout(name="context"))
    layout["compare"].split_row(Layout(name="queries"), Layout(name="answer"))
    layout["input"].update(input_text)
    layout["context"].update(compare_doc)
    layout["queries"].update(question_texts)
    layout["answer"].update(answer_context)
    console.print(layout)


# Export everything the recording console printed.
console.save_html("output.html")


In [None]:
# Display the current value of `query`.
# NOTE(review): in the visible cells `query` is only ever bound as a loop
# variable, so this shows whatever the most recently executed loop left behind.
query

In [None]:
# Display `words`, the first element returned by `cleaner.process(...)` in the first cell.
words

In [None]:
# Display the `embedding` attribute of `words`.
# NOTE(review): `words` comes from the preprocessor output rather than from the
# document store, so this may well be unset — confirm.
words.embedding

In [None]:
# Display the document store's `duplicate_documents` attribute
# (presumably its duplicate-handling setting — confirm against the Haystack docs).
document_store.duplicate_documents

In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model by name.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Example inputs to embed.
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]

# A single encode() call embeds the whole list.
sentence_embeddings = model.encode(sentences)

# Show every sentence next to its embedding, followed by a blank line.
for position, sentence in enumerate(sentences):
    embedding = sentence_embeddings[position]
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

In [1]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')

sentences = [
    'A man is eating food.',
    'A man is eating a piece of bread.',
    'The girl is carrying a baby.',
    'A man is riding a horse.',
    'A woman is playing violin.',
    'Two men pushed carts through the woods.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.',
    'Someone in a gorilla costume is playing a set of drums.',
]

# Embed every sentence in one call.
embeddings = model.encode(sentences)

# Cosine similarity of the embeddings against themselves.
cos_sim = util.cos_sim(embeddings, embeddings)

# One [score, i, j] entry per pair with i < j, so each pair is listed once and
# no sentence is paired with itself.
sentence_count = len(cos_sim)
all_sentence_combinations = [
    [cos_sim[i][j], i, j]
    for i in range(sentence_count - 1)
    for j in range(i + 1, sentence_count)
]

# Highest similarity first.
all_sentence_combinations.sort(key=lambda entry: entry[0], reverse=True)

print("Top-5 most similar pairs:")
for score, i, j in all_sentence_combinations[:5]:
    print("{} \t {} \t {:.4f}".format(sentences[i], sentences[j], score))

  from .autonotebook import tqdm as notebook_tqdm


Top-5 most similar pairs:
A man is eating food. 	 A man is eating a piece of bread. 	 0.7553
A man is riding a horse. 	 A man is riding a white horse on an enclosed ground. 	 0.7369
A monkey is playing drums. 	 Someone in a gorilla costume is playing a set of drums. 	 0.6433
A woman is playing violin. 	 Someone in a gorilla costume is playing a set of drums. 	 0.2564
A man is eating food. 	 A man is riding a horse. 	 0.2474


In [2]:
"""
This example shows how in-document search can be done with a CrossEncoder.

The document is split into passages. Here, we use three consecutive sentences as a passage. You can use shorter passages, for example, individual sentences,
or longer passages, like full paragraphs.


The CrossEncoder takes the search query and scores how relevant every passage is for the given query. The five passages with the highest scores are then returned.

As CrossEncoder, we use cross-encoder/ms-marco-TinyBERT-L-2, a BERT model with only 2 layers trained on the MS MARCO dataset. This is an extremely quick model able to score up to 9000 passages per second (on a V100 GPU). You can also use a larger model, which gives better results but is also slower.

Note: As we score the [query, passage]-pair for every new query, this search method
becomes inefficient at some point if the document gets too large.

Usage: python in_document_search_crossencoder.py
"""

from sentence_transformers import CrossEncoder
from nltk import sent_tokenize
import time


#As document, we take the first two sections from the Wikipedia article about Europe
document = """Europe is a continent located entirely in the Northern Hemisphere and mostly in the Eastern Hemisphere. It comprises the westernmost part of Eurasia and is bordered by the Arctic Ocean to the north, the Atlantic Ocean to the west, the Mediterranean Sea to the south, and Asia to the east. Europe is commonly considered to be separated from Asia by the watershed of the Ural Mountains, the Ural River, the Caspian Sea, the Greater Caucasus, the Black Sea, and the waterways of the Turkish Straits. Although some of this border is over land, Europe is generally accorded the status of a full continent because of its great physical size and the weight of history and tradition.

Europe covers about 10,180,000 square kilometres (3,930,000 sq mi), or 2% of the Earth's surface (6.8% of land area), making it the second smallest continent. Politically, Europe is divided into about fifty sovereign states, of which Russia is the largest and most populous, spanning 39% of the continent and comprising 15% of its population. Europe had a total population of about 741 million (about 11% of the world population) as of 2018. The European climate is largely affected by warm Atlantic currents that temper winters and summers on much of the continent, even at latitudes along which the climate in Asia and North America is severe. Further from the sea, seasonal differences are more noticeable than close to the coast.

European culture is the root of Western civilization, which traces its lineage back to ancient Greece and ancient Rome. The fall of the Western Roman Empire in 476 AD and the subsequent Migration Period marked the end of Europe's ancient history and the beginning of the Middle Ages. Renaissance humanism, exploration, art and science led to the modern era. Since the Age of Discovery, started by Portugal and Spain, Europe played a predominant role in global affairs. Between the 16th and 20th centuries, European powers colonized at various times the Americas, almost all of Africa and Oceania, and the majority of Asia.

The Age of Enlightenment, the subsequent French Revolution and the Napoleonic Wars shaped the continent culturally, politically and economically from the end of the 17th century until the first half of the 19th century. The Industrial Revolution, which began in Great Britain at the end of the 18th century, gave rise to radical economic, cultural and social change in Western Europe and eventually the wider world. Both world wars took place for the most part in Europe, contributing to a decline in Western European dominance in world affairs by the mid-20th century as the Soviet Union and the United States took prominence. During the Cold War, Europe was divided along the Iron Curtain between NATO in the West and the Warsaw Pact in the East, until the revolutions of 1989 and fall of the Berlin Wall.

In 1949, the Council of Europe was founded with the idea of unifying Europe to achieve common goals. Further European integration by some states led to the formation of the European Union (EU), a separate political entity that lies between a confederation and a federation. The EU originated in Western Europe but has been expanding eastward since the fall of the Soviet Union in 1991. The currency of most countries of the European Union, the euro, is the most commonly used among Europeans; and the EU's Schengen Area abolishes border and immigration controls between most of its member states. There exists a political movement favoring the evolution of the European Union into a single federation encompassing much of the continent.

In classical Greek mythology, Europa (Ancient Greek: Εὐρώπη, Eurṓpē) was a Phoenician princess. One view is that her name derives from the ancient Greek elements εὐρύς (eurús), "wide, broad" and ὤψ (ōps, gen. ὠπός, ōpós) "eye, face, countenance", hence their composite Eurṓpē would mean "wide-gazing" or "broad of aspect". Broad has been an epithet of Earth herself in the reconstructed Proto-Indo-European religion and the poetry devoted to it. An alternative view is that of R.S.P. Beekes who has argued in favor of a Pre-Indo-European origin for the name, explaining that a derivation from ancient Greek eurus would yield a different toponym than Europa. Beekes has located toponyms related to that of Europa in the territory of ancient Greece and localities like that of Europos in ancient Macedonia.

There have been attempts to connect Eurṓpē to a Semitic term for "west", this being either Akkadian erebu meaning "to go down, set" (said of the sun) or Phoenician 'ereb "evening, west", which is at the origin of Arabic Maghreb and Hebrew ma'arav. Michael A. Barry finds the mention of the word Ereb on an Assyrian stele with the meaning of "night, [the country of] sunset", in opposition to Asu "[the country of] sunrise", i.e. Asia. The same naming motive according to "cartographic convention" appears in Greek Ἀνατολή (Anatolḗ "[sun] rise", "east", hence Anatolia). Martin Litchfield West stated that "phonologically, the match between Europa's name and any form of the Semitic word is very poor", while Beekes considers a connection to Semitic languages improbable. Next to these hypotheses there is also a Proto-Indo-European root *h1regʷos, meaning "darkness", which also produced Greek Erebus.

Most major world languages use words derived from Eurṓpē or Europa to refer to the continent. Chinese, for example, uses the word Ōuzhōu (歐洲/欧洲), which is an abbreviation of the transliterated name Ōuluóbā zhōu (歐羅巴洲) (zhōu means "continent"); a similar Chinese-derived term Ōshū (欧州) is also sometimes used in Japanese such as in the Japanese name of the European Union, Ōshū Rengō (欧州連合), despite the katakana Yōroppa (ヨーロッパ) being more commonly used. In some Turkic languages, the originally Persian name Frangistan ("land of the Franks") is used casually in referring to much of Europe, besides official names such as Avrupa or Evropa."""

## Break the article into paragraphs (blank-line separated, after normalizing
## Windows line endings) and tokenize every non-empty paragraph into sentences.
normalized_document = document.replace("\r\n", "\n")
paragraphs = [
    sent_tokenize(block.strip())
    for block in normalized_document.split("\n\n")
    if len(block.strip()) > 0
]


# Group up to `window_size` consecutive sentences of a paragraph into one passage.
# Smaller values: context from neighbouring sentences may be lost.
# Larger values: more of the paragraph's context is kept, but passages get longer.
# Passages never cross a paragraph boundary; the last one of a paragraph may be shorter.
window_size = 3
passages = [
    " ".join(paragraph_sentences[start:start + window_size])
    for paragraph_sentences in paragraphs
    for start in range(0, len(paragraph_sentences), window_size)
]


print("Paragraphs: ", len(paragraphs))
print("Sentences: ", sum(map(len, paragraphs)))
print("Passages: ", len(passages))


## Load the cross-encoder used to score [query, passage] pairs.
model = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-2')

## Queries to run against the document.
queries = [
    "How large is Europe?",
    "Is Europe a continent?",
    "What is the currency in EU?",
    # Keyword-style queries work as well.
    "Fall Roman Empire when",
    # "Europa" is a misspelling of "Europe", and the matching sentences do not
    # mention any of the query's content words.
    "Is Europa in the south part of the globe?",
]

# Run every query on its own.
for query in queries:
    start_time = time.time()

    # Pair the query with every passage and score all pairs in one predict() call.
    model_inputs = [[query, passage] for passage in passages]
    scores = model.predict(model_inputs)

    # Attach each score to its pair and order by score, best first.
    results = sorted(
        ({'input': pair, 'score': pair_score} for pair, pair_score in zip(model_inputs, scores)),
        key=lambda entry: entry['score'],
        reverse=True,
    )

    print("Query:", query)
    print("Search took {:.2f} seconds".format(time.time() - start_time))
    # Report the five best passages; index 1 of the pair is the passage text.
    for hit in results[:5]:
        print("Score: {:.2f}".format(hit['score']), "\t", hit['input'][1])

    print("==========")

Paragraphs:  8
Sentences:  38
Passages:  15
Query: How large is Europe?
Search took 0.03 seconds
Score: 0.89 	 Europe covers about 10,180,000 square kilometres (3,930,000 sq mi), or 2% of the Earth's surface (6.8% of land area), making it the second smallest continent. Politically, Europe is divided into about fifty sovereign states, of which Russia is the largest and most populous, spanning 39% of the continent and comprising 15% of its population. Europe had a total population of about 741 million (about 11% of the world population) as of 2018.
Score: 0.30 	 Europe is a continent located entirely in the Northern Hemisphere and mostly in the Eastern Hemisphere. It comprises the westernmost part of Eurasia and is bordered by the Arctic Ocean to the north, the Atlantic Ocean to the west, the Mediterranean Sea to the south, and Asia to the east. Europe is commonly considered to be separated from Asia by the watershed of the Ural Mountains, the Ural River, the Caspian Sea, the Greater Cau

In [3]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

# Candidate passages to compare against the query.
candidate_passages = [
    'London has 9,787,426 inhabitants at the 2011 census',
    'London is known for its finacial district',
]

# Embed the query and the candidate passages with the same model.
query_embedding = model.encode('How big is London')
passage_embedding = model.encode(candidate_passages)

# Dot-product score of the query against each passage.
similarity = util.dot_score(query_embedding, passage_embedding)
print("Similarity:", similarity)

.gitattributes: 100%|██████████| 737/737 [00:00<00:00, 3.12MB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 1.17MB/s]
README.md: 100%|██████████| 11.5k/11.5k [00:00<00:00, 755kB/s]
config.json: 100%|██████████| 612/612 [00:00<00:00, 1.72MB/s]
config_sentence_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 738kB/s]
data_config.json: 100%|██████████| 25.5k/25.5k [00:00<00:00, 1.40MB/s]
pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:38<00:00, 2.37MB/s]
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 307kB/s]
special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 849kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 2.36MB/s]
tokenizer_config.json: 100%|██████████| 383/383 [00:00<00:00, 1.66MB/s]
train_script.py: 100%|██████████| 13.8k/13.8k [00:00<00:00, 28.6MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 2.10MB/s]
modules.json: 100%|██████████| 349/349 [00:00<00:00, 1.36MB/s]


Similarity: tensor([[0.5472, 0.6330]])


In [6]:
from sentence_transformers import SentenceTransformer, util
import torch

embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Sentences to search in.
corpus = [
    'A man is eating food.',
    'A man is eating a piece of bread.',
    'The girl is carrying a baby.',
    'A man is riding a horse.',
    'A woman is playing violin.',
    'Two men pushed carts through the woods.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.',
    'A cheetah is running behind its prey.',
]
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

# Sentences to search for.
queries = [
    'A man is eating pasta.',
    'Someone in a gorilla costume is playing a set of drums.',
    'A cheetah chases prey on across a field.',
]


# Report at most five matches per query, fewer if the corpus is smaller.
top_k = min(5, len(corpus))
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # Cosine similarity of the query against every corpus sentence; row 0 is
    # the only row because a single query is encoded.
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")

    # torch.topk returns (values, indices); walk them in parallel.
    for score, idx in zip(top_results.values, top_results.indices):
        print(corpus[idx], "(Score: {:.4f})".format(score))

    # The string literal below is never used; it only keeps an alternative
    # implementation around for reference.
    """
    # Alternatively, we can also use util.semantic_search to perform cosine similarty + topk
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=5)
    hits = hits[0]      #Get the hits for the first query
    for hit in hits:
        print(corpus[hit['corpus_id']], "(Score: {:.4f})".format(hit['score']))"""





Query: A man is eating pasta.

Top 5 most similar sentences in corpus:
A man is eating food. (Score: 0.7035)
A man is eating a piece of bread. (Score: 0.5272)
A man is riding a horse. (Score: 0.1889)
A man is riding a white horse on an enclosed ground. (Score: 0.1047)
A cheetah is running behind its prey. (Score: 0.0980)




Query: Someone in a gorilla costume is playing a set of drums.

Top 5 most similar sentences in corpus:
A monkey is playing drums. (Score: 0.6433)
A woman is playing violin. (Score: 0.2564)
A man is riding a horse. (Score: 0.1389)
A man is riding a white horse on an enclosed ground. (Score: 0.1191)
A cheetah is running behind its prey. (Score: 0.1080)




Query: A cheetah chases prey on across a field.

Top 5 most similar sentences in corpus:
A cheetah is running behind its prey. (Score: 0.8253)
A man is eating food. (Score: 0.1399)
A monkey is playing drums. (Score: 0.1292)
A man is riding a white horse on an enclosed ground. (Score: 0.1097)
A man is riding a 

: 

In [4]:
doc = open("data/similarity/doc1.txt","r")

In [5]:
# Read the whole file, then close the handle so it is not leaked for the rest
# of the kernel session. Re-running this cell without re-opening the file now
# fails loudly instead of silently producing an empty string at end-of-file.
try:
    doc_txt = doc.read()
finally:
    doc.close()

In [7]:
import spacy

# Load the "en_core_web_md" spaCy pipeline.
nlp = spacy.load("en_core_web_md")

# Run the pipeline over the full document text read earlier.
text = doc_txt
doc = nlp(text)

# Syntax overview: the noun chunks, and the lemma of every token tagged as a verb.
noun_phrases = [chunk.text for chunk in doc.noun_chunks]
verb_lemmas = [token.lemma_ for token in doc if token.pos_ == "VERB"]
print("Noun phrases:", noun_phrases)
print("Verbs:", verb_lemmas)

# Named entities, one per line, each followed by its label.
for entity in doc.ents:
    print(entity.text, entity.label_)

Noun phrases: ['\n\nThe soundtrack album', 'the fourth season', 'HBO series', 'Game', 'Thrones', "'Game", 'Thrones', 'Season', 'June', 'CD', 'July', 'Season', 'Game', 'Thrones', 'the Icelandic band', 'Sigur Rós', 'their rendition', 'The Rains', 'Castamere', 'a cameo appearance', "King Joffrey's wedding", 'the second episode', 'The Lion', 'the Rose', '=', '=', 'The soundtrack', 'mostly positive reviews', 'critics', 'The soundtrack', 'a score', 'Heather Phares', 'AllMusic', '=', '=Track listing==', '=', 'Credits', 'personnel==\nPersonnel', 'the album liner notes', 'David Benioff', 'liner notes', 'Ramin Djawadi', 'composer', 'primary artist', 'producer', 'Sigur Rós', 'primary artist', 'George R.R. Martin – lyricist', 'D.B. Weiss – liner notes', '=', 'Charts==\n\n\n\n\n\n Chart', 'Peak position']
Verbs: ['title', 'release', 'see', 'perform', 'receive', 'award', 'adapt']
the fourth season DATE
HBO ORG
''Game of Thrones'' WORK_OF_ART
'Game of Thrones: Season 4''' WORK_OF_ART
June 10, 2014 DA

In [11]:
# Display the `text_with_ws` attribute of `doc` (the result of `nlp(text)` in the previous cell).
doc.text_with_ws

'\n\nThe soundtrack album of the fourth season of HBO series \'\'Game of Thrones\'\', titled \'\'\'\'\'Game of Thrones: Season 4\'\'\'\'\' was released digitally on June 10, 2014, and on CD on July 1, 2014. Season 4 of \'\'Game of Thrones\'\' saw the Icelandic band Sigur Rós perform their rendition of "The Rains of Castamere" in a cameo appearance at King Joffrey\'s wedding in the second episode, "The Lion and the Rose".\n\n==Reception==\nThe soundtrack received mostly positive reviews from critics. The soundtrack was awarded a score of 4/5 by Heather Phares of AllMusic.\n\n==Track listing==\n\n\n==Credits and personnel==\nPersonnel adapted from the album liner notes.\n\n* David Benioff – liner notes\n* Ramin Djawadi – composer, primary artist, producer\n* Sigur Rós – primary artist \n* George R.R. Martin – lyricist\n* D.B. Weiss – liner notes \n\n\n==Charts==\n\n\n\n\n\n Chart (2014)\n\n Peak position\n\n\n\n\n\n\n\n\n'