In [1]:
from datasets import load_dataset


In [2]:

from haystack import Document
from haystack.components.converters import TextFileToDocument
from haystack.components.embedders import (
    SentenceTransformersDocumentEmbedder,
    SentenceTransformersTextEmbedder,
)
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.retrievers.in_memory.embedding_retriever import (
    InMemoryEmbeddingRetriever,
)
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore

doc_dir = "./game-of-thrones"
document_store = InMemoryDocumentStore()
text_file_converter = TextFileToDocument()
cleaner = DocumentCleaner()
splitter = DocumentSplitter(split_length = 600, split_overlap=40)
writer = DocumentWriter(document_store)

dataset = load_dataset("Tuana/game-of-thrones",split="train")


# doc_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L12-v2", progress_bar=False)
# doc_embedder = SentenceTransformersDocumentEmbedder(model="jinaai/jina-embedding-l-en-v1", progress_bar=False)
doc_embedder = SentenceTransformersDocumentEmbedder(model="jinaai/jina-embeddings-v2-base-en",trust_remote_code=True, progress_bar=False)
doc_embedder.warm_up()
docs_with_embeddings = doc_embedder.run([Document(**ds) for ds in dataset])
document_store.write_documents(docs_with_embeddings["documents"])

retriever = InMemoryEmbeddingRetriever(document_store, top_k=1)



configuration_bert.py:   0%|          | 0.00/8.24k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-implementation:
- configuration_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_bert.py:   0%|          | 0.00/97.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-implementation:
- modeling_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


OutOfMemoryError: CUDA out of memory. Tried to allocate 30.47 GiB. GPU 

In [None]:
# query = """
# Arya runs away from King's Landing with Yoren and his group. She becomes friends with Lommy, Gendry, and Hot Pie.
# They are attacked by Amory Lorch, but Arya and the others escape through a tunnel.
# Arya frees three prisoners, including Jaqen H'ghar.
# """
# query = """
# Arya, Gendry, and Hot Pie encounter Thoros of Myr, a Red Priest who is a member of the Brotherhood Without Banners, a group of men sent by Ned to restore order in the Riverlands. As Thoros takes them to the Brotherhood's hideout they encounter the Hound, a captive of other Brotherhood men. The Hound is brought before the Brotherhood's leader, Ser Beric Dondarrion, and after Arya accuses him of Mycah's murder Beric sentences him to trial by combat. The Hound wins, to Arya's fury, and is released by Dondarrion. Arya is further enraged when the Brotherhood sells Gendry to Melisandre and escapes the Brotherhood. She is captured by the Hound, who intends to take her to The Twins to ransom her to her brother Robb. However, as they arrive the Freys betray the Starks and slaughter their forces, with Arya barely escaping the massacre. In the aftermath of their escape, Arya encounters a Frey soldier bragging about his role in desecrating Robb's corpse. Arya stabs the soldier to death, marking the first time she has deliberately killed another person. ====Season 4====
# """
query = """
The Night King is shown leading his army south. Through ravens' eyes, Bran locates the Night King's army beyond the Wall.
"""
# query = """
# Arya escapes King's Landing with Yoren and his party of recruits; and on the road, she clashes with the other Night's Watch child recruits Lommy, Gendry, and Hot Pie but eventually befriends them. On the way, the party is attacked by Amory Lorch when Yoren refuses to yield Gendry, who is actually a bastard son of the late King Robert, to the Lannisters.  The Night's Watch convoy is overrun and massacred, but Arya and the other children escape through a tunnel.  Before escaping, she rescues three prisoners locked in a wagon cage, among them a mysterious man named Jaqen H'ghar.
# """
text_embedder = SentenceTransformersTextEmbedder(model="jinaai/jina-embedding-l-en-v1", progress_bar=False)
text_embedder.warm_up()
query_embedding = text_embedder.run(query)["embedding"]

result = retriever.run(query_embedding=query_embedding)

In [None]:
print(result["documents"][0].content)


===In the Riverlands===
Sandor and Arya continue their journey to the Vale.


In [None]:
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/nozander/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from torch.nn.functional import cosine_similarity
import torch

def extract_similar_answer_sentences(
    query_content: str,
    doc_content: str,
    doc_embedder,
    threshold: float = 0.9,
    ) -> str:
    query_lines = nltk.sent_tokenize(query_content)
    answer_lines = nltk.sent_tokenize(doc_content)

    answer_embed = torch.tensor([text_embedder.run(line)["embedding"] for line in query_lines])
    query_embed = torch.tensor([text_embedder.run(line)["embedding"] for line in answer_lines])

    similarity_matrix = cosine_similarity(query_embed.unsqueeze(0), answer_embed.unsqueeze(1), dim=2)

    _, answer_indices = torch.nonzero(similarity_matrix > threshold, as_tuple=True)

    if len(answer_indices) == 0:
        _, max_indices = torch.max(similarity_matrix, dim=1)
        best_answer_combined = "\n".join([answer_lines[i] for i in torch.unique(max_indices)])
    else:
        best_answer_combined = "\n".join([answer_lines[i] for i in torch.unique(answer_indices)])

    return best_answer_combined

In [None]:
def search(query):
  query_embedding = text_embedder.run(query)["embedding"]

  result = retriever.run(query_embedding=query_embedding)
  answer_content = result["documents"][0].content
  similarity_answer = extract_similar_answer_sentences(query, answer_content, text_embedder)

  return similarity_answer

In [None]:
search(query)

'\n===In the Riverlands===\nSandor and Arya continue their journey to the Vale.'

In [None]:
sentences = [
    "Tyrion is introduced as the third and youngest child of wealthy and powerful Tywin Lannister, the former Hand of the King, and Joanna Lannister",
    "The Valyrians are characterized by their silver hair and violet eyes. Valyria was called The Freehold because every man who owned land was allowed to vote for their leaders.",
    "Arya, Gendry, and Hot Pie encounter Thoros of Myr, a Red Priest who is a member of the Brotherhood Without Banners, a group of men sent by Ned to restore order in the Riverlands.",
    "In the fifth season, the region of Dorne is introduced as a location. Alexander Siddig joins the cast as Doran Martell, the ruling Prince of Dorne, and elder brother of Oberyn Martell, while his son Trystane Martell is portrayed by Toby Sebastian.",
    "the Red Keep's gardens (identified as a godswood in the novels) the production used the cloister of the St Dominic Monastery in Rabat, in Malta.",
    "The novels were later on adapted to the hit HBO series ''Game of Thrones'' in 2011.",
    "Arya Stark's training with the Faceless Men in Braavos tests her identity and purpose in ways she never imagined.",
    "It first played during the season six finale of the show and was composed by Ramin Djawadi in 2016",
    "Naath, also known as the Isle of Butterflies, is an island off the north-west coast of Sothoryos that lies west of the Basilisk Isles.",
    "Arya runs away from King's Landing with Yoren and his group. She becomes friends with Lommy, Gendry, and Hot Pie. They are attacked by Amory Lorch, but Arya and the others escape through a tunnel. Arya frees three prisoners, including Jaqen H'ghar."
    ]

In [None]:
answers = []
for sentence in sentences:
  answers.append(search(sentence))

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

def calculate_similarity(query, answer):

    model = SentenceTransformer('bert-base-nli-mean-tokens')
    embed_query = model.encode([query], show_progress_bar=False)
    embed_answer = model.encode([answer], show_progress_bar=False)

    similarity_matrix = cosine_similarity(embed_query, embed_answer)

    similarity_score = similarity_matrix[0][0]

    return similarity_score

In [None]:
answers

['\n==Storylines==\nCoat of arms of House Lannister',
 '\n===In the Riverlands===\nSandor and Arya continue their journey to the Vale.',
 '\n===In the Riverlands===\nSandor and Arya continue their journey to the Vale.',
 '\n====In the Riverlands====\n* Gary Oliver as Ternesio Terys',
 '\n===Filming===\nThe Minčeta Tower in Dubrovnik was used as the location of the House of the Undying.',
 '\n== Storylines ==\nCoat of arms of House Lannister',
 '\n===In the Riverlands===\nSandor and Arya continue their journey to the Vale.',
 '\n===Music===\nRamin Djawadi returned as the composer of the show for the seventh season.',
 '\n====Naath====\nNaath, also known as the Isle of Butterflies, is an island off the north-west coast of Sothoryos that lies west of the Basilisk Isles.',
 '\n===In the Riverlands===\nSandor and Arya continue their journey to the Vale.']

In [None]:
# Mean score for "jinaai/jina-embeddings-v2-base-en"
scores = []
for sentence, answer in zip(sentences, answers):
  scores.append(calculate_similarity(sentence, answer))
print(f"Mean Score: {sum(scores) / len(scores):.2f}")



Mean Score: 0.51


In [None]:
# Mean score for "sentence-transformers/all-MiniLM-L12-v2"
scores = []
for sentence, answer in zip(sentences, answers):
  scores.append(calculate_similarity(sentence, answer))
print(f"Mean Score: {sum(scores) / len(scores):.2f}")



Mean Score: 0.51


In [None]:
# sample semantic search answer using "jinaai/jina-embeddings-v2-base-en"
query = """
Ramsay draws the ire of Roose after flaying the family of a Northern lord who refuses to pledge fealty.
"""
answers[3]

'\n====In the Riverlands====\n* Gary Oliver as Ternesio Terys'

In [None]:
# sample semantic search answer using "sentence-transformers/all-MiniLM-L12-v2"
query = """
Ramsay draws the ire of Roose after flaying the family of a Northern lord who refuses to pledge fealty.
"""
answers[3]

'\n====In the Riverlands====\n* Gary Oliver as Ternesio Terys'

In [None]:
answers[3]

'\n====In the Riverlands====\n* Gary Oliver as Ternesio Terys'

In [None]:
query = """
Ramsay draws the ire of Roose after flaying the family of a Northern lord who refuses to pledge fealty.
"""
# text_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L12-v2", progress_bar=False)
text_embedder = SentenceTransformersTextEmbedder(model="jinaai/jina-embedding-l-en-v1", progress_bar=False)
text_embedder.warm_up()
query_embedding = text_embedder.run(query)["embedding"]

result = retriever.run(query_embedding=query_embedding)

In [None]:
result['documents'][0].content

'\n===In the Riverlands===\nSandor and Arya continue their journey to the Vale.'

In [None]:
dataset

Dataset({
    features: ['id', 'content', 'content_type', 'meta', 'score', 'embedding'],
    num_rows: 2357
})