In [1]:
import re
import logging
import spacy
import torch
import torch.nn.functional as F
from collections import defaultdict

from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import EmbeddingRetriever, PreProcessor
from haystack.utils import print_answers
from rich.console import Console
from rich.layout import Layout
from rich.text import Text

# Pre-processor configured for whitespace/empty-line cleanup and word-based splitting.
cleaner = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    split_by="word",
    split_respect_sentence_boundary=False,
)

# Root logger stays at WARNING; only the "haystack" logger is raised to INFO.
logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

# NOTE(review): embedding_dim is assumed to match the output size of the
# sentence-transformers model used by the retriever below — confirm if the model changes.
document_store = InMemoryDocumentStore(embedding_dim=384)

# Small toy corpus: three short documents on unrelated topics.
documents = [
    {
        "content": "Dogs are wonderful pets. They are known for their loyalty and friendly nature. Whether playing fetch or going for a walk, dogs make great companions. Their wagging tails express joy and excitement.",
        "meta": {"name": "Document 1"},
    },
    {
        "content": "Cats, on the other hand, are more independent creatures. They enjoy exploring their surroundings and finding cozy spots for napping. The soothing sound of a cat's purring often indicates contentment. Cats can be mysterious and charming in their own unique way.",
        "meta": {"name": "Document 2"},
    },
    {
        "content": "Books have the power to transport readers to different worlds and ignite their imagination. Reading is a delightful hobby that opens doors to new ideas and perspectives. Whether exploring fantasy realms or diving into historical events, books offer a journey of discovery.",
        "meta": {"name": "Document 3"},
    },
]

document_store.write_documents(documents)

# Dense retriever; the store's documents are embedded right after it is created.
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/all-MiniLM-L12-v2",
    use_gpu=True,
    progress_bar=False,
)
document_store.update_embeddings(retriever)

# The query is a paraphrase of "Document 3"; fetch the single best-matching document.
query = "Books, with their influential ability, can transport readers to various realms, sparking their imagination. Engaging in the delightful hobby of reading not only opens doors to fresh ideas but also provides perspectives. Whether delving into fantasy realms or immersing in historical events, books offer a journey of discovery."
retrieval_results = retriever.retrieve(query, top_k=1)

# Load the spaCy English pipeline; it is used here for POS tags and noun chunks.
nlp = spacy.load("en_core_web_md")
# NOTE: `query` is rebound from the raw string to a spaCy Doc from here on.
query = nlp(query)
extracted_doc = nlp(retrieval_results[0].content)

# Minimum cosine similarity for a query/document phrase pair to count as a match.
SIMILARITY_THRESHOLD = 0.55

pos_tags = ["NOUN", "NUM", "ADJ", "VERB"]

# pos_tag -> list of (phrase, start_char, end_char) tuples to highlight.
highlighted_query_phrases = defaultdict(list)
highlighted_extracted_doc_phrases = defaultdict(list)

for pos_tag in pos_tags:
    # Keep every phrase together with its character span as reported by spaCy.
    # Using these offsets (instead of searching the text for the phrase string)
    # avoids highlighting the same characters inside unrelated words.
    if pos_tag == "NOUN":
        # Nouns are compared as whole noun chunks rather than single tokens.
        query_spans = [(chunk.text, chunk.start_char, chunk.end_char) for chunk in query.noun_chunks]
        extracted_doc_spans = [
            (chunk.text, chunk.start_char, chunk.end_char) for chunk in extracted_doc.noun_chunks
        ]
    else:
        query_spans = [
            (token.text, token.idx, token.idx + len(token.text))
            for token in query
            if token.pos_ == pos_tag
        ]
        extracted_doc_spans = [
            (token.text, token.idx, token.idx + len(token.text))
            for token in extracted_doc
            if token.pos_ == pos_tag
        ]

    query_pos_tags = [phrase for phrase, _, _ in query_spans]
    extracted_doc_pos_tags = [phrase for phrase, _, _ in extracted_doc_spans]

    # Nothing to compare for this tag (e.g. no NUM tokens on one side); skipping
    # also avoids embedding an empty batch.
    if not query_pos_tags or not extracted_doc_pos_tags:
        continue

    # Embed both phrase lists (assumed: one embedding row per phrase).
    query_pos_tags_embed = torch.tensor(retriever.embed_queries(query_pos_tags))
    extracted_doc_pos_tags_embed = torch.tensor(retriever.embed_queries(extracted_doc_pos_tags))

    # Pairwise cosine similarity: rows index query phrases, columns index document phrases.
    similarity_matrix = F.cosine_similarity(
        query_pos_tags_embed.unsqueeze(1), extracted_doc_pos_tags_embed.unsqueeze(0), dim=-1
    )

    # (row, col) index pairs whose similarity exceeds the threshold.
    indices = torch.nonzero(similarity_matrix > SIMILARITY_THRESHOLD, as_tuple=False)
    index_pairs = indices.tolist()

    # (score, query phrase, document phrase) for every matching pair.
    high_similarity_scores = [
        (similarity_matrix[row, col].item(), query_pos_tags[row], extracted_doc_pos_tags[col])
        for row, col in index_pairs
    ]

    # Record each matched phrase once, at its own position in the text.
    for row in sorted({row for row, _ in index_pairs}):
        highlighted_query_phrases[pos_tag].append(query_spans[row])

    for col in sorted({col for _, col in index_pairs}):
        highlighted_extracted_doc_phrases[pos_tag].append(extracted_doc_spans[col])

# Wrap both texts in rich Text objects so character ranges can be styled.
query_text = Text(query.text)
extracted_doc_text = Text(extracted_doc.text)

console = Console(highlight=False)

# Paint every recorded (phrase, start, end) span yellow: query first, then the document.
for target_text, spans_by_tag in (
    (query_text, highlighted_query_phrases),
    (extracted_doc_text, highlighted_extracted_doc_phrases),
):
    for span_list in spans_by_tag.values():
        for _phrase, span_start, span_end in span_list:
            target_text.stylize("yellow", span_start, span_end)

# Two stacked panes of fixed height 5: the query on top, the retrieved document below.
input_pane = Layout(name="input")
compare_pane = Layout(name="compare")

layout = Layout()
layout.split_column(input_pane, compare_pane)

input_pane.size = 5
compare_pane.size = 5
input_pane.update(query_text)
compare_pane.update(extracted_doc_text)

console.print(layout)

INFO - haystack.modeling.utils -  Using devices: CPU - Number of GPUs: 0
INFO - haystack.modeling.utils -  Using devices: CPU - Number of GPUs: 0
INFO - haystack.nodes.retriever.dense -  Init retriever using embeddings of model sentence-transformers/all-MiniLM-L12-v2
  return self.fget.__get__(instance, owner)()
INFO - haystack.document_stores.memory -  Updating embeddings for 0 docs ...
Documents Processed: 10000 docs [00:00, 98357.87 docs/s]     


In [2]:
import os
import re
import logging
import spacy
import torch
import torch.nn.functional as F
from collections import defaultdict

from haystack import Answer
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import EmbeddingRetriever, PreProcessor
from haystack.pipelines.standard_pipelines import TextIndexingPipeline
from haystack.utils import print_answers
from rich.console import Console
from rich.layout import Layout
from rich.text import Text

# Pre-processor configured for whitespace/empty-line cleanup and word-based splitting.
cleaner = PreProcessor(clean_empty_lines=True, clean_whitespace=True, split_by="word", split_respect_sentence_boundary=False)

# Root logger stays at WARNING; only the "haystack" logger is raised to INFO.
logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

# NOTE(review): embedding_dim is assumed to match the output size of the
# sentence-transformers model used by the retriever below — confirm if the model changes.
document_store = InMemoryDocumentStore(embedding_dim=384)

# Index every regular file found directly inside the dataset directory.
# - os.path.join avoids the doubled "/" that plain string concatenation produced.
# - sorted() makes the indexing order reproducible (os.listdir order is arbitrary).
# - sub-directories are skipped because they cannot be converted as text files.
doc_dir = "../../data/dataset/"
files_to_index = sorted(
    os.path.join(doc_dir, entry)
    for entry in os.listdir(doc_dir)
    if os.path.isfile(os.path.join(doc_dir, entry))
)
indexing_pipeline = TextIndexingPipeline(document_store)
indexing_pipeline.run_batch(file_paths=files_to_index)

# Dense retriever; the store's documents are embedded right after it is created.
retriever = EmbeddingRetriever(
    document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L12-v2", use_gpu=True, progress_bar=False
)
document_store.update_embeddings(retriever)

# Fetch the single best-matching document for the query sentence.
query = "The final season of the fantasy drama television series ''Game of Thrones'', produced by HBO, premiered on April 14, 2019, and concluded on May 19, 2019."
retrieval_results = retriever.retrieve(query, top_k=1)

# Load the spaCy English pipeline; it is used here for POS tags and noun chunks.
nlp = spacy.load("en_core_web_md")
# NOTE: `query` is rebound from the raw string to a spaCy Doc from here on.
query = nlp(query)
extracted_doc = nlp(retrieval_results[0].content)

# Minimum cosine similarity for a query/document phrase pair to count as a match.
SIMILARITY_THRESHOLD = 0.55

pos_tags = ["NOUN", "NUM", "ADJ", "VERB"]

# pos_tag -> list of (phrase, start_char, end_char) tuples to highlight.
highlighted_query_phrases = defaultdict(list)
highlighted_extracted_doc_phrases = defaultdict(list)

for pos_tag in pos_tags:
    # Keep every phrase together with its character span as reported by spaCy.
    # Using these offsets (instead of searching the text for the phrase string)
    # avoids highlighting the same characters inside unrelated words.
    if pos_tag == "NOUN":
        # Nouns are compared as whole noun chunks rather than single tokens.
        query_spans = [(chunk.text, chunk.start_char, chunk.end_char) for chunk in query.noun_chunks]
        extracted_doc_spans = [
            (chunk.text, chunk.start_char, chunk.end_char) for chunk in extracted_doc.noun_chunks
        ]
    else:
        query_spans = [
            (token.text, token.idx, token.idx + len(token.text))
            for token in query
            if token.pos_ == pos_tag
        ]
        extracted_doc_spans = [
            (token.text, token.idx, token.idx + len(token.text))
            for token in extracted_doc
            if token.pos_ == pos_tag
        ]

    query_pos_tags = [phrase for phrase, _, _ in query_spans]
    extracted_doc_pos_tags = [phrase for phrase, _, _ in extracted_doc_spans]

    # Nothing to compare for this tag (e.g. no NUM tokens on one side); skipping
    # also avoids embedding an empty batch.
    if not query_pos_tags or not extracted_doc_pos_tags:
        continue

    # Embed both phrase lists (assumed: one embedding row per phrase).
    query_pos_tags_embed = torch.tensor(retriever.embed_queries(query_pos_tags))
    extracted_doc_pos_tags_embed = torch.tensor(retriever.embed_queries(extracted_doc_pos_tags))

    # Pairwise cosine similarity: rows index query phrases, columns index document phrases.
    similarity_matrix = F.cosine_similarity(
        query_pos_tags_embed.unsqueeze(1), extracted_doc_pos_tags_embed.unsqueeze(0), dim=-1
    )

    # (row, col) index pairs whose similarity exceeds the threshold.
    indices = torch.nonzero(similarity_matrix > SIMILARITY_THRESHOLD, as_tuple=False)
    index_pairs = indices.tolist()

    # (score, query phrase, document phrase) for every matching pair.
    high_similarity_scores = [
        (similarity_matrix[row, col].item(), query_pos_tags[row], extracted_doc_pos_tags[col])
        for row, col in index_pairs
    ]

    # Record each matched phrase once, at its own position in the text.
    for row in sorted({row for row, _ in index_pairs}):
        highlighted_query_phrases[pos_tag].append(query_spans[row])

    for col in sorted({col for _, col in index_pairs}):
        highlighted_extracted_doc_phrases[pos_tag].append(extracted_doc_spans[col])

# Wrap both texts in rich Text objects so character ranges can be styled.
query_text = Text(query.text)
extracted_doc_text = Text(extracted_doc.text)

console = Console(highlight=False)

# Paint every recorded (phrase, start, end) span yellow: query first, then the document.
for target_text, spans_by_tag in (
    (query_text, highlighted_query_phrases),
    (extracted_doc_text, highlighted_extracted_doc_phrases),
):
    for span_list in spans_by_tag.values():
        for _phrase, span_start, span_end in span_list:
            target_text.stylize("yellow", span_start, span_end)

# Two stacked panes of fixed height 5: the query on top, the retrieved document below.
input_pane = Layout(name="input")
compare_pane = Layout(name="compare")

layout = Layout()
layout.split_column(input_pane, compare_pane)

input_pane.size = 5
compare_pane.size = 5
input_pane.update(query_text)
compare_pane.update(extracted_doc_text)

console.print(layout)

INFO - haystack.modeling.utils -  Using devices: CPU - Number of GPUs: 0
INFO - haystack.pipelines.base -  It seems that an indexing Pipeline is run, so using the nodes' run method instead of run_batch.
Converting files:   0%|          | 0/183 [00:00<?, ?it/s]

Converting files: 100%|██████████| 183/183 [00:01<00:00, 100.63it/s]
Preprocessing: 100%|██████████| 183/183 [00:02<00:00, 83.57docs/s] 
INFO - haystack.modeling.utils -  Using devices: CPU - Number of GPUs: 0
INFO - haystack.nodes.retriever.dense -  Init retriever using embeddings of model sentence-transformers/all-MiniLM-L12-v2
INFO - haystack.document_stores.memory -  Updating embeddings for 0 docs ...
Documents Processed: 10000 docs [07:38, 21.79 docs/s]           


## Getting the intersection by comparing all combinations of words

In [5]:
import os
import re
import logging
import nltk
import spacy
import torch
import torch.nn.functional as F
from collections import defaultdict

from nltk.corpus import stopwords
from haystack import Answer
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import EmbeddingRetriever, PreProcessor
from haystack.pipelines.standard_pipelines import TextIndexingPipeline
from haystack.utils import print_answers
from rich.console import Console
from rich.layout import Layout
from rich.text import Text

# Pre-processor configured for whitespace/empty-line cleanup and word-based splitting.
cleaner = PreProcessor(clean_empty_lines=True, clean_whitespace=True, split_by="word", split_respect_sentence_boundary=False)

# Root logger stays at WARNING; only the "haystack" logger is raised to INFO.
logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

# Download stopwords data
nltk.download('stopwords')

# NOTE(review): embedding_dim is assumed to match the output size of the
# sentence-transformers model used by the retriever below — confirm if the model changes.
document_store = InMemoryDocumentStore(embedding_dim=384)

# Index every regular file found directly inside the dataset directory.
# - os.path.join builds the paths instead of manual "/" concatenation.
# - sorted() makes the indexing order reproducible (os.listdir order is arbitrary).
# - sub-directories are skipped because they cannot be converted as text files.
doc_dir = "../../data/dataset"
files_to_index = sorted(
    os.path.join(doc_dir, entry)
    for entry in os.listdir(doc_dir)
    if os.path.isfile(os.path.join(doc_dir, entry))
)
indexing_pipeline = TextIndexingPipeline(document_store)
indexing_pipeline.run_batch(file_paths=files_to_index)

# Dense retriever; the store's documents are embedded right after it is created.
retriever = EmbeddingRetriever(
    document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L12-v2", use_gpu=True, progress_bar=False
)
document_store.update_embeddings(retriever)

query = "The final season of the fantasy drama television series ''Game of Thrones'', produced by HBO, premiered on April 14, 2019, and concluded on May 19, 2019."
# `extracted_doc` is the list of retrieved documents (top_k=1 -> at most one entry).
extracted_doc = retriever.retrieve(query, top_k=1)
# Always read the content from this cell's own retrieval result, never from a
# variable left behind by a previously executed cell.
extracted_doc_content = extracted_doc[0].content

# Minimum cosine similarity for a query/document token pair to count as a match.
SIMILARITY_THRESHOLD = 0.55

# Tokenize the query and extracted document excluding stop words.
# Tokens are whitespace-delimited runs (same units as str.split()), kept together
# with their character offsets so the exact occurrence can be highlighted later.
stop_words = set(stopwords.words('english'))

query_token_spans = [
    (match.group(), match.start(), match.end())
    for match in re.finditer(r"\S+", query)
    if match.group().lower() not in stop_words
]
extracted_doc_token_spans = [
    (match.group(), match.start(), match.end())
    for match in re.finditer(r"\S+", extracted_doc_content)
    if match.group().lower() not in stop_words
]

query_tokens = [token for token, _, _ in query_token_spans]
extracted_doc_tokens = [token for token, _, _ in extracted_doc_token_spans]

# Fresh containers for this cell: key -> list of (token, start_char, end_char).
# There is no POS tagging in this approach, so a single "TOKEN" key is used.
highlighted_query_phrases = defaultdict(list)
highlighted_extracted_doc_phrases = defaultdict(list)

# Skip the comparison entirely when either side has no tokens left after
# stop-word removal; this also avoids embedding an empty batch.
if query_tokens and extracted_doc_tokens:
    # Embed both token lists (assumed: one embedding row per token).
    query_tokens_embed = torch.tensor(retriever.embed_queries(query_tokens))
    extracted_doc_tokens_embed = torch.tensor(retriever.embed_queries(extracted_doc_tokens))

    # Pairwise cosine similarity: rows index query tokens, columns index document tokens.
    similarity_matrix = F.cosine_similarity(
        query_tokens_embed.unsqueeze(1), extracted_doc_tokens_embed.unsqueeze(0), dim=-1
    )

    # (row, col) index pairs whose similarity exceeds the threshold.
    indices = torch.nonzero(similarity_matrix > SIMILARITY_THRESHOLD, as_tuple=False)
    index_pairs = indices.tolist()

    # (score, query token, document token) for every matching pair.
    high_similarity_scores = [
        (similarity_matrix[row, col].item(), query_tokens[row], extracted_doc_tokens[col])
        for row, col in index_pairs
    ]

    # Record each matched token once, at its own position in the text.
    for row in sorted({row for row, _ in index_pairs}):
        highlighted_query_phrases["TOKEN"].append(query_token_spans[row])

    for col in sorted({col for _, col in index_pairs}):
        highlighted_extracted_doc_phrases["TOKEN"].append(extracted_doc_token_spans[col])

# Wrap both texts in rich Text objects so character ranges can be styled.
query_text = Text(query)
extracted_doc_text = Text(extracted_doc_content)

console = Console(highlight=False)

# Paint every recorded span yellow.
for highlighted_phrases_list in highlighted_query_phrases.values():
    for _token, start, end in highlighted_phrases_list:
        query_text.stylize("yellow", start, end)

for highlighted_phrases_list in highlighted_extracted_doc_phrases.values():
    for _token, start, end in highlighted_phrases_list:
        extracted_doc_text.stylize("yellow", start, end)

# Two stacked panes of fixed height 5: the query on top, the retrieved document below.
layout = Layout()
layout.split_column(
    Layout(name="input"),
    Layout(name="compare")
    )
layout["input"].size = 5
layout["compare"].size = 5
layout["input"].update(query_text)
layout["compare"].update(extracted_doc_text)

console.print(layout)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sann_htet/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
INFO - haystack.modeling.utils -  Using devices: CPU - Number of GPUs: 0
INFO - haystack.pipelines.base -  It seems that an indexing Pipeline is run, so using the nodes' run method instead of run_batch.
Converting files: 100%|██████████| 183/183 [00:01<00:00, 101.28it/s]
Preprocessing: 100%|██████████| 183/183 [00:02<00:00, 75.80docs/s]
INFO - haystack.modeling.utils -  Using devices: CPU - Number of GPUs: 0
INFO - haystack.nodes.retriever.dense -  Init retriever using embeddings of model sentence-transformers/all-MiniLM-L12-v2
INFO - haystack.document_stores.memory -  Updating embeddings for 0 docs ...
Documents Processed: 10000 docs [08:06, 20.58 docs/s]           
