In [21]:
from milvus_haystack import MilvusDocumentStore

# Document store configured for a Milvus endpoint at localhost:19530,
# with empty user/password and `secure` switched off.
document_store = MilvusDocumentStore(
    connection_args=dict(
        host="localhost",
        port="19530",
        user="",
        password="",
        secure=False,
    ),
)

In [16]:
import os
import getpass

# Prompt for the key only when it is not already present (or is empty), so
# re-running this cell, or running the notebook with the variable exported
# in the shell, does not ask again or overwrite an existing value.
# getpass reads the key without echoing it, keeping it out of the cell output.
if not os.environ.get("JINA_API_KEY"):
    os.environ["JINA_API_KEY"] = getpass.getpass("Jina AI API Key: ")

In [31]:
import json
from typing import List
from haystack import Document, component

# Ticket fields kept by the cleaning step; any key not listed here is dropped.
relevant_keys = [
    # Identification and classification
    'Summary', 'Issue_key', 'Issue_id', 'Parent_id', 'Issue type', 'Status',
    # People
    'Project lead', 'Priority', 'Assignee', 'Reporter', 'Creator',
    # Dates and labels
    'Created', 'Updated', 'Last Viewed', 'Due Date', 'Labels',
    # Free text: the description, then 'Comment' and 'Comment__1' .. 'Comment__15'
    'Description',
    'Comment',
    *[f'Comment__{n}' for n in range(1, 16)],
]

@component
class RemoveKeys:
    """Load a JSON ticket file and reduce every ticket to its relevant fields.

    A field survives only if its key is listed in ``relevant_keys`` and its
    value is truthy, so empty strings, ``None``, ``0`` and empty containers
    are dropped together with the irrelevant keys.

    NOTE(review): the declared output type is ``List[Document]`` but the
    returned items are plain dicts. The annotation is left unchanged here
    because pipeline connections may rely on the declared socket types.
    """

    @component.output_types(documents=List[Document])
    def run(self, file_name: str):
        """Read ``file_name`` and return the cleaned tickets.

        Args:
            file_name: Path of a JSON file containing an iterable of ticket
                dicts (assumed to be a top-level list -- confirm against the
                export format).

        Returns:
            ``{'documents': [...]}`` with one filtered dict per ticket, in the
            same order as in the file.
        """
        # JSON text is UTF-8 by specification; being explicit avoids decoding
        # with whatever the platform's locale default happens to be.
        with open(file_name, 'r', encoding='utf-8') as file:
            tickets = json.load(file)

        # Set membership instead of scanning the key list once per field.
        allowed = set(relevant_keys)
        cleaned_tickets = [
            {key: value for key, value in ticket.items() if key in allowed and value}
            for ticket in tickets
        ]
        return {'documents': cleaned_tickets}

In [32]:
@component
class JsonConverter:
    """Turn cleaned ticket dicts into haystack ``Document`` objects.

    NOTE(review): ``tickets`` is annotated ``List[Document]`` but the items
    are used as plain dicts (subscripted and passed to ``json.dumps``). The
    annotation is left unchanged because pipeline connections may rely on
    the declared socket types.
    """

    @component.output_types(documents=List[Document])
    def run(self, tickets: List[Document]):
        """Wrap each ticket in a ``Document``.

        The document content is the JSON serialisation of the whole ticket;
        the meta carries ``Issue_key``, ``Issue_id`` and ``Parent_id``.

        Args:
            tickets: Ticket dicts, each containing at least ``Issue_key`` and
                ``Issue_id``.

        Returns:
            ``{'documents': [...]}`` with one ``Document`` per ticket, in
            input order.

        Raises:
            KeyError: If a ticket lacks ``Issue_key`` or ``Issue_id``.
        """
        tickets_documents = []
        for ticket in tickets:
            meta = {
                'Issue_key': ticket['Issue_key'],
                'Issue_id': ticket['Issue_id'],
                # The field is named 'Parent_id' (underscore). The previous
                # check looked for 'Parent id', which never matched, so the
                # parent was always stored as ''. Tickets without a parent
                # still get ''.
                'Parent_id': ticket.get('Parent_id', ''),
            }
            tickets_documents.append(Document(content=json.dumps(ticket), meta=meta))
        return {'documents': tickets_documents}

In [33]:
from haystack import Pipeline
from haystack.components.writers import DocumentWriter
from haystack_integrations.components.embedders.jina import JinaDocumentEmbedder

# Indexing pipeline: cleaner -> converter -> embedder -> writer.
# Each component is created and registered in one place.
indexing_pipeline = Pipeline()

cleaner = RemoveKeys()
indexing_pipeline.add_component('cleaner', cleaner)

converter = JsonConverter()
indexing_pipeline.add_component('converter', converter)

embedder = JinaDocumentEmbedder(model='jina-embeddings-v2-base-en')
indexing_pipeline.add_component('embedder', embedder)

writer = DocumentWriter(document_store=document_store)
indexing_pipeline.add_component('writer', writer)

# Chain the stages in processing order.
indexing_pipeline.connect('cleaner', 'converter')
indexing_pipeline.connect('converter', 'embedder')
indexing_pipeline.connect('embedder', 'writer')

# Only the first stage needs external input: the ticket file to load.
indexing_pipeline.run({'cleaner': {'file_name': 'tickets.json'}})

Calculating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.22s/it]


{'embedder': {'meta': {'model': 'jina-embeddings-v2-base-en',
   'usage': {'total_tokens': 20131, 'prompt_tokens': 20131}}},
 'writer': {'documents_written': 62}}

In [43]:
from typing import Optional

@component
class RemoveRelated:
    """Drop retrieved tickets that are the query ticket itself or one of its children."""

    @component.output_types(documents=List[Document])
    def run(self, tickets: List[Document], query_id: Optional[str]):
        """Filter ``tickets`` against ``query_id``.

        Args:
            tickets: Documents whose ``meta`` holds ``Issue_id`` and
                ``Parent_id``.
            query_id: Id of the ticket used as the query.

        Returns:
            ``{'documents': [...]}`` containing, in input order, only the
            documents whose ``Issue_id`` and ``Parent_id`` both differ from
            ``query_id``.
        """
        def is_related(doc):
            # Related = the query ticket itself, or a ticket whose parent is the query.
            return doc.meta['Issue_id'] == query_id or doc.meta['Parent_id'] == query_id

        return {'documents': [doc for doc in tickets if not is_related(doc)]}

In [54]:
from haystack_integrations.components.embedders.jina import JinaTextEmbedder
from haystack_integrations.components.rankers.jina import JinaRanker
from milvus_haystack import MilvusEmbeddingRetriever

# Query pipeline: query_embedder -> doc_retriever -> parent_cleaner -> jina_reranker.
# Components are registered in the same order as before.
query_pipeline_reranker = Pipeline()

embedder = JinaTextEmbedder(model='jina-embeddings-v2-base-en')
query_pipeline_reranker.add_component('query_embedder', embedder)

# The retriever is asked for 20 candidates (top_k=20).
retriever = MilvusEmbeddingRetriever(document_store=document_store, top_k=20)
query_pipeline_reranker.add_component('doc_retriever', retriever)

parent_cleaner = RemoveRelated()
query_pipeline_reranker.add_component('parent_cleaner', parent_cleaner)

ranker = JinaRanker()
query_pipeline_reranker.add_component('jina_reranker', ranker)

# The embedder/retriever link names its sockets explicitly; the other two
# links are given by component name only.
query_pipeline_reranker.connect('query_embedder.embedding', 'doc_retriever.query_embedding')
query_pipeline_reranker.connect('doc_retriever', 'parent_cleaner')
query_pipeline_reranker.connect('parent_cleaner', 'jina_reranker')

<haystack.core.pipeline.pipeline.Pipeline object at 0x15ad2e6c0>
🚅 Components
  - query_embedder: JinaTextEmbedder
  - doc_retriever: MilvusEmbeddingRetriever
  - parent_cleaner: RemoveRelated
  - jina_reranker: JinaRanker
🛤️ Connections
  - query_embedder.embedding -> doc_retriever.query_embedding (List[float])
  - doc_retriever.documents -> parent_cleaner.tickets (List[Document])
  - parent_cleaner.documents -> jina_reranker.documents (List[Document])

In [57]:
# Key of the ticket for which similar tickets are looked up.
query_ticket_key = 'ZOOKEEPER-3282'

# JSON text is UTF-8 by specification; being explicit avoids decoding with
# the platform's locale default.
with open('tickets.json', 'r', encoding='utf-8') as file:
    tickets = json.load(file)

for ticket in tickets:
    if ticket['Issue_key'] != query_ticket_key:
        continue

    # NOTE(review): the query text is the Python repr of the raw ticket
    # (str(ticket)), while the indexed documents hold json.dumps of the
    # cleaned ticket -- confirm this difference is intended.
    query = str(ticket)
    query_ticket_id = ticket['Issue_id']

    result = query_pipeline_reranker.run(
        data={
            'query_embedder': {'text': query},
            # Lets the filter drop the query ticket itself and its children.
            'parent_cleaner': {'query_id': query_ticket_id},
            'jina_reranker': {'query': query, 'top_k': 10},
        }
    )

    # Print the reranked documents, numbered from 1.
    for rank, doc in enumerate(result['jina_reranker']['documents'], start=1):
        print('Doc {}:'.format(rank), doc)

Doc 1: Document(id=b3cef78cd1901d5eb932228b5be1e01c379a18bb9bca9e5d17d1ba2e5c9b10e4, content: '{"Summary": "add the new doc: zookeeperClients.md", "Issue_key": "ZOOKEEPER-3283", "Issue_id": 13216...', meta: {'Issue_key': 'ZOOKEEPER-3283', 'Issue_id': 13216610, 'Parent_id': ''}, score: 0.8583627939224243, embedding: vector of size 768)
Doc 2: Document(id=b48de999465cb3c5ceaad2145f1b2327870030aa31a5792fa7795ea4455a5e3a, content: '{"Summary": "add a new documentation: zookeeperCodingGuide.md", "Issue_key": "ZOOKEEPER-3616", "Issu...', meta: {'Issue_key': 'ZOOKEEPER-3616', 'Issue_id': 13268279, 'Parent_id': ''}, score: 0.8556100726127625, embedding: vector of size 768)
Doc 3: Document(id=8bd9795b3e0edd27a3a62b204d69ec846f4023c09ac87170dc433f24881e00d5, content: '{"Summary": "support the complete linearizable read and multiply read consistency level", "Issue_key...', meta: {'Issue_key': 'ZOOKEEPER-3600', 'Issue_id': 13265507, 'Parent_id': ''}, score: 0.8250265121459961, embedding: vector of