In [3]:
import os
from langchain_community.document_loaders import DirectoryLoader, UnstructuredPDFLoader, PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain.vectorstores import Chroma, FAISS
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA,  ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings

import torch
from sentence_transformers import SentenceTransformer
from torch.nn import functional as F
import numpy as np

from pymilvus import (FieldSchema, DataType, CollectionSchema, Collection)

from pymilvus import connections, MilvusClient, WeightedRanker, AnnSearchRequest

from langchain_community.llms import LlamaCpp
from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler
from langchain_core.prompts import PromptTemplate
from langchain.memory import ConversationSummaryBufferMemory, CombinedMemory
from langchain.chains import LLMChain
from langchain_core.documents.base import Document

import time
# from transformers import AutoTokenizer, pipeline, AutoConfig, AutoModelForCausalLM, AutoModel
from tqdm import tqdm
import json
from langchain_community.llms import CTransformers
import tiktoken

import fitz
import re, glob
import pymupdf4llm, pdfplumber
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.md import partition_md
from unstructured.chunking.title import chunk_by_title
from unstructured.chunking.basic import chunk_elements
import subprocess
from collections import Counter
import gc
import pickle
import itertools

## Tests

In [None]:
# loader = DirectoryLoader(
#     os.path.abspath("/home/niloy/code/simple_chat/persist_storage"),
#     glob="**/*.pdf",
#     use_multithreading=True,
#     show_progress=True,
#     max_concurrency=50,
#     loader_cls=UnstructuredPDFLoader,
# )
# documents = loader.load()

In [None]:
# toc_regex = r"""
#         ^(?:(?:Table of )?Contents|List of (?:Figures|Tables))?  # Optional Titles
#         (?:\s*\n+)?                                             # Optional newline(s) 
#         (?:\s*\d+\.?\s+(.+)\s*)?                                # Optional: number, period, title 
#         \s*\.{2,}\s*\d+                                         # Required dotted leader line and page number
#     """

# pdf_contents = []
# for page in tqdm(temp_data):
#     page_data = page.page_content
#     if (flags & flag_content_started) == flag_content_started:
#         pdf_contents += page
#     else:
#         llm_answer = llm_tocqa_chain.invoke({"context": page_data})
#         print(page_data)
#         print(llm_answer["text"])
#         if llm_answer["text"].lower().startswith("no"):
#             pdf_contents += page
#         else:
#             flags = flags | flag_toc_found
#         if llm_answer["text"].lower().startswith("no") and (flags == flag_toc_found):
#             flags = flags | flag_toc_ended


## Pdf Extraction

In [2]:
def extract_page_contents(pdf_path):
    toc_dotted_line_regex = r".*\.{2,}\s*\d+"
    pdf_name = os.path.basename(pdf_path)
    print("saving .. " + pdf_name)
    original_doc = fitz.open(pdf_path)
    output_doc = fitz.open()
    pages_to_exclude = [] 
    flag_toc_found = 1
    flag_toc_ended = 2
    flag_content_started = 3
    flags = 0
    for i, page in enumerate(original_doc):
        text = page.get_text()
        if (flags & flag_content_started) == flag_content_started:
            break
        else:
            if len(re.findall(toc_dotted_line_regex, text)) > 0:
                pages_to_exclude.append(i)
                flags = flags | flag_toc_found
            elif (flags & flag_toc_found) == flag_toc_found and len(re.findall(toc_dotted_line_regex, text)) == 0:
                flags = flags | flag_toc_ended
    output_doc = original_doc
    print("excluding pages ... " + str(pages_to_exclude))
    output_doc.delete_pages(from_page=1, to_page=pages_to_exclude[-1])
    output_doc.save(pdf_name)

    # temp_loader = PyMuPDFLoader(pdf_path)
    # temp_data = temp_loader.load()
    elements = partition_pdf(pdf_name, strategy="hi_res", infer_table_structure=True, hi_res_model_name="yolox")
    os.remove(pdf_name)
    
    return elements
    

In [30]:
def extract_page_contents_pdfplumber(pdf_path):
    toc_dotted_line_regex = r".*\.{2,}\s*\d+"
    pdf_name = os.path.basename(pdf_path)
    print("saving .. " + pdf_name)
    original_doc = pdfplumber.open(pdf_path)
    pages_to_exclude = [] 
    flag_toc_found = 1
    flag_toc_ended = 2
    flag_content_started = 3
    flags = 0
    for i, page in enumerate(original_doc.pages):
        text = page.extract_text(layout=True)
        if (flags & flag_content_started) == flag_content_started:
            break
        else:
            if len(re.findall(toc_dotted_line_regex, text)) > 0:
                pages_to_exclude.append(i)
                flags = flags | flag_toc_found
            elif (flags & flag_toc_found) == flag_toc_found and len(re.findall(toc_dotted_line_regex, text)) == 0:
                flags = flags | flag_toc_ended
    print("excluding pages ... " + str(pages_to_exclude))
    output_doc = []
    output_doc.append(original_doc.pages[0])
    output_doc.extend(original_doc.pages[pages_to_exclude[-1]+1:])

    # temp_loader = PyMuPDFLoader(pdf_path)
    # temp_data = temp_loader.load()
    # elements = partition_pdf(pdf_name, strategy="hi_res", infer_table_structure=True)
    
    return output_doc
    
    

In [31]:
def extract_page_contents_markdown(pdf_path):
    toc_dotted_line_regex = r".*\.{2,}\s*\d+"
    pdf_name = os.path.basename(pdf_path)
    print("saving .. " + pdf_name)
    original_doc = fitz.open(pdf_path)
    output_doc = fitz.open()
    pages_to_exclude = [] 
    flag_toc_found = 1
    flag_toc_ended = 2
    flag_content_started = 3
    flags = 0
    for i, page in enumerate(original_doc):
        text = page.get_text()
        if (flags & flag_content_started) == flag_content_started:
            break
        else:
            if len(re.findall(toc_dotted_line_regex, text)) > 0:
                pages_to_exclude.append(i)
                flags = flags | flag_toc_found
            elif (flags & flag_toc_found) == flag_toc_found and len(re.findall(toc_dotted_line_regex, text)) == 0:
                flags = flags | flag_toc_ended
    output_doc = original_doc
    print("excluding pages ... " + str(pages_to_exclude))
    output_doc.delete_pages(from_page=1, to_page=pages_to_exclude[-1])

    out_markdown = pymupdf4llm.to_markdown(output_doc)
    
    return out_markdown
    

In [26]:
def extract_page_contents_unstructured(pdf_path):
    toc_dotted_line_regex = r".*\.{2,}\s*\d+"
    pdf_name = os.path.basename(pdf_path)
    print("saving .. " + pdf_name)
    original_doc = fitz.open(pdf_path)
    output_doc = fitz.open()
    pages_to_exclude = [] 
    flag_toc_found = 1
    flag_toc_ended = 2
    flag_content_started = 3
    flags = 0
    for i, page in enumerate(original_doc):
        text = page.get_text()
        if (flags & flag_content_started) == flag_content_started:
            break
        else:
            if len(re.findall(toc_dotted_line_regex, text)) > 0:
                pages_to_exclude.append(i)
                flags = flags | flag_toc_found
            elif (flags & flag_toc_found) == flag_toc_found and len(re.findall(toc_dotted_line_regex, text)) == 0:
                flags = flags | flag_toc_ended
    output_doc = original_doc
    print("excluding pages ... " + str(pages_to_exclude))
    output_doc.delete_pages(from_page=1, to_page=pages_to_exclude[-1])
    output_doc.save(pdf_name)

    elements = partition_pdf(pdf_name, strategy="hi_res")
    os.remove(pdf_name)
    
    return elements
    

In [31]:
def convert_with_nougat(pdf_path, output_dir):
    result = subprocess.run(["nougat", pdf_path, "-o", output_dir], capture_output=True)
    print(result.stdout)
    print(result.stderr)
    return

In [33]:
def extract_page_contents_markdown_unstructured(pdf_path):
    toc_dotted_line_regex = r".*\.{2,}\s*\d+"
    pdf_name = os.path.basename(pdf_path)
    print("saving .. " + pdf_name)
    original_doc = fitz.open(pdf_path)
    output_doc = fitz.open()
    pages_to_exclude = [] 
    flag_toc_found = 1
    flag_toc_ended = 2
    flag_content_started = 3
    flags = 0
    for i, page in enumerate(original_doc):
        text = page.get_text()
        if (flags & flag_content_started) == flag_content_started:
            break
        else:
            if len(re.findall(toc_dotted_line_regex, text)) > 0:
                pages_to_exclude.append(i)
                flags = flags | flag_toc_found
            elif (flags & flag_toc_found) == flag_toc_found and len(re.findall(toc_dotted_line_regex, text)) == 0:
                flags = flags | flag_toc_ended
    output_doc = original_doc
    print("excluding pages ... " + str(pages_to_exclude))
    output_doc.delete_pages(from_page=1, to_page=pages_to_exclude[-1])
    output_doc.save(pdf_name)

    output_dir = "markdown_results/"
    
    convert_with_nougat(pdf_name, output_dir)

    output_pdf_name = output_dir + os.path.splitext(os.path.basename(pdf_name))[0] + ".mmd"

    elements = partition_md(filename=output_pdf_name)

    os.remove(pdf_name)
    
    return elements
    

In [4]:
def load_from_directory(directory_path):
    pdfs = glob.glob(directory_path + "/*.pdf")
    documents = []
    for pdf in tqdm(pdfs):
        documents.append(extract_page_contents(pdf))
    # documents.append(extract_page_contents(pdfs[1]))
    return documents

In [5]:
ocr_only_documents = load_from_directory("persist_storage")

  0%|                                                                             | 0/7 [00:00<?, ?it/s]

saving .. 3EST000235-2562_Ben_CCU-S2_Safety_Manual.pdf
excluding pages ... [1, 2]


Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 14%|█████████▋                                                          | 1/7 [02:16<13:37, 136.23s/

saving .. 3EST000235-2511_Ben_CCU-S2_ECP-USB_Content_Specification.pdf
excluding pages ... [1, 2, 3]


 29%|███████████████████▍                                                | 2/7 [07:22<19:42, 236.53s/it]

saving .. 3EST000236-8112_Ben_CCU-S2_Product_Release Note_1.2.1.0.pdf
excluding pages ... [1, 2, 3]


 43%|█████████████████████████████▏                                      | 3/7 [09:01<11:34, 173.73s/it]

saving .. 3EGM081750-0035_Gen_TBCI_and_TBI-ED_Interface_Control_Document.pdf
excluding pages ... [1, 2, 3, 4, 5]


 57%|██████████████████████████████████████▊                             | 4/7 [14:07<11:17, 225.82s/it]

saving .. 3EST000235-2556_Een_CCU-S2_User_Manual.pdf
excluding pages ... [5, 6, 7, 8, 9]


 71%|████████████████████████████████████████████████▌                   | 5/7 [24:28<12:16, 368.40s/it]

saving .. 3EGM007200D2806_Yen_MITRAC_Control_C&C_Platform_Catalogue.pdf
excluding pages ... [1, 2, 3, 4, 5, 6]


 86%|██████████████████████████████████████████████████████████▎         | 6/7 [32:22<06:44, 404.14s/it]

saving .. 3EST000235-2552_Aen_CCU-S2_Interface_Control_Document.pdf
excluding pages ... [1, 2]


100%|████████████████████████████████████████████████████████████████████| 7/7 [34:37<00:00, 296.84s/it]


In [34]:
unstructured_markdown_documents = load_from_directory("persist_storage")

saving .. 3EST000235-2511_Ben_CCU-S2_ECP-USB_Content_Specification.pdf
excluding pages ... [1, 2, 3]


In [35]:
unrefined_documents = load_from_directory("persist_storage")

  0%|                                                                             | 0/7 [00:00<?, ?it/s]

saving .. 3EST000235-2562_Ben_CCU-S2_Safety_Manual.pdf
excluding pages ... [1, 2]
b'INFO: likely hallucinated title at the end of the page: #### 2.1.1 CCU-\n'


 14%|█████████▊                                                           | 1/7 [01:27<08:42, 87.07s/it]

saving .. 3EST000235-2511_Ben_CCU-S2_ECP-USB_Content_Specification.pdf
excluding pages ... [1, 2, 3]
b''


 29%|███████████████████▍                                                | 2/7 [04:33<12:06, 145.33s/it]

saving .. 3EST000236-8112_Ben_CCU-S2_Product_Release Note_1.2.1.0.pdf
excluding pages ... [1, 2, 3]


 43%|█████████████████████████████▏                                      | 3/7 [05:22<06:46, 101.58s/it]

b''
saving .. 3EGM081750-0035_Gen_TBCI_and_TBI-ED_Interface_Control_Document.pdf
excluding pages ... [1, 2, 3, 4, 5]


 57%|██████████████████████████████████████▊                             | 4/7 [07:14<05:16, 105.45s/it]

b''
saving .. 3EST000235-2556_Een_CCU-S2_User_Manual.pdf
excluding pages ... [5, 6, 7, 8, 9]
b'INFO: likely hallucinated title at the end of the page: ### Intended Audience\n'


 71%|████████████████████████████████████████████████▌                   | 5/7 [13:24<06:41, 200.96s/it]

saving .. 3EGM007200D2806_Yen_MITRAC_Control_C&C_Platform_Catalogue.pdf
excluding pages ... [1, 2, 3, 4, 5, 6]
b'INFO: likely hallucinated title at the end of the page: #### 3.8.2 Node-ID Plug - 12SX02\n'


 86%|██████████████████████████████████████████████████████████▎         | 6/7 [17:00<03:26, 206.22s/it]

saving .. 3EST000235-2552_Aen_CCU-S2_Interface_Control_Document.pdf
excluding pages ... [1, 2]


100%|████████████████████████████████████████████████████████████████████| 7/7 [18:44<00:00, 160.71s/it]

b'INFO: likely hallucinated title at the end of the page: ### Data Set Identifiers\n'





In [7]:
unrefined_documents2 = load_from_directory("persist_storage")

NameError: name 'load_from_directory' is not defined

## Document Loading

In [9]:
document_data_file_yolox = "documents_yolox.pickle"

In [11]:
with open(document_data_file_yolox, 'wb') as file:
    pickle.dump(ocr_only_documents, file)

In [25]:
document_data_file = "documents_yolox.pickle"
with open(document_data_file, "rb") as file:
    documents = pickle.load(file)

In [3]:
documents[2]

[<unstructured.documents.elements.NarrativeText at 0x7de11a39b990>,
 <unstructured.documents.elements.Text at 0x7de11a39bdd0>,
 <unstructured.documents.elements.Header at 0x7de11a3a4250>,
 <unstructured.documents.elements.NarrativeText at 0x7de11a3a4650>,
 <unstructured.documents.elements.Title at 0x7de11a3a4a90>,
 <unstructured.documents.elements.Title at 0x7de11a3a4e90>,
 <unstructured.documents.elements.Image at 0x7de11a3a52d0>,
 <unstructured.documents.elements.Title at 0x7de11a3a56d0>,
 <unstructured.documents.elements.Image at 0x7de11a3a5ad0>,
 <unstructured.documents.elements.Title at 0x7de11a3a5ed0>,
 <unstructured.documents.elements.Text at 0x7de11a3a6310>,
 <unstructured.documents.elements.Title at 0x7de11a3a6750>,
 <unstructured.documents.elements.Title at 0x7de11a3a6b90>,
 <unstructured.documents.elements.Title at 0x7de11a3a6f90>,
 <unstructured.documents.elements.Title at 0x7de11a3a73d0>,
 <unstructured.documents.elements.Image at 0x7de11a3a7810>,
 <unstructured.documents.

In [4]:
for element in ocr_only_documents[2]:
    if( "unstructured.documents.elements.Table" in str(type(element))):
        print(element.metadata.text_as_html)
    else:
        print(element.text)
    print("====")

NameError: name 'ocr_only_documents' is not defined

### Cleaning Headers and Footers

In [26]:
def find_footer(str_list):
    # print(list(map(lambda x : len(x), str_list)))
    data = Counter(str_list)
    commons = list(map(lambda x: x[0], data.most_common(3)))
    return max(commons, key=len)

def clean_footer(documents):
    filtered_documents = []
    for current_document in documents:
        filtered_elements = list(filter(lambda x: "unstructured.documents.elements.Table" not in str(type(x)) and len(x.text) > 0, current_document))
        footer_str = find_footer(list(map(lambda x: x.text, filtered_elements)))
        filtered_document = list(filter(lambda x: footer_str not in x.text, current_document))
        filtered_documents.append(filtered_document)
    return filtered_documents

In [27]:
filtered_documents = clean_footer(documents[:-1])
filtered_documents.append(documents[-1])

In [28]:
def chunk_using_unstructured(documents):
    chunks_list = []
    for document in documents:
        chunks = chunk_by_title(elements=document, max_characters=2200, new_after_n_chars=1600, overlap=128)
        # chunks = chunk_elements(elements=document, max_characters=1024, new_after_n_chars=768, overlap=128)
        chunks_list.append(chunks)
    return chunks_list

In [29]:
filtered_chunk_list = chunk_using_unstructured(filtered_documents)

In [30]:
for element in filtered_chunk_list[4]:
    print(type(element))

<class 'unstructured.documents.elements.CompositeElement'>
<class 'unstructured.documents.elements.Table'>
<class 'unstructured.documents.elements.CompositeElement'>
<class 'unstructured.documents.elements.Table'>
<class 'unstructured.documents.elements.CompositeElement'>
<class 'unstructured.documents.elements.Table'>
<class 'unstructured.documents.elements.TableChunk'>
<class 'unstructured.documents.elements.TableChunk'>
<class 'unstructured.documents.elements.CompositeElement'>
<class 'unstructured.documents.elements.Table'>
<class 'unstructured.documents.elements.CompositeElement'>
<class 'unstructured.documents.elements.Table'>
<class 'unstructured.documents.elements.CompositeElement'>
<class 'unstructured.documents.elements.Table'>
<class 'unstructured.documents.elements.Table'>
<class 'unstructured.documents.elements.CompositeElement'>
<class 'unstructured.documents.elements.Table'>
<class 'unstructured.documents.elements.CompositeElement'>
<class 'unstructured.documents.element

In [31]:
for element in filtered_chunk_list[4]:
    if( "unstructured.documents.elements.Table" in str(type(element))):
        print(element.metadata.text_as_html)
    else:
        print(element.text)
    print("====")

ALSTOM

MITRAC TCMS CCU-S2 - User Manual

For product revision 1.2.0

Doc ID-number: 3EST000235-2556

Revision: _E

Language: en

[tens]

ALSTOM

Product name: CCU-S2

Version: 1.2.0

Doc. type: User Manual

Doc ID-number: 3EST000235-2556

Revision: _E

Langauge: en

Page: 11/191

1 Introduction

1.1 Purpose

This document describes the main functionalities of the CCU-S2 and how to use them.

This document is divided in the following main parts:
====
<table><thead><th>Chapter</th><th>Purpose</th></thead><tr><td>Pf</td><td>An overview of the CCU-S2.</td></tr><tr><td>w]</td><td>How to update a factory installed CCU-S2 unit.</td></tr><tr><td>BY]</td><td>Functional features</td></tr><tr><td>oy}</td><td>Communication and I/O</td></tr><tr><td>mm]</td><td>Diagnostics.</td></tr><tr><td>NI</td><td>Safety related verification activities required.</td></tr><tr><td>CO}</td><td>Limitations and performance</td></tr><tr><td>OO}</td><td>Troubleshooting the CCU-S2.</td></tr><tr><td>10-11</td><td>Relate

In [32]:
def find_header(element):
    string = element.text
    if "unstructured.documents.elements.Table" not in str(type(element)):
        return False
    else:
        # print("matching " + string)
        id_regex = r"\d[A-Z]{3}\d{6}[-A-Z]\d{4}"
        matches = re.findall(id_regex, string)
        if len(matches) > 0 and "Revision" in string and "Page" in string and len(string.split())<23:
            return True
        else:
            return False
            
def merge_tables(document):
    merged_document = []
    for i, element in enumerate(document):
        if "unstructured.documents.elements.TableChunk" in str(type(element)):
            if i-1>=0 and "unstructured.documents.elements.TableChunk" in str(type(document[i-1])):
                merged_document[-1].text += element.text
                merged_document[-1].metadata.text_as_html += element.metadata.text_as_html
            elif i-1>=0 and "unstructured.documents.elements.TableChunk" not in str(type(document[i-1])):
                merged_document.append(element)
            elif i == 0:
                merged_document.append(element)
        else:
            merged_document.append(element)
    return merged_document
            
def align_table_chunks(documents):
    cleaned_documents = []
    for document in documents:
        merged_document = merge_tables(document)
        clean_document = []
        document_populated = False
        for i, element in enumerate(merged_document):
            if "unstructured.documents.elements.Table" in str(type(element)):
                if document_populated:
                    clean_document[-1].text += "\n" + element.metadata.text_as_html
                else:
                    clean_document.append(element)
                    document_populated = True
            else:
                clean_document.append(element)
                document_populated = True
        cleaned_documents.append(clean_document)
    return cleaned_documents


def clean_header(documents):
    filtered_documents = []
    filtered_headers = []
    for current_document in documents:
        filtered_document = list(filter(lambda x: not find_header(x), current_document))
        headers = list(filter(lambda x: find_header(x), current_document))
        if len(headers)>0:
            filtered_headers.append(headers[0].text)
        filtered_documents.append(filtered_document)
    cleaned_documents = align_table_chunks(filtered_documents)
    return cleaned_documents, filtered_headers

In [33]:
cleaned_documents, document_headers = clean_header(filtered_chunk_list)

In [34]:
len(cleaned_documents)

7

In [35]:
for element in cleaned_documents[3]:
    print(element)
    # print(vars(element.metadata))
    print("====")

Responsible Division ISC Prepared Responsible Unit TPCC Document Type Title Description Document State 2015-10-26 Jochen Breidt Interface Control Document Released Checked Object/Project Document Identify Number 2016-04-29 Michael Reismann TBCI 3EGM081750-0035 Approved File Name Language Revision Page 2016-04-29 Peter Sandberg 3EGM081750-0035 TBCI Interface Control Document.doc EN _G 1/92
====
TBCI

Interface Control Document

[tens] by BOMBARDIER

1 1 End Node | Middle Node | End Node [4------ + lll LS esa ee SSeS — >| WTB |: H WTB : WTB : WTB Hh ! I ! i NEE ¥ INE Gw-Fw | GWw-FW | Gw-Fw TCN-GW-S | TCN-GW-S | TCN-GW-S B | I 8 I 8 | g 3 3 3 | I | 3 3 % 2 | 2 | 2 8 I | I | eat, Train Train i nat Train Feati Application | control Controt {7 APPlication Control! {7 APPlication CCU-S i CCU-S CCU-S Application process data <— Protocols for TCN / UIC train <—» Control and status data “<---> Train wide control commands according UIC e. g. NADI confirm to TCN Master inauguration Detection proto

## Summary Chain

In [13]:
# Prompt
prompt_text = """You are an assistant tasked with summarizing tables and text for retrieval. \
    These summaries will be embedded and used to retrieve the raw text or table elements. \
    Give a concise summary of the table or text that is well optimized for retrieval. Table or text: {element}"""
prompt = ChatPromptTemplate.from_template(prompt_text)

# Summary chain
model = ChatOllama(model="llama3:latest")
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

In [16]:
summary_documents = []
for clean_document in tqdm(cleaned_documents):
    clean_texts = list(map(lambda x: x.text, clean_document))
    summary_document = summarize_chain.batch(clean_texts, {"max_concurrency": 5})
    summary_documents.append(summary_document)

100%|██████████████████████████████████████████████████████████████████| 7/7 [1:17:18<00:00, 662.65s/it]


In [17]:
len(summary_documents)

7

In [18]:
summary_data_file = "documents_yolox_summary.pickle"
with open(summary_data_file, 'wb') as file:
    pickle.dump(summary_documents, file)

In [14]:
summary_data_file = "documents_yolox_summary.pickle"
with open(summary_data_file, "rb") as file:
    summary_documents = pickle.load(file)

## Keyword Chain

In [18]:
keyword_prompt_text = """You are an assistant tasked with finding keywords for retrieval. \
    These keywords will be embedded and used to retrieve the raw text or table elements. \
    Provide at most 5 keywords in a comma seperated manner representing the idea of the text that is well optimized for retrieval. And only provide the keywords with no additional texts.\
    Text: {element}"""
keyword_prompt = ChatPromptTemplate.from_template(keyword_prompt_text)
keyword_chain = {"element": lambda x: x} | keyword_prompt | model | StrOutputParser()

In [20]:
keyword_texts = []
for summary_document in tqdm(summary_documents):
    clean_texts = list(map(lambda x: x, summary_document))
    keyword_text = keyword_chain.batch(clean_texts, {"max_concurrency": 50})
    keyword_texts.append(keyword_text)

100%|████████████████████████████████████████████████████████████████████| 7/7 [11:57<00:00, 102.52s/it]


### Clean Keywords

In [31]:
cleaned_keywords = []
for keyword_text in keyword_texts:
    temp_keyword = list(map(lambda x: x.split("Keywords:")[-1].strip(), keyword_text))
    cleaned_keyword = list(map(lambda x: x.split("\n\n")[-1].strip(), temp_keyword))
    cleaned_keywords.append(cleaned_keyword)

In [34]:
keywords_data_file = "documents_yolox_summary_keyword.pickle"
with open(keywords_data_file, 'wb') as file:
    pickle.dump(cleaned_keywords, file)

In [35]:
keywords_data_file = "documents_yolox_summary_keyword.pickle"
with open(keywords_data_file, "rb") as file:
    keyword_texts = pickle.load(file)

## Chuking using PDF

In [13]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
docs = splitter.split_documents(documents)

## Chunking using Markdown

In [22]:
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
    ('####', "Header 4"),
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
markdown_splits = markdown_splitter.split_text(documents[0])

In [24]:
markdown_splits

[Document(page_content='|Responsible division DIS|Responsible unit WKCS|Document type Safety Manual|Confidentiality status|Col5|Col6|\n|---|---|---|---|---|---|\n|Prepared 2022-10-07 Francisco Moreira||Title MITRAC TCMS CCU-S2 Safety Manual|Document state Released|||\n|Checked 2022-11-10 Christian Strzyz|||3EST000235-2562|||\n|Approved 2022-11-11 Gunnar Björnstad||File name 3EST000235-2562 CCU-S2 Safety Manual.doc|Language en|Revision _B|Page 1/44|  \n![3EST000235-2562_Ben_CCU-S2_Safety_Manual.pdf-0-2.png](3EST000235-2562_Ben_CCU-S2_Safety_Manual.pdf-0-2.png)'),
 Document(page_content='![3EST000235-2562_Ben_CCU-S2_Safety_Manual.pdf-0-0.png](3EST000235-2562_Ben_CCU-S2_Safety_Manual.pdf-0-0.png)  \n![3EST000235-2562_Ben_CCU-S2_Safety_Manual.pdf-0-1.png](3EST000235-2562_Ben_CCU-S2_Safety_Manual.pdf-0-1.png)  \n**Export from DOORS**  \nDOORS Module Name: 3EST000235-2562 CCU-S2 Safety Manual  \nDOORS Baseline: Current 0.14 (_B)  \nDOORS Location: /TCMS platform/300 Product level requirement

## Chunking using Unstructured

In [74]:
unstructured_chunk_list = chunk_using_unstructured(documents)

In [75]:
for element in unstructured_chunk_list[0]:
    if element.category == "Table":
        print(element.metadata)
    else:
        print(element.text)
    print("=====")

n e D _ v e R 1 0 - 4 0 5 6 1 0 - 5 0 - 0 2 - 0 2 - C P P

:

Id:

d

I

e a p m e T

t

l
=====
<unstructured.documents.elements.ElementMetadata object at 0x7fd08de640d0>
=====
MITRAC TCMS

CCU-S2 Safety Manual

MITRAC TCMS System Contiguration with TCN for Control and Dynamic IP for Comiort Veh 01 Veh 02 Veh 03 Veh 04 Veh 05 Dd ee od Ce area)

Export from DOORS

DOORS Module Name: 3EST000235-2562 CCU-S2 Safety Manual

DOORS Baseline: Current 0.14 (_B)

DOORS Location: /TCMS platform/300 Product level requirement and design/CCU-S2

© ALSTOM SA 2022. All rights reserved. Reproduction, use or disclosure to third parties, without express written authorization from Alstom, is strictly prohibited

ALSTOM

CCU-S2 Safety Manual

3EST000235-2562

Revision _B

Page 4/44

Document Introduction

1

1.1 Abstract

The MITRAC CCU-S2 Safety Manual contains instructions, information and safety considerations that shall be followed in the application of the generic CCU-S2 when used for applications wi

## Chunking using Unstructured + Markdown

In [49]:
unstructured_markdown_chunk_list = chunk_using_unstructured(unstructured_markdown_documents)

In [50]:
for element in unstructured_markdown_chunk_list[0]:
    if element.category == "Table":
        print(element.metadata.text_as_html)
    else:
        print(element.text)
    print("=====")

<table><tr><td>Responsible division DIS</td><td>Responsible unit WKCS</td><td>Document type Technical Description</td><td>Confidentiality status --</td><td>Col5</td><td>Col6</td></tr><tr><td>Prepared 2022-05-19 Sven Linder</td><td></td><td>Title CCU-S2 ECP-USB Content Specification</td><td>Document state Released</td><td></td><td></td></tr><tr><td>Checked 2022-06-23 Christer Olsson</td><td></td><td></td><td>3EST000235-2511</td><td></td><td></td></tr><tr><td>Approved 2022-06-23 Peder Johansson</td><td></td><td>File name 3EST000235-2511 CCU-S2 ECP-USB Content Spe cification.pdf generated via 3EST000221-9153 ECP-USB Content Specification EA Model</td><td>Revision _B</td><td>Language en</td><td>Pages 1/88</td></tr></table>
=====
CCU-S2 ECP-USB Content Specification

© ALSTOM SA 2022. All rights reserved. Reproduction, use or disclosure to third parties, without express written authorization from Alstom,

is strictly prohibited.
=====
<table><tr><td>Col1</td><td>Language: en</td><td>Revisio

## Chunking unrefined

In [70]:
unrefined_chunk_list = chunk_using_unstructured(unrefined_documents)

In [76]:
for element in unrefined_chunk_list[6]:
    # print(element.category)
    # print(vars(element))
    if element.category == "Table":
        print(element.metadata.text_as_html)
    else:
        print(element.text)
    print("=====")

the evolution of mobility

MITRAC TCMS CCU-S2 Interface Control Document

_______________________________________________________________________________ _________

Doc ID-number:

3EST000235-2552

Revision: _A

Language: en

[tei J

MITRAC TCMS

CCU-S2 ICD

Language

Language

Revision

Revision

Page

en

_A

2

3EST000235-2552

Table of Contents

1 Introduction ............................................................................ 4 1.1 Purpose ...................................................................................................... 4 1.2 Intended Audience ..................................................................................... 4 1.3 Related documents .................................................................................... 4 1.4 Abbreviations and Definitions .................................................................... 5
=====
<table><tr><td>1.2</td><td>Intended Audience</td></tr><tr><td>1.3 1.4</td><td>Related doCUMENES 00... eee e

In [82]:
unrefined_chunk_list2 = chunk_using_unstructured(unrefined_documents2)

In [84]:
for element in unrefined_chunk_list2[1]:
    # print(element.category)
    # print(vars(element))
    if element.category == "Table":
        print(element.metadata.text_as_html)
    else:
        print(element.text)
    print("=====")

<table><tr><td rowspan="4">DIS Prepared 2022-05-19 Checked 2022-06-23</td><td rowspan="4">WKCS Sven Linder Christer Olsson</td><td>Technical Description</td><td>--</td><td></td><td></td></tr><tr><td></td><td></td><td>Title</td><td colspan="3">Document state A LSTO M</td></tr><tr><td></td><td></td><td>CCU-S2 ECP-USB Content</td><td colspan="3">Released</td></tr><tr><td></td><td></td><td>Specification</td><td colspan="3">3EST000235-2511</td></tr><tr><td>Approved</td><td></td><td>File name</td><td>Revision</td><td>Language</td><td>| Pages</td></tr><tr><td>2022-06-23</td><td>Peder Johansson _|</td><td>Sismwasiunmnaraterace rotten ean</td><td>|B</td><td>en</td><td>1/88</td></tr></table>
=====
CCU-S2 ECP-USB Content Specification

357000212 2624 PR. GHA BYY oooer 34

© ALSTOM SA 2022. All rights reserved. Reproduction, use or disclosure to third parties, without express written authorization from Alstom, is strictly prohibited.
=====
<table><thead><th></th><th>Language:</th><th>Page:</th><th

## Load Sentence Embedding

In [4]:
# Initialize torch settings
DEVICE = torch.device('cpu' 
   if torch.cuda.is_available() 
   else 'cpu')
# Load the encoder model from huggingface model hub.
model_name = "mixedbread-ai/mxbai-embed-large-v1"
encoder = SentenceTransformer(model_name, device=DEVICE)


# Get the model parameters and save for later.
MAX_SEQ_LENGTH = encoder.get_max_seq_length() 
EMBEDDING_LENGTH = encoder.get_sentence_embedding_dimension()


## Milvus Collection Creation

In [38]:
chunk_list = []
for (text_chunks, summary_chunks, keyword_chunks) in tqdm(zip(cleaned_documents, summary_documents, keyword_texts)):
    for (text_chunk, summary_chunk, keyword_chunk) in tqdm(zip(text_chunks, summary_chunks, keyword_chunks)):
        # Generate embeddings using encoder from HuggingFace.
        text_summary = summary_chunk.split("\n\n",1)[-1]
        embeddings = torch.tensor(encoder.encode([text_summary]))
        keyword_embeddings = torch.tensor(encoder.encode([keyword_chunk]))
        # embeddings = torch.tensor(encoder.encode([chunk.page_content]))
        embeddings = F.normalize(embeddings, p=2, dim=1)
        keyword_embeddings = F.normalize(keyword_embeddings, p=2, dim=1)
        
        converted_values = list(map(np.float32, embeddings))[0]
        keyword_converted_values = list(map(np.float32, keyword_embeddings))[0]
        # Assemble embedding vector, original text chunk, metadata.
        chunk_dict = {
            'vector': converted_values,
            'keyword_vector': keyword_converted_values,
            'text': text_chunk.text,
            'summary_text': text_summary,
            'keywords': keyword_chunk,
            'source': text_chunk.metadata.filename,}
        chunk_list.append(chunk_dict)

0it [00:00, ?it/s]
0it [00:00, ?it/s][A
1it [00:00,  2.02it/s][A
3it [00:00,  5.77it/s][A
5it [00:00,  8.62it/s][A
7it [00:00, 10.89it/s][A
9it [00:00, 12.94it/s][A
11it [00:01, 14.61it/s][A
13it [00:01, 14.87it/s][A
15it [00:01, 15.42it/s][A
17it [00:01, 15.97it/s][A
20it [00:01, 17.73it/s][A
22it [00:01, 16.54it/s][A
24it [00:01, 17.14it/s][A
26it [00:01, 16.98it/s][A
28it [00:02, 16.83it/s][A
31it [00:02, 17.78it/s][A
34it [00:02, 18.85it/s][A
36it [00:02, 18.24it/s][A
38it [00:02, 18.28it/s][A
40it [00:02, 18.14it/s][A
42it [00:02, 17.76it/s][A
44it [00:02, 18.01it/s][A
46it [00:03, 18.07it/s][A
48it [00:03, 18.21it/s][A
51it [00:03, 19.06it/s][A
53it [00:03, 18.74it/s][A
55it [00:03, 18.85it/s][A
58it [00:03, 20.34it/s][A
61it [00:03, 20.77it/s][A
64it [00:03, 16.20it/s][A
1it [00:03,  3.95s/it]
0it [00:00, ?it/s][A
3it [00:00, 19.47it/s][A
6it [00:00, 20.10it/s][A
9it [00:00, 17.17it/s][A
11it [00:00, 17.05it/s][A
13it [00:00, 15.15it/s][A
15it

In [39]:
chunk_list[50]

{'vector': array([-0.01398961,  0.00451811,  0.0061275 , ..., -0.02664636,
         0.00488809,  0.03021417], dtype=float32),
 'keyword_vector': array([-0.03023824,  0.002345  ,  0.01365139, ..., -0.03936822,
         0.01140285,  0.0370791 ], dtype=float32),
 'text': 'The content of the non-volatile memory is automatically erased once the device identity, the consist identity or the version (V) in a safety application V.R.U.B is replaced/changed. These are user con- figurable parameters, therefore the CCU-S2 depends on that they are correctly managed and main- tained in order to determine whether or not to erase the non-volatile memory. See the CCU-S2 User Manual [12].\n\nTOOLS-REP-SRAC-150 After any maintenance activity it shall be made sure that the correct ECP- USBs are plugged to their safety devices and the verification procedure de- fined in TOOLS-REP-SRAC-126 is re-executed for each maintained device.\n\nThe definition for maintenance has several possible use cases:\n\nUse case

## Milvus Connection

In [5]:
encoding = tiktoken.get_encoding("cl100k_base")

In [6]:
ENDPOINT=f'http://localhost:19530'
client = MilvusClient(
  uri=ENDPOINT)

In [7]:
EMBEDDING_LENGTH

1024

In [37]:
COLLECTION_NAME = "UnstructureddRefinedDocs"

## Milvus Collection Insertion

In [47]:
client.drop_collection(collection_name=COLLECTION_NAME)

In [48]:
# 1. Define a minimum expandable schema.
fields = [
    FieldSchema("pk", DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema("vector", DataType.FLOAT_VECTOR, dim=EMBEDDING_LENGTH),
    FieldSchema("keyword_vector", DataType.FLOAT_VECTOR, dim=EMBEDDING_LENGTH),
]
schema = CollectionSchema(
   fields,
   enable_dynamic_field=True,)

index_params = client.prepare_index_params()

index_params.add_index(
    field_name="vector",
    index_type="AUTOINDEX",
    metric_type="COSINE"
)
index_params.add_index(
    field_name="keyword_vector",
    index_type="FLAT",
    metric_type="COSINE"
)

client.create_collection(collection_name=COLLECTION_NAME, schema=schema, index_params=index_params)

# 2. Create the collection.
# mc = Collection("MilvusDocs", schema)

# 3. Index the collection.
# mc.create_index(
#    field_name="vector",
#    index_params={
#        "index_type": "AUTOINDEX",
#        "metric_type": "COSINE",})

In [49]:
# Insert data into the Milvus collection.
# insert_result = mc.insert(chunk_list)
insert_result = client.insert(collection_name=COLLECTION_NAME, data=chunk_list)

# After final entity is inserted, call flush 
# to stop growing segments left in memory.

# print(mc.partitions)

In [50]:
client.describe_collection(collection_name=COLLECTION_NAME)

{'collection_name': 'UnstructuredYoloxKeywordRefinedDocs',
 'auto_id': True,
 'num_shards': 1,
 'description': '',
 'fields': [{'field_id': 100,
   'name': 'pk',
   'description': '',
   'type': <DataType.INT64: 5>,
   'params': {},
   'auto_id': True,
   'is_primary': True},
  {'field_id': 101,
   'name': 'vector',
   'description': '',
   'type': <DataType.FLOAT_VECTOR: 101>,
   'params': {'dim': 1024}},
  {'field_id': 102,
   'name': 'keyword_vector',
   'description': '',
   'type': <DataType.FLOAT_VECTOR: 101>,
   'params': {'dim': 1024}}],
 'aliases': [],
 'collection_id': 450433855148775382,
 'consistency_level': 2,
 'properties': {},
 'num_partitions': 1,
 'enable_dynamic_field': True}

In [51]:
res = client.query(collection_name=COLLECTION_NAME, output_fields=["count(*)"])
res

[{'count(*)': 883}]

In [52]:
res = client.query(collection_name=COLLECTION_NAME, filter="", offset = 100, limit=5)
res

[{'vector': [0.039032944,
   -0.027098177,
   0.032851715,
   0.048891425,
   -0.046514925,
   -0.010161658,
   -0.010116343,
   -0.0061183884,
   0.011240874,
   0.04749966,
   0.002583984,
   0.02433764,
   0.0073255384,
   0.020555867,
   -0.041298967,
   0.002914948,
   0.005094661,
   0.0040265354,
   -0.026460765,
   0.02053075,
   0.008002463,
   0.022009399,
   -0.06561151,
   0.0018352633,
   -0.025994416,
   0.028002534,
   -0.051278763,
   0.021354875,
   0.042241327,
   0.05225363,
   -0.030040693,
   0.019532146,
   -0.020877015,
   -0.027094405,
   -0.011481502,
   0.0012322582,
   0.033284515,
   -0.018232651,
   -0.032140736,
   -0.030726105,
   0.012780065,
   0.011718639,
   0.013358975,
   -0.03943259,
   -0.027636643,
   0.0046296003,
   0.0032455185,
   -0.008369186,
   0.0123516945,
   -0.058295596,
   0.0029774997,
   0.019641578,
   0.032826297,
   -0.008786692,
   -0.026020853,
   -0.019433057,
   -0.009591623,
   0.007089974,
   -0.001625974,
   0.020455606,
 

## Chatbot Testing

In [9]:
# template = """

# Context: {context}

# Question: {question}

# Answer: Let's work this out in a step by step way to be sure we have the right answer."""

template = """
[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible.  Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>>
Previous Conversation:
{chat_history}

Context: {context}

Question: {question}

[/INST]

"""

summary_template = """
[INST] <<SYS>>
As a professional summarizer, create a comprehensive summary of the provided context, while adhering to these guidelines:
* Craft a summary that is detailed, thorough, in-depth, and complex, while maintaining clarity and conciseness.
* Incorporate main ideas and essential information, and focusing on critical aspects.
* Rely strictly on the provided text, without including external information.
* Format the summary in paragraph form for easy understanding.
<</SYS>>

Context: {context}

[/INST]

"""

toc_qa_prompt = """
[INST] <<SYS>>
Here are characteristics of table of contents pages and main content pages:
Table of Contents: Has numbered list, Lists chapter/section titles, lists tables/figures, has dots leading to numbers, often titled as "table of contents"/ "list of figures"/ "list of tables".
Main Content: Contains names of sections and the description of those sections, paragraphs of text, likely longer.

Considering these features, is the following page_content a table of contents page? Answer only in yes or no.
<</SYS>>

page_content: {context}

[/INST]

"""

keyword_prompt_text = """
[INST] <<SYS>>
    You are an assistant tasked with finding keywords for retrieval using the question below. \
    These keywords will be embedded and used to retrieve raw text or table elements. \
    Provide keywords in a comma seperated manner representing the idea of the question that is well optimized for retrieval. Provide the keywords with no additional texts.\
<</SYS>>

question: {context}

[/INST]

"""

prompt = PromptTemplate.from_template(template)
summary_prompt = PromptTemplate.from_template(summary_template)
toc_qa_prompt = PromptTemplate.from_template(toc_qa_prompt)
keyword_prompt = PromptTemplate.from_template(keyword_prompt_text)

In [10]:
# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

In [11]:
gc.collect()
torch.cuda.empty_cache()

In [12]:
n_gpu_layers = 24  # The number of layers to put on the GPU. The rest will be on the CPU. If you don't know how many layers there are, you can use -1 to move all to GPU.
n_batch = 512 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
n_ctx = 4096
max_tokens = 1024

# llm = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7b-Chat-GGUF", model_file="llama-2-7b-chat.q4_K_M.gguf", model_type="llama", gpu_layers=50)


# Make sure the model path is correct for your system!
llm = ChatOllama(model="llama3:latest",
    n_ctx=n_ctx,
    n_gpu_layers = n_gpu_layers,
    n_batch=n_batch,
    max_tokens=max_tokens,
    # callback_manager=callback_manager,
    # verbose=True,  # Verbose is required to pass to the callback manager
)
# llm = LlamaCpp(
#     model_path="models/llama-2-7b-chat.Q4_K_M.gguf",
#     n_ctx=n_ctx,
#     n_gpu_layers = n_gpu_layers,
#     n_batch=n_batch,
#     max_tokens=max_tokens,
#     # callback_manager=callback_manager,
#     # verbose=True,  # Verbose is required to pass to the callback manager
# )

In [24]:
QUESTION = "What are the complex type tables?"
QUERY = [QUESTION]

# Before conducting a search, load the data into memory.
client.load_collection(collection_name=COLLECTION_NAME)

# Embed the question using the same encoder.
embedded_question = torch.tensor(encoder.encode([QUESTION]))
# Normalize embeddings to unit length.
embedded_question = F.normalize(embedded_question, p=2, dim=1)
# Convert the embeddings to list of list of np.float32.
embedded_question = list(map(np.float32, embedded_question))

# Return top k results with AUTOINDEX.
TOP_K = 5

# Run semantic vector search using your query and the vector database.
start_time = time.time()
results = client.search(
    collection_name=COLLECTION_NAME,
    data=embedded_question, 
    anns_field="vector", 
    output_fields=["text", "source"], 
    limit=TOP_K)

elapsed_time = time.time() - start_time
print(f"Milvus search time: {elapsed_time} sec")


Milvus search time: 0.0044934749603271484 sec


In [None]:
for n, hits in enumerate(results):
    print(f"{n}th query result")
    for hit in hits:
        print(hit)


In [33]:
# Assemble the context as a stuffed string.
context = ""
for r in results[0]:
    text = r['entity']['text']
    context += f"{text} "

# Also save the context metadata to retrieve along with the answer.
context_metadata = {
   "source": results[0][0]["entity"]["source"],}

In [None]:
full_question = prompt.format(context = context, question = question)
full_question

In [13]:
chat_memory = ConversationSummaryBufferMemory(llm=llm, max_token_limit=1000, memory_key="chat_history", input_key="question")
# context_memory = ConversationSummaryBufferMemory(llm=llm, max_token_limit = 2000, memory_key = "context", input_key="question")
# memory = CombinedMemory(memories=[chat_memory, context_memory])

In [14]:
summary_token_limit = 1300
llm_chain = LLMChain(llm=llm, prompt=prompt, memory=chat_memory, verbose=True)
llm_summary_chain = LLMChain(llm = llm, prompt=summary_prompt, verbose=True)
llm_tocqa_chain = LLMChain(llm=llm, prompt=toc_qa_prompt)
llm_keyword_chain = LLMChain(llm=llm, prompt=keyword_prompt)

llm_chain = prompt | llm
llm_summary_chain = summary_prompt | llm

In [15]:
def check_token_count(context, encoding):
    encoded_question = encoding.encode(context)
    print("token count: ")
    print(len(encoded_question) + 70)
    return len(encoded_question) + 70

In [16]:
def get_context(results):
    context = ""
    for r in results[0]:
        text = r['entity']['text']
        context += f"\n{text} "
    
    # Also save the context metadata to retrieve along with the answer.
    context_metadata = {
       "source": results[0][0]["entity"]["source"],}
    return context

In [17]:
def get_context_kw(results):
    context = ""
    for r in results[0]:
        text = r.entity.text
        context += f"\n{text} "
    
    # Also save the context metadata to retrieve along with the answer.
    context_metadata = {
       "source": results[0][0].entity.source,}
    return context

In [18]:
def clean_keywords(text):
    temp_keyword = text.split("Keywords:")[-1].strip()
    cleaned_keyword = temp_keyword.split("\n\n")[-1].strip()
    return cleaned_keyword

In [19]:
def generate_context(question, milvus_client, COLLECTION_NAME, encoder, context, encoding, prompt):
    
    milvus_client.load_collection(collection_name=COLLECTION_NAME)
    # Embed the question using the same encoder.
    embedded_question = torch.tensor(encoder.encode([question]))
    # Normalize embeddings to unit length.
    embedded_question = F.normalize(embedded_question, p=2, dim=1)
    # Convert the embeddings to list of list of np.float32.
    embedded_question = list(map(np.float32, embedded_question))
    
    # Return top k results with AUTOINDEX.
    TOP_K = 3
    
    # Run semantic vector search using your query and the vector database.
    start_time = time.time()
    results = milvus_client.search(
        collection_name=COLLECTION_NAME,
        data=embedded_question, 
        anns_field="vector", 
        output_fields=["text", "source"], 
        limit=TOP_K)
    
    elapsed_time = time.time() - start_time
    print(f"Milvus search time: {elapsed_time} sec")
    new_context = get_context(results)
    temp_context = context + f"{new_context}"
    if check_token_count(temp_context, encoding) > summary_token_limit:
        full_question = prompt.format(context = temp_context)
        check_token_count(full_question, encoding)
        result = llm_summary_chain.invoke({"context": temp_context})
        return result['text']
    else:
        return temp_context
    

In [20]:
def generate_context_with_keywords(question, rerank, COLLECTION_NAME, encoder, context, encoding, prompt):
    
    # milvus_client.load_collection(collection_name=COLLECTION_NAME)
    connections.connect(host="localhost", port="19530")
    milvus_collection = Collection(name=COLLECTION_NAME)
    # Embed the question using the same encoder.
    embedded_question = torch.tensor(encoder.encode([question]))
    # Normalize embeddings to unit length.
    embedded_question = F.normalize(embedded_question, p=2, dim=1)
    # Convert the embeddings to list of list of np.float32.
    embedded_question = list(map(np.float32, embedded_question))
    
    # Return top k results with AUTOINDEX.
    TOP_K = 3
    
    # Run semantic vector search using your query and the vector database.
    start_time = time.time()
    
    search_param_1 = {
        "data": embedded_question, # Query vector
        "anns_field": "vector", # Vector field name
        "param": {
            "metric_type": "COSINE", # This parameter value must be identical to the one used in the collection schema
            "params": {"nprobe": 10}
        },
        "limit": 10 # Number of search results to return in this AnnSearchRequest
    }
    request_1 = AnnSearchRequest(**search_param_1)

    question_keywords = llm_keyword_chain.invoke({"context": question})['text']
    quesiton_keywords = clean_keywords(question_keywords)

    embedded_keywords = torch.tensor(encoder.encode([quesiton_keywords]))
    # Normalize embeddings to unit length.
    embedded_keywords = F.normalize(embedded_keywords, p=2, dim=1)
    # Convert the embeddings to list of list of np.float32.
    embedded_keywords = list(map(np.float32, embedded_keywords))
    
    search_param_2 = {
        "data": embedded_keywords, # Query vector
        "anns_field": "keyword_vector", # Vector field name
        "param": {
            "metric_type": "COSINE", # This parameter value must be identical to the one used in the collection schema
            "params": {"nprobe": 10}
        },
        "limit": 10 # Number of search results to return in this AnnSearchRequest
    }
    request_2 = AnnSearchRequest(**search_param_2)

    reqs = [request_1, request_2]
    
    results = milvus_collection.hybrid_search(
        reqs=reqs,
        rerank=rerank,
        output_fields=["text", "source"], 
        limit=TOP_K)
    
    elapsed_time = time.time() - start_time
    print(f"Milvus search time: {elapsed_time} sec")
    
    new_context = get_context_kw(results)
    temp_context = context + f"{new_context}"
    if check_token_count(temp_context, encoding) > summary_token_limit:
        full_question = prompt.format(context = temp_context)
        check_token_count(full_question, encoding)
        result = llm_summary_chain.invoke({"context": temp_context})
        return result['text']
    else:
        return temp_context
    

In [21]:
rerank = WeightedRanker(0.5,0.5)

In [36]:
context = ""
chat_memory.clear()
while True:
    user_input = input("You: ")
    if user_input.lower() == "exit":
        print("Chatbot: Thanks!")
        break
    # context = ""
    context = generate_context(user_input, client, COLLECTION_NAME, encoder, context, encoding, summary_prompt)
    # context = generate_context_with_keywords(user_input, rerank, COLLECTION_NAME, encoder, context, encoding, summary_prompt)
    result = llm_chain.invoke({"context": context, "question": user_input})
    response = result['text']
    print("Chatbot:", response)

KeyboardInterrupt: Interrupted by user