In [64]:
import uuid
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores.chroma import Chroma
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_openai import OpenAIEmbeddings
import pickle


In [77]:
## Config embeddings
class CustomMultiVectorRetriever:
    """Build a ``MultiVectorRetriever`` that searches summaries and returns the full tables.

    Summaries are embedded into a Chroma collection named ``"table_summaries"``.
    The tables themselves are kept in an ``InMemoryStore`` and are linked to
    their summaries through a shared id stored in each summary's metadata.
    """

    def __init__(self, embedding_model:Embeddings) -> None:
        """Create the Chroma collection that will hold the summary embeddings.

        Args:
            embedding_model: Embedding implementation passed to Chroma as its
                ``embedding_function``.
        """
        self.embedding_model = embedding_model
        # The vectorstore used to index the child chunks (the summaries).
        self.vectorstore = Chroma(
            collection_name="table_summaries",
            embedding_function=embedding_model,
        )

    def index_and_store_summaries(self, summary_tables, tables, retriever:MultiVectorRetriever):
        """Index every summary and store its table under the same generated id.

        Args:
            summary_tables: One summary string per table, in the same order as
                ``tables``.
            tables: The full table contents to return at query time.
            retriever: Retriever whose ``vectorstore`` receives the summaries
                and whose ``docstore`` receives the tables.

        Raises:
            ValueError: If ``summary_tables`` and ``tables`` differ in length,
                because the summaries could then not be paired with tables.
        """
        # Materialise both inputs: `tables` is traversed more than once, which
        # would silently yield nothing on the second pass for a generator.
        summary_tables = list(summary_tables)
        tables = list(tables)
        if len(summary_tables) != len(tables):
            raise ValueError(
                "summary_tables and tables must have the same length "
                f"(got {len(summary_tables)} summaries for {len(tables)} tables)"
            )

        # Use the retriever's own id key so the stored metadata matches what
        # the retriever looks up; fall back to the key this class configures.
        id_key = getattr(retriever, "id_key", "doc_id")

        # One fresh UUID per table; the matching summary carries it in metadata.
        table_ids = [str(uuid.uuid4()) for _ in tables]
        summary_tables_docs = [
            Document(page_content=summary, metadata={id_key: table_id})
            for table_id, summary in zip(table_ids, summary_tables)
        ]
        retriever.vectorstore.add_documents(summary_tables_docs)
        retriever.docstore.mset(list(zip(table_ids, tables)))
        print("Embedding Text, Table to Vectorstore Complete.")

    def get_retriever(self, tables, summary_tables):
        """Create a retriever and populate it with the given tables and summaries.

        Each call creates a new ``InMemoryStore`` but reuses ``self.vectorstore``.
        NOTE(review): calling this more than once on the same instance therefore
        adds further summaries to the same collection while the earlier tables
        live only in the earlier docstore -- confirm that is intended.

        Args:
            tables: The full table contents to return at query time.
            summary_tables: One summary string per table, in the same order.

        Returns:
            The populated ``MultiVectorRetriever``.
        """
        # The storage layer for the parent documents (the full tables).
        store = InMemoryStore()
        id_key = "doc_id"

        # The retriever starts empty and is filled by the indexing call below.
        retriever = MultiVectorRetriever(
            vectorstore=self.vectorstore,
            docstore=store,
            id_key=id_key,
        )
        self.index_and_store_summaries(
            tables=tables,
            summary_tables=summary_tables,
            retriever=retriever,
        )

        return retriever

# Embedding model used by the vector stores built in this notebook.
EMBEDDING_MODEL_NAME = "text-embedding-3-small"
embedding_model = OpenAIEmbeddings(model=EMBEDDING_MODEL_NAME)

# Plain Chroma vector store for the text chunks (collection "text").
# Despite its name, this object is a vector store, not a retriever.
simpleVectorRetriever = Chroma(
    embedding_function=embedding_model,
    collection_name="text",
)


In [79]:
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# a file that was produced by a trusted step of this pipeline.
with open('elements_processed.pkl', 'rb') as file:
    elem = pickle.load(file)

# Keep only the raw text of the elements tagged as text.
text_elem = [e.text for e in elem if e.type=='text']

# `from_texts` is a classmethod: calling it through the `simpleVectorRetriever`
# instance ignored that instance and built a new store in the default
# collection. Call it on the class and name the collection explicitly so the
# texts are indexed into "text" (presumably the same collection that
# `simpleVectorRetriever` wraps -- confirm both use the same Chroma client).
rv = Chroma.from_texts(
    text_elem,
    embedding=embedding_model,
    collection_name="text",
)


In [80]:
# Wrap the text vector store in a retriever; `as_retriever` is called with no
# arguments, so the library's default search settings apply.
rve = rv.as_retriever()

In [63]:

temporaryhtmlelements = ['<table cellspacing="0"><thead><tr><th>Supply specification KWN 49037</th></tr></thead><tbody><tr><td>Revision: A</td></tr></tbody></table>', '<table cellspacing="0"><thead><tr><th>Group</th><th>Inclusion thickness</th><th>Max. permitted class</th></tr></thead><tbody><tr><td rowspan="2">A</td><td>Fine</td><td>3</td></tr><tr><td>Thick</td><td>3</td></tr><tr><td rowspan="2">B</td><td>Fine</td><td>2,5</td></tr><tr><td>Thick</td><td>1,5</td></tr><tr><td rowspan="2">C</td><td>Fine</td><td>2,5</td></tr><tr><td>Thick</td><td>1,5</td></tr><tr><td rowspan="2">D</td><td>Fine</td><td>2</td></tr><tr><td>Thick</td><td>1,5</td></tr><tr><td>DS</td><td></td><td>2</td></tr></tbody></table>', '- highly malleable, individual grey particles - numerous non deformable, angular, low with a wide range of aspect ratios aspect ratio (generally < 3), black or bluish (length/width) and generally rounded ends particles (at least three) aligned in the deformation direction ', 'ratios (generally > 3) and generally sharp randomly distributed particles ends ', '<table cellspacing="0"><tbody><tr><td>Supply specification</td></tr><tr><td>KWN 49037</td></tr><tr><td>Revision: A</td></tr></tbody></table>', '<table cellspacing="0"><tbody><tr><td>Supply specification KWN 49037</td></tr><tr><td>Revision: A</td></tr></tbody></table>', '<table cellspacing="0"><tbody><tr><td>Supply specification</td></tr><tr><td>Revision: A</td></tr></tbody></table>', '<table cellspacing="0"><thead><tr><th>No.</th><th>Indication</th><th>Value / Name</th><th></th><th></th><th>Reference/Note</th></tr></thead><tbody><tr><td>1</td><td>Material designation</td><td colspan="3">31CrMoV9</td><td>EN 10085</td></tr><tr><td>2</td><td>Material number</td><td>1.8519</td><td></td><td></td><td>EN 10085</td></tr><tr><td>3</td><td>Mechanical properties</td><td>acc. 
drawing</td><td></td><td></td><td>Verification is done on one sample per material charge</td></tr><tr><td>4</td><td>Manufacturing process</td><td colspan="3">Drop-forged, free-form forged or rolled</td><td>Without inclusions and folds</td></tr><tr><td>5</td><td>Forging grade</td><td>F</td><td></td><td></td><td>EN 10243-1</td></tr><tr><td>6</td><td>Tolerances</td><td>-</td><td></td><td></td><td>EN 10243-1</td></tr><tr><td>7</td><td>Fineness</td><td colspan="3" rowspan="2">- M1</td><td>EN 10243-1</td></tr><tr><td>8</td><td>Steel grade</td><td>EN 10243-1</td></tr><tr><td>9</td><td>Surface finish</td><td colspan="3">clean descaled</td><td></td></tr><tr><td>10</td><td>Chemical composition</td><td>Acc. EN 10085 DIN EN 683-5 Deviating S ≤ 0,02%</td><td>or from standard:</td><td></td><td>Melt analysis per material charge</td></tr><tr><td rowspan="9">11</td><td rowspan="9">Cleanliness</td><td colspan="3">Inclusion group Width Klasse</td><td rowspan="10">per melting charge sample, per material charge</td></tr><tr><td rowspan="2">A</td><td>Fine</td><td>2</td></tr><tr><td>Thick</td><td>1</td></tr><tr><td>B</td><td>Fine</td><td>2</td></tr><tr><td rowspan="2">C</td><td>Thick</td><td>1</td></tr><tr><td>Fine</td><td>0,5</td></tr><tr><td></td><td>Thick</td><td>0,5</td></tr><tr><td>D</td><td>Fine</td><td>1</td></tr><tr><td rowspan="2">DS</td><td>Thick</td><td>1</td></tr><tr><td></td><td></td><td></td><td>2</td></tr><tr><td>12</td><td>Grain size</td><td colspan="3">≥ 6 for 90% of the area Single grain ≤ 3 is prohibited</td><td>DIN EN ISO 643 After heat treatment on the test bar</td></tr><tr><td>13</td><td>Heat treatment condition</td><td colspan="2">+QT</td><td></td><td>EN 10085 or DIN EN 683-5</td></tr><tr><td>14</td><td>Ultrasonic testing</td><td>acc. drawing</td><td></td><td></td><td>EN 10228-3</td></tr><tr><td>15</td><td>Magnetic particle inspection</td><td>acc. drawing</td><td></td><td></td><td>EN 10228-1</td></tr><tr><td>16</td><td>Penetration test</td><td>acc. 
drawing</td><td></td><td></td><td>EN 10228-2</td></tr><tr><td>17</td><td>Component marking</td><td>acc. drawing</td><td></td><td></td><td></td></tr><tr><td>18</td><td>Inspection certificate</td><td>Acceptance</td><td>test certificate</td><td>3.1</td><td>EN 10204</td></tr></tbody></table>', '2 Material number 1.8519 EN 10085 3 Mechanical properties acc. drawing Verification is done on one sample per material charge 4 Manufacturing process Drop-forged, free-form forged or Without inclusions and folds rolled 5 Forging grade F EN 10243-1 6 Tolerances - EN 10243-1 7 Fineness - EN 10243-1 8 Steel grade M1 EN 10243-1 9 Surface finish clean descaled 10 Chemical composition Acc. EN 10085 or Melt analysis per material DIN EN 683-5 charge Deviating from standard: S ≤ 0,02% 11 Cleanliness Inclusion Width Klasse per melting charge sample, group per material charge A Fine 2 Thick 1 B Fine 2 Thick 1 C Fine 0,5 Thick 0,5 D Fine 1 Thick 1 DS 2 12 Grain size ≥ 6 for 90% of the area DIN EN ISO 643 Single grain ≤ 3 is prohibited After heat treatment on the test bar 13 Heat treatment condition +QT EN 10085 or DIN EN 683-5 14 Ultrasonic testing acc. drawing EN 10228-3 15 Magnetic particle inspection acc. drawing EN 10228-1 16 Penetration test acc. drawing EN 10228-2 17 Component marking acc. drawing 18 Inspection certificate Acceptance test certificate 3.1 EN 10204 ', '<table cellspacing="0"><thead><tr><th>No.</th><th>Indication</th><th>Value / Name</th><th></th><th></th><th>Reference/Note</th></tr></thead><tbody><tr><td>1</td><td>Material designation</td><td colspan="3">30CrNiMo8</td><td>EN 10083-3</td></tr><tr><td>2</td><td>Material number</td><td>1.6580</td><td></td><td></td><td>EN 10083-3</td></tr><tr><td>3</td><td>Mechanical properties</td><td>acc. 
drawing</td><td></td><td></td><td>Verification is done on one sample per material charge</td></tr><tr><td>4</td><td>Manufacturing process</td><td>Drop-forged, rolled</td><td>free-form</td><td>forged or</td><td>Without inclusions and folds</td></tr><tr><td>5</td><td>Forging grade</td><td>F</td><td></td><td></td><td>EN 10243-1</td></tr><tr><td>6</td><td>Tolerances</td><td>-</td><td></td><td></td><td>EN 10243-1</td></tr><tr><td>7</td><td>Fineness</td><td>-</td><td></td><td></td><td>EN 10243-1</td></tr><tr><td>8</td><td>Steel grade</td><td colspan="3">M1</td><td>EN 10243-1</td></tr><tr><td>9</td><td>Surface finish</td><td colspan="3" rowspan="2">clean descaled Acc. EN 10083-3 or DIN EN 683-2 Deviating from standard: S ≤ 0,02% Cu ≤ 0,3%</td><td></td></tr><tr><td>10</td><td>Chemical composition</td><td>Melt analysis per material charge</td></tr><tr><td rowspan="10">11</td><td>Cleanliness</td><td>Inclusion</td><td>Width</td><td>Klasse</td><td rowspan="11">per melting charge sample, per material charge</td></tr><tr><td></td><td>group</td><td></td><td></td></tr><tr><td></td><td>A</td><td>Fine 2</td><td></td></tr><tr><td></td><td></td><td>Thick 1</td><td></td></tr><tr><td></td><td>B</td><td>Fine 2</td><td></td></tr><tr><td></td><td rowspan="2">C</td><td>Thick 1</td><td></td></tr><tr><td></td><td>Fine 0,5</td><td></td></tr><tr><td></td><td></td><td>Thick 0,5</td><td></td></tr><tr><td></td><td rowspan="2">D</td><td>Fine 1</td><td></td></tr><tr><td></td><td>Thick 1</td><td></td></tr><tr><td></td><td></td><td>DS</td><td>2</td><td></td></tr><tr><td>12</td><td>Grain size</td><td colspan="3">≥ 5 for 90% of the area Single grain ≤ 3 is prohibited</td><td>DIN EN ISO 643 After heat treatment on the test bar</td></tr><tr><td>13</td><td>Heat treatment condition</td><td>+QT</td><td></td><td></td><td>EN 10083-3 or DIN EN 683-2</td></tr><tr><td>14</td><td>Ultrasonic testing</td><td>acc. 
drawing</td><td></td><td></td><td>EN 10228-3</td></tr><tr><td>15</td><td>Magnetic particle inspection</td><td>acc. drawing</td><td></td><td></td><td>EN 10228-1</td></tr><tr><td>16</td><td>Penetration test</td><td>acc. drawing</td><td></td><td></td><td>EN 10228-2</td></tr><tr><td>17</td><td>Component marking</td><td>acc. drawing</td><td></td><td></td><td></td></tr><tr><td>18</td><td>Inspection certificate</td><td>Acceptance</td><td>test certificate</td><td>3.1</td><td>EN 10204</td></tr></tbody></table>', '<table cellspacing="0"><thead><tr><th>Revision</th><th>Date</th><th>Changes</th></tr></thead><tbody><tr><td>A</td><td>03/06/2022</td><td>First edition</td></tr></tbody></table>']

# Load the pre-computed summaries that describe the entries of
# `temporaryhtmlelements`.
with open('table_summaries.pkl', 'rb') as file:
    table_summaries = pickle.load(file)

### CustomMultiRetriever
# Build the helper first, then ask it for a retriever populated with the
# summaries (searched) and the original elements (returned).
table_retriever_builder = CustomMultiVectorRetriever(embedding_model=embedding_model)
customTableSummaryRetriever = table_retriever_builder.get_retriever(
    tables=temporaryhtmlelements,
    summary_tables=table_summaries,
)

  self._core_bpe = _tiktoken.CoreBPE(mergeable_ranks, special_tokens, pat_str)


Embedding Text, Table to Vectorstore Complete.


In [75]:
# Debug check of the object's type.
# NOTE(review): the recorded output shows `list`, although the In [77] cell
# binds this name to a Chroma instance. This cell's execution count (75) is
# lower than 77, so the output presumably predates that binding -- re-run to confirm.
print(type(simpleVectorRetriever))

<class 'list'>


In [81]:
from langchain.retrievers import EnsembleRetriever

# Combine the text retriever and the table-summary retriever, weighting both
# equally.
ensemble_retriever = EnsembleRetriever(
    weights=[0.5, 0.5],
    retrievers=[rve, customTableSummaryRetriever],
)

In [83]:
## Works well. Next: attach a reranker here and run it through to the final output.

In [84]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder

def pretty_print_docs(docs):
    """Print each document's text under a numbered heading.

    Consecutive documents are separated by a line of 100 dashes.

    Args:
        docs: Iterable of objects exposing a string ``page_content`` attribute.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + doc.page_content)
    print(divider.join(sections))


# Cross-encoder used to re-score the ensemble retriever's candidates.
# It is named `reranker_model` rather than `model` because a later cell binds
# `model` to the chat LLM; sharing one name for two unrelated objects makes the
# notebook's state depend on which cell ran last.
reranker_model = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-v2-m3")

# Reranker configured with top_n=5.
compressor = CrossEncoderReranker(model=reranker_model, top_n=5)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=ensemble_retriever
)

# Smoke test: retrieve, rerank and show the results for a sample query.
compressed_docs = compression_retriever.invoke("Chemical composition?")
pretty_print_docs(compressed_docs)

  from .autonotebook import tqdm as notebook_tqdm


Document 1:

<table cellspacing="0"><thead><tr><th>No.</th><th>Indication</th><th>Value / Name</th><th></th><th></th><th>Reference/Note</th></tr></thead><tbody><tr><td>1</td><td>Material designation</td><td colspan="3">31CrMoV9</td><td>EN 10085</td></tr><tr><td>2</td><td>Material number</td><td>1.8519</td><td></td><td></td><td>EN 10085</td></tr><tr><td>3</td><td>Mechanical properties</td><td>acc. drawing</td><td></td><td></td><td>Verification is done on one sample per material charge</td></tr><tr><td>4</td><td>Manufacturing process</td><td colspan="3">Drop-forged, free-form forged or rolled</td><td>Without inclusions and folds</td></tr><tr><td>5</td><td>Forging grade</td><td>F</td><td></td><td></td><td>EN 10243-1</td></tr><tr><td>6</td><td>Tolerances</td><td>-</td><td></td><td></td><td>EN 10243-1</td></tr><tr><td>7</td><td>Fineness</td><td colspan="3" rowspan="2">- M1</td><td>EN 10243-1</td></tr><tr><td>8</td><td>Steel grade</td><td>EN 10243-1</td></tr><tr><td>9</td><td>Surface finish<

In [86]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_openai import ChatOpenAI

In [89]:
# Prompt that instructs the model to answer only from the retrieved context.
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Chat model (temperature=0, max_tokens=2048) and a parser that reduces its
# reply to a plain string.
model = ChatOpenAI(model="gpt-4-turbo-preview", temperature=0, max_tokens=2048)
output_parser = StrOutputParser()

# Feed the incoming question to the reranking retriever (-> "context") and
# pass it through unchanged (-> "question").
# NOTE(review): the retriever's output is handed to the prompt as-is;
# presumably a list of Document objects -- confirm how it renders in {context}.
setup_and_retrieval = RunnableParallel(
    context=compression_retriever,
    question=RunnablePassthrough(),
)
chain = setup_and_retrieval | prompt | model | output_parser


In [93]:
# NOTE(review): the query is an empty string, yet the recorded output below
# describes material 30CrNiMo8 -- presumably the original question was cleared
# from this cell; restore it before re-running.
question = ""
response = chain.invoke(question)

print(response)

Based on the provided documents, here is the specific information related to the material 30CrNiMo8:

1. **Material Designation**: 30CrNiMo8
2. **Material Number**: 1.6580
3. **Mechanical Properties**: According to the drawing, with verification done on one sample per material charge.
4. **Manufacturing Process**: Drop-forged, rolled, free-form forged or without inclusions and folds.
5. **Forging Grade**: F, as per EN 10243-1.
6. **Tolerances**: As per EN 10243-1.
7. **Fineness**: As per EN 10243-1.
8. **Steel Grade**: M1, according to EN 10243-1.
9. **Surface Finish**: Clean descaled according to EN 10083-3 or DIN EN 683-2, with deviations from standard: S ≤ 0.02%, Cu ≤ 0.3%.
10. **Chemical Composition**: Melt analysis per material charge.
11. **Cleanliness**: Inclusion group width class per melting charge sample, per material charge, with specific classes for A (Fine 2, Thick 1), B (Fine 2), C (Thick 1, Fine 0.5, Thick 0.5), D (Fine 1, Thick 1), and DS (2).
12. **Grain Size**: ≥ 5 fo