In [None]:
# Cookbook for multi-modal (text + tables + images) RAG 
# (https://github.com/langchain-ai/langchain/blob/master/cookbook/Semi_Structured_RAG.ipynb)
# (https://github.com/langchain-ai/langchain/blob/master/cookbook/Semi_structured_and_multi_modal_RAG.ipynb) 
# + Option_3: Retrieve image summary but pass raw image to LLM for synthesis
# pdf2image needs Poppler for Windows,
# Go to: https://github.com/oschwartz10612/poppler-windows/releases
# Download the latest Release-24.08.0.zip file.
# Create the folder "C:\Program Files\poppler\poppler-24.08.0" and extract the zip into it.
# Add Poppler to PATH: System Variables ---> New ---> "C:\Program Files\poppler\poppler-24.08.0\Library\bin"
# download and install "Tesseract" (https://github.com/UB-Mannheim/tesseract/wiki)
# Add Tesseract to PATH: System Variables ---> New ---> "C:\Program Files\Tesseract-OCR"


In [1]:
# Folder that holds the input PDF. The file name is appended to it by plain string
# concatenation later in the notebook, so the value must end with "/".
# Forward slashes are used because a backslash inside a normal Python string
# starts an escape sequence.
# NOTE(review): this is a machine-specific absolute path — adjust it before running elsewhere.
path = "D:/4-IntoCode/16_LangChain/AgilProjekt_multiModel/"  # use / instead of \


In [2]:
from typing import Any

from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf

# Partition the PDF into elements
pdf_source = f"{path}AKAP1.pdf"
raw_pdf_elements = partition_pdf(
    filename=pdf_source,
    # Look for embedded image blocks in the PDF
    extract_images_in_pdf=True,
    # Layout model (YOLOX) supplies bounding boxes for tables and detects titles;
    # a title marks any sub-section of the document
    infer_table_structure=True,
    # Post-processing: aggregate text under each detected title
    chunking_strategy="by_title",
)

In [3]:
# Tally how many partitioned elements there are of each element class.
# Keys are the string form of the class, e.g. "<class '...CompositeElement'>".
category_counts = {}
for pdf_element in raw_pdf_elements:
    type_name = str(type(pdf_element))
    category_counts[type_name] = category_counts.get(type_name, 0) + 1

# The distinct element classes that were seen
unique_categories = set(category_counts)
category_counts


{"<class 'unstructured.documents.elements.CompositeElement'>": 197,
 "<class 'unstructured.documents.elements.TableChunk'>": 3}

In [4]:
class Element(BaseModel):
    """Lightweight record pairing a partitioned PDF element's content with a coarse type label."""

    # Category label; this notebook assigns either "table" or "text".
    type: str
    # Element content; this notebook stores the element's string form here.
    text: Any


# Sort the raw elements into "table" and "text" records.
# Matching is done on the class name string, so every class whose qualified name
# starts with "unstructured.documents.elements.Table" counts as a table.
# Elements of any other class are left out.
categorized_elements = []
for raw_element in raw_pdf_elements:
    type_repr = str(type(raw_element))
    if "unstructured.documents.elements.Table" in type_repr:
        label = "table"
    elif "unstructured.documents.elements.CompositeElement" in type_repr:
        label = "text"
    else:
        continue
    categorized_elements.append(Element(type=label, text=str(raw_element)))

# Tables
table_elements = [item for item in categorized_elements if item.type == "table"]
print(len(table_elements))

# Text
text_elements = [item for item in categorized_elements if item.type == "text"]
print(len(text_elements))

3
197


In [None]:
# Multi-vector retriever
# Text and Table summaries

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate


In [6]:
# Summarization prompt: one template shared by text chunks and tables
prompt_text = (
    "You are an assistant tasked with summarizing tables and text. "
    "Give a concise summary of the table or text. Table or text chunk: {element} "
)
prompt = ChatPromptTemplate.from_template(prompt_text)


# Summary chain
import getpass
import os

# Prompt for the key only when the environment does not already provide a non-empty one
if not os.environ.get("GOOGLE_API_KEY"):
    # Google Gemini is used instead of OpenAI
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter API key for Google Gemini: ")

from langchain_google_genai import ChatGoogleGenerativeAI

# gemini-1.5-flash is used here instead of gemini-2.0-flash
model = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0)

# Pipeline: wrap the raw input as {"element": ...} -> fill the prompt -> call the model
# -> reduce the model output to a plain string
summarize_chain = (
    {"element": lambda chunk: chunk}
    | prompt
    | model
    | StrOutputParser()
)

In [None]:
# Apply to text
import time

texts = [i.text for i in text_elements]

# Each chunk is sent to the model exactly once, with a pause between requests,
# to avoid 429 (ResourceExhausted) errors from the API.
TEXT_SUMMARY_LIMIT = 10  # number of leading chunks to summarize; None = all chunks
REQUEST_PAUSE_S = 5      # pause between consecutive requests
RETRY_PAUSE_S = 30       # wait after a failed request before trying it again
MAX_ATTEMPTS = 3         # tries per chunk before giving up

texts_to_summarize = texts if TEXT_SUMMARY_LIMIT is None else texts[:TEXT_SUMMARY_LIMIT]

# text_summaries[i] is the summary of texts[i]; the vectorstore cell pairs them by
# position, so a chunk is never skipped: after MAX_ATTEMPTS failures the error is
# re-raised instead of silently leaving a gap.
text_summaries = []
for position, text in enumerate(texts_to_summarize):
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            result = summarize_chain.invoke(text)
            break
        except Exception as e:
            print(f"Error: {e}")
            if attempt == MAX_ATTEMPTS:
                raise
            time.sleep(RETRY_PAUSE_S)
    text_summaries.append(result)
    print(result)  # show each summary as soon as it is available
    if position < len(texts_to_summarize) - 1:
        time.sleep(REQUEST_PAUSE_S)

# Name used by the original version of this cell; kept as an alias
summaries = text_summaries

In [None]:
# Text summary result (the run was stopped by a ResourceExhausted error after about 19 minutes):
'''This is the header information for a research article published in the *Journal of Molecular and Cellular Cardiology*.  The article, titled "A-kinase anchoring protein 1 (AKAP1) and its role in some cardiovascular diseases," was authored by Wenwen Marin from Qingdao University and focuses on AKAP1's involvement in cardiovascular illnesses.
This abstract likely discusses the role of A-kinase anchoring proteins (AKAPs) in mitochondrial signaling pathways, specifically within the context of cardiovascular diseases.
This review focuses on AKAP1, a mitochondrial A-kinase anchoring protein crucial for cardiac function.  It will summarize AKAP1's sequence, structure-function relationships with binding partners, and its role in the molecular mechanisms of cardiac hypertrophy.
AKAP1 is a potential therapeutic target for cardiovascular disease, due to its involvement in hypoxia-induced myocardial infarction and endothelial dysfunction.
AKAPs are essential "conductors" in cellular signal transduction, similar to a conductor leading an orchestra.  They position signaling molecules (enzymes, receptors, mRNA) in specific locations within the cell, optimizing signaling and maintaining homeostasis.
AKAPs are scaffold proteins that bind to PKA regulatory subunits via an amphipathic alpha-helix, and also possess domains for anchoring signaling complexes to specific subcellular locations.
A-kinase anchoring proteins (AKAPs) are crucial for precise regulation of cAMP signal transduction.  They localize key enzymes (e.g., adenylyl cyclase, phosphodiesterase, protein phosphatase) and factors (e.g., GPCRs) within cellular structures (plasma membrane, cytoskeleton, etc.) to facilitate efficient cAMP signaling.

AKAPs enhance cAMP signaling efficiency by forming multi-functional complexes with various proteins (ion channels, kinases, GTPases, etc.), thus integrating signaling crosstalk.'''

In [22]:
# Apply to tables
import time

tables = [i.text for i in table_elements]

# Each table is sent to the model exactly once, with a pause between requests,
# to avoid 429 errors from the API.
REQUEST_PAUSE_S = 5   # pause between consecutive requests
RETRY_PAUSE_S = 30    # wait after a failed request before trying it again
MAX_ATTEMPTS = 3      # tries per table before giving up

# table_summaries[i] is the summary of tables[i]; the vectorstore cell pairs them by
# position, so a table is never skipped: after MAX_ATTEMPTS failures the error is
# re-raised instead of silently leaving a gap.
table_summaries = []
for position, table in enumerate(tables):
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            result = summarize_chain.invoke(table)
            break
        except Exception as e:
            print(f"Error: {e}")
            if attempt == MAX_ATTEMPTS:
                raise
            time.sleep(RETRY_PAUSE_S)
    table_summaries.append(result)
    print(result)  # show each summary as soon as it is available
    if position < len(tables) - 1:
        time.sleep(REQUEST_PAUSE_S)

# Name used by the original version of this cell; kept as an alias
summaries = table_summaries



The table shows the expression levels of various AKAP (A-kinase anchoring proteins) family members (e.g., AKAP1, AKAP2, AKAP110) in a tissue or cell type, as assessed by antibody detection and RNA sequencing (HPA-RNA-seq).  Many showed low or undetectable expression levels.  Specific expression levels are given as numerical values (e.g., 0.0, 42, 75), but the units are not specified.
The table shows gene expression data from two sources, GTEx RNA-seq and FANTOM5 CAGE.  The data includes numerical values (likely counts or percentages) but lacks clear column headers making precise interpretation impossible.  The numbers vary significantly between the two datasets.
The table lists several protein motifs and their approximate amino acid residue ranges within a protein sequence.  Motifs include tubulin binding, leucine zipper, mitochondrial targeting sequence, PP1 binding domain, PKA/RII binding site, and Tudor domain.  Specific residue ranges are provided for some, while others are indicat

In [None]:
# Table summary Result:
'''The table shows the expression levels of various AKAP (A-kinase anchoring proteins) family members (e.g., AKAP1, AKAP2, AKAP110) in a tissue or cell type, as assessed by antibody detection and RNA sequencing (HPA-RNA-seq).  Many showed low or undetectable expression levels.  Specific expression levels are given as numerical values (e.g., 0.0, 42, 75), but the units are not specified.
The table shows gene expression data from two sources, GTEx RNA-seq and FANTOM5 CAGE.  The data includes numerical values (likely counts or percentages) but lacks clear column headers making precise interpretation impossible.  The numbers vary significantly between the two datasets.
The table lists several protein motifs and their approximate amino acid residue ranges within a protein sequence.  Motifs include tubulin binding, leucine zipper, mitochondrial targeting sequence, PP1 binding domain, PKA/RII binding site, and Tudor domain.  Specific residue ranges are provided for some, while others are indicated more generally.'''

In [None]:
# Add to vectorstore
# Use Multi Vector Retriever with summaries:
# InMemoryStore stores the raw text, tables
# vectorstore stores the embedded summaries

import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_core.documents import Document

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# The vectorstore to use to index the child chunks (the summaries)
vectorstore = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)
# The collection is persisted on disk, while the parent documents below live in an
# InMemoryStore under UUIDs that are regenerated on every run. Summaries left over
# from an earlier run would point at ids the docstore no longer knows, so start
# from an empty collection each time this cell runs.
vectorstore.reset_collection()

# The storage layer for the parent documents
store = InMemoryStore()
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)


def _index_summaries(summaries, originals):
    """Index summaries in the vectorstore and store their originals in the docstore.

    One fresh UUID is created per original. The i-th summary is tagged with the
    i-th id under ``id_key`` in its metadata, and every original is written to
    the docstore under its id. Originals without a summary are still stored,
    but have no vector entry.

    Args:
        summaries: summary strings, positionally aligned with ``originals``;
            may be shorter than ``originals``.
        originals: the raw text or table strings that were summarized.

    Returns:
        Tuple ``(ids, summary_docs)``: the generated ids (one per original) and
        the ``Document`` objects built from the summaries.
    """
    ids = [str(uuid.uuid4()) for _ in originals]
    summary_docs = [
        Document(page_content=summary, metadata={id_key: parent_id})
        for parent_id, summary in zip(ids, summaries)
    ]
    # Nothing to embed when there are no summaries (e.g. a PDF without tables)
    if summary_docs:
        retriever.vectorstore.add_documents(summary_docs)
    retriever.docstore.mset(list(zip(ids, originals)))
    return ids, summary_docs


# Add texts
doc_ids, summary_texts = _index_summaries(text_summaries, texts)

# Add tables
table_ids, summary_tables = _index_summaries(table_summaries, tables)

In [24]:
# Run RAG pipeline.
from langchain_core.runnables import RunnablePassthrough

# Prompt template: the model is told to answer from the retrieved context only
template = (
    "Answer the question based only on the following context, which can include text and tables:\n"
    "{context}\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)

# LLM
model = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0)

# RAG pipeline: the incoming question is sent to the retriever to build "context"
# and is also passed through unchanged as "question"; the filled prompt then goes
# to the model and the reply is reduced to a plain string.
retrieval_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = retrieval_inputs | prompt | model | StrOutputParser()

In [None]:
# Ask a sample question; the chain ends in StrOutputParser, so the answer comes back
# as a plain string and is shown as this cell's output.
chain.invoke("What is the role of AKAPs in the cardiovascular system?")