In [1]:
! pip install langchain langchain-community langchain-chroma "unstructured[all-docs]" pydantic lxml



In [2]:
# Refresh the apt package index, then install poppler-utils without prompting (-y).
# NOTE(review): presumably a system dependency of the PDF partitioning step — confirm.
!apt-get update
!apt-get install -y poppler-utils


Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease                          
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease         
Hit:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease                                      
Hit:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease                                    
Hit:6 http://security.ubuntu.com/ubuntu jammy-security InRelease                                    
Hit:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease                                          
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease     
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as rep

In [4]:
from typing import Any

from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf

# Source document, and the directory handed to the partitioner for extracted images
data_folder = "/kaggle/input/diabetes-book/Handbook of Diabetes.pdf"
extracted_img = "/kaggle/working/"

# Post-processing settings: aggregate text blocks once titles have been found
# (a title is any sub-section of the document)
chunking_options = dict(
    chunking_strategy="by_title",
    max_characters=4000,  # hard max on chunks
    new_after_n_chars=3800,  # attempt to create a new chunk at 3800 chars
    combine_text_under_n_chars=2000,  # attempt to keep chunks > 2000 chars
)

# Get elements
raw_pdf_elements = partition_pdf(
    filename=data_folder,
    # Use the pdf format to find embedded image blocks
    extract_images_in_pdf=True,
    # Use the layout model (YOLOX) to get bounding boxes (for tables) and find titles
    infer_table_structure=True,
    image_output_dir_path=extracted_img,
    **chunking_options,
)

yolox_l0.05.onnx:   0%|          | 0.00/217M [00:00<?, ?B/s]

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/115M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/46.8M [00:00<?, ?B/s]

In [5]:
# Tally how many elements of each concrete type the partitioner returned,
# keyed by the string form of the element's class
category_counts = {}
for element in raw_pdf_elements:
    category = str(type(element))
    category_counts[category] = category_counts.get(category, 0) + 1

# The distinct type names that were seen
# (TableChunk if Table > max chars set above)
unique_categories = set(category_counts)
category_counts

{"<class 'unstructured.documents.elements.CompositeElement'>": 388}

In [6]:
class Element(BaseModel):
    """A partitioned document element reduced to a category label and its content."""

    # Category label; the categorization loop below assigns "table" or "text"
    type: str
    # Element content; the categorization loop below stores str(element) here
    text: Any


# Categorize by type: label each partitioned element "table" or "text" from the
# name of its class; elements of any other class are left out
categorized_elements = []
for element in raw_pdf_elements:
    type_name = str(type(element))
    if "unstructured.documents.elements.Table" in type_name:
        kind = "table"
    elif "unstructured.documents.elements.CompositeElement" in type_name:
        kind = "text"
    else:
        continue
    categorized_elements.append(Element(type=kind, text=str(element)))

# Split into tables and text, then report how many of each were found
table_elements = [e for e in categorized_elements if e.type == "table"]
text_elements = [e for e in categorized_elements if e.type == "text"]
print(len(table_elements))
print(len(text_elements))

0
388


In [7]:
!pip install -U langchain_ollama

Collecting langchain_ollama
  Downloading langchain_ollama-0.2.3-py3-none-any.whl.metadata (1.9 kB)
Collecting ollama<1,>=0.4.4 (from langchain_ollama)
  Downloading ollama-0.4.7-py3-none-any.whl.metadata (4.7 kB)
Downloading langchain_ollama-0.2.3-py3-none-any.whl (19 kB)
Downloading ollama-0.4.7-py3-none-any.whl (13 kB)
Installing collected packages: ollama, langchain_ollama
Successfully installed langchain_ollama-0.2.3 ollama-0.4.7


In [None]:
#Download ollama
!curl -fsSL https://ollama.com/install.sh | sh
import subprocess
process = subprocess.Popen("ollama serve", shell=True) #runs on a different thread
#Download model
!ollama pull llama2
!pip install ollama
import ollama


In [9]:
from langchain_community.llms import Ollama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

In [10]:
# Prompt: a single instruction that covers both table and text chunks
prompt_text = (
    "You are an assistant tasked with summarizing tables and text. "
    "Give a concise summary of the table or text. Table or text chunk: {element} "
)
prompt = ChatPromptTemplate.from_template(prompt_text)

# Summary chain: expose the raw input as {"element": ...}, fill in the prompt,
# run the model, and pass the result through StrOutputParser
model = Ollama(model="llama2")
element_mapper = {"element": lambda x: x}
summarize_chain = element_mapper | prompt | model | StrOutputParser()

  model = Ollama(model="llama2")


In [11]:
# Summarize every non-empty text chunk, with max_concurrency set to 5
texts = [chunk.text for chunk in text_elements if chunk.text != ""]
text_summaries = summarize_chain.batch(texts, config={"max_concurrency": 5})


In [12]:
# Summarize every table chunk, with max_concurrency set to 5
tables = [chunk.text for chunk in table_elements]
table_summaries = summarize_chain.batch(tables, config={"max_concurrency": 5})

In [13]:
# Pull the LLaVA model with Ollama; "llava" is the model name requested by the
# image-description payload later in this notebook.
!ollama pull llava

[?2026h[?25l[1Gpulling manifest ⠋ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠙ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠹ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest [K
pulling 170370233dd5...   0% ▕                ▏    0 B/4.1 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling 170370233dd5...   0% ▕                ▏    0 B/4.1 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling 170370233dd5...   0% ▕                ▏  17 MB/4.1 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling 170370233dd5...   1% ▕                ▏  48 MB/4.1 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling 170370233dd5...   2% ▕                ▏  81 MB/4.1 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling 170370233dd5...   4% ▕                ▏ 145 MB/4.1 GB                  [K[?25h[?2026l[?2026h[?25

In [14]:
import os
import base64
import json
import requests
from pathlib import Path

# Directory scanned for images to describe; the .txt descriptions are written here too.
# NOTE(review): presumably where partition_pdf saved the extracted figures — confirm.
IMG_DIR = "/kaggle/working/figures/"

# Create a function to process an image with LLaVA
def process_image_with_llava(
    image_path,
    prompt=None,
    model="llava",
    url="http://localhost:11434/api/generate",
    timeout=None,
):
    """Ask an Ollama-served vision model to describe a single image.

    The image is read from disk, base64-encoded, and posted to the Ollama
    generate endpoint as a non-streaming request.

    Args:
        image_path: Path of the image file to describe.
        prompt: Instruction sent with the image. Defaults to the
            diabetes-oriented description prompt used by this notebook.
        model: Name of the Ollama model to query.
        url: Ollama generate endpoint.
        timeout: Seconds to wait for the HTTP request; None waits indefinitely.

    Returns:
        The model's description on success. Failures are never raised: a
        missing file, a non-200 response, or any exception is reported as a
        human-readable message string instead.
    """
    if prompt is None:
        prompt = "Describe the image in detail. If it contains graphs, charts, or medical illustrations, explain what they show regarding diabetes."

    # Check if the image exists
    if not os.path.exists(image_path):
        return f"Image not found: {image_path}"

    try:
        # Read and encode the image to base64
        with open(image_path, "rb") as img_file:
            base64_image = base64.b64encode(img_file.read()).decode("utf-8")

        # "stream": False asks for the whole answer in one JSON response
        payload = {
            "model": model,
            "prompt": prompt,
            "images": [base64_image],
            "stream": False,
        }

        # Make the API call to Ollama
        response = requests.post(url, json=payload, timeout=timeout)

        # Process the response
        if response.status_code == 200:
            return response.json().get("response", "No description generated")
        return f"Error: {response.status_code}, {response.text}"

    except Exception as e:
        # Best-effort by design: report the problem as text so a batch run keeps going
        return f"Error processing image: {str(e)}"

# Process all images in the directory
def process_all_images():
    """Describe every image in IMG_DIR and save each description next to its image.

    For each .png/.jpg/.jpeg file (extension matched case-insensitively) the
    description returned by process_image_with_llava is written to a .txt file
    with the same base name in IMG_DIR.

    Returns:
        dict mapping image file name -> description, in sorted file-name order.
    """
    # Create the directory if it doesn't exist
    Path(IMG_DIR).mkdir(parents=True, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort so runs are reproducible
    image_files = sorted(
        name
        for name in os.listdir(IMG_DIR)
        if name.lower().endswith((".png", ".jpg", ".jpeg"))
    )

    results = {}

    for img_file in image_files:
        img_path = os.path.join(IMG_DIR, img_file)
        output_file = os.path.join(IMG_DIR, os.path.splitext(img_file)[0] + ".txt")

        # Process the image
        description = process_image_with_llava(img_path)

        # Save the description; explicit UTF-8 so model output with non-ASCII
        # characters is written the same way regardless of the locale
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(description)

        # Store in results dictionary
        results[img_file] = description
        print(f"Processed {img_file}")

    return results

# Run the full image pass
image_descriptions = process_all_images()

# Summary: the total count, then a short preview of every description
print(f"Processed {len(image_descriptions)} images")
for img, desc in image_descriptions.items():
    print(f"\n--- {img} ---")
    # Show at most the first 100 characters, marking a cut with "..."
    preview = desc if len(desc) <= 100 else desc[:100] + "..."
    print(preview)

Processed figure-93-99.jpg
Processed figure-128-356.jpg
Processed figure-108-184.jpg
Processed figure-108-126.jpg
Processed figure-272-541.jpg
Processed figure-180-434.jpg
Processed figure-288-550.jpg
Processed figure-73-83.jpg
Processed figure-108-322.jpg
Processed figure-108-315.jpg
Processed figure-64-72.jpg
Processed figure-108-314.jpg
Processed figure-203-468.jpg
Processed figure-198-459.jpg
Processed figure-24-15.jpg
Processed figure-137-370.jpg
Processed figure-108-222.jpg
Processed figure-110-341.jpg
Processed figure-77-86.jpg
Processed figure-108-289.jpg
Processed figure-108-252.jpg
Processed figure-108-156.jpg
Processed figure-62-65.jpg
Processed figure-117-345.jpg
Processed figure-108-312.jpg
Processed figure-194-447.jpg
Processed figure-294-555.jpg
Processed figure-108-220.jpg
Processed figure-108-129.jpg
Processed figure-108-185.jpg
Processed figure-108-225.jpg
Processed figure-62-66.jpg
Processed figure-144-383.jpg
Processed figure-162-410.jpg
Processed figure-108-187.jpg

In [15]:
import uuid
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document

# Storage layer for the parent documents, and the metadata key passed to the retriever as id_key
id_key = "doc_id"
store = InMemoryStore()

# Embedding model for the vectorstore
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# The vectorstore to use to index the child chunks
vectorstore = Chroma(collection_name="summaries", embedding_function=embeddings)

# The retriever (empty to start)
retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key=id_key)

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [18]:
# Add texts: each summary goes into the vectorstore, and the original chunk goes
# into the docstore under the same id (stored in the summary's metadata).
doc_ids = [str(uuid.uuid4()) for _ in texts]
summary_texts = [
    Document(page_content=s, metadata={id_key: doc_ids[i]})
    for i, s in enumerate(text_summaries)
]
retriever.vectorstore.add_documents(summary_texts)
retriever.docstore.mset(list(zip(doc_ids, texts)))

# Add tables. Guarded so that an empty batch (this document yielded no tables)
# is never sent to the vectorstore.
table_ids = [str(uuid.uuid4()) for _ in tables]
summary_tables = [
    Document(page_content=s, metadata={id_key: table_ids[i]})
    for i, s in enumerate(table_summaries)
]
if summary_tables:
    retriever.vectorstore.add_documents(summary_tables)
    retriever.docstore.mset(list(zip(table_ids, tables)))

# Add images. image_descriptions maps file name -> description; iterating the
# dict itself yields the file names, so take the values explicitly to index the
# descriptions rather than the names.
image_summaries = list(image_descriptions.values())
img_ids = [str(uuid.uuid4()) for _ in image_summaries]
summary_img = [
    Document(page_content=s, metadata={id_key: img_ids[i]})
    for i, s in enumerate(image_summaries)
]
if summary_img:
    retriever.vectorstore.add_documents(summary_img)
    retriever.docstore.mset(list(zip(img_ids, image_summaries)))

In [None]:
from langchain_core.runnables import RunnablePassthrough

# Prompt template: the answer must come only from the supplied context
template = (
    "Answer the question based only on the following context, which can include text and tables:\n"
    "{context}\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)

# Option 1: LLM
model = Ollama(model="llama2")

# RAG pipeline: the retriever supplies the context, the question is passed
# through unchanged, then prompt -> model -> StrOutputParser
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = rag_inputs | prompt | model | StrOutputParser()

In [23]:
# Example query run through the RAG chain; the cell displays the returned answer
question = (
    "How do the World Health Organization (WHO) and the American Diabetes Association (ADA) "
    "define diabetes based on fasting plasma glucose (FPG) levels?"
)
chain.invoke(question)

"The World Health Organization (WHO) and the American Diabetes Association (ADA) define diabetes based on fasting plasma glucose (FPG) levels as follows:\n\n* For adults, a FPG level ≥126 mg/dL (7.0 mmol/L) is considered diagnostic of diabetes.\n* For children and adolescents, a FPG level ≥100 mg/dL (5.6 mmol/L) is considered diagnostic of diabetes.\n\nIt's worth noting that these criteria are based on the WHO and ADA's consensus guidelines, but other organizations may have slightly different criteria. Additionally, these criteria do not necessarily detect diabetes in the same individuals, as there can be variability in plasma glucose levels over time and between different ethnic groups."