In [76]:
import os
from bs4 import BeautifulSoup, NavigableString
from llama_index.multi_modal_llms.gemini import GeminiMultiModal 
from llama_index.embeddings.gemini import GeminiEmbedding 
from llama_index.llms.gemini import Gemini  
import tiktoken
import seaborn as sns
import requests
from PIL import Image
import matplotlib.pyplot as plt
from io import BytesIO

In [None]:
# Keep a key that is already configured in the environment; otherwise prompt
# for it without echoing, so no real key is ever written into the notebook.
# The literal placeholder "GOOGLE_API_KEY" is treated as "not configured".
from getpass import getpass

if os.environ.get("GOOGLE_API_KEY", "") in ("", "GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Google API key: ")

In [78]:
from llama_index.core import Settings

# Create the Gemini embedding client and register it as the LlamaIndex-wide
# default in one statement; `embed_model` stays bound for later cells.
embed_model = Settings.embed_model = GeminiEmbedding(
    api_key=os.environ["GOOGLE_API_KEY"],
    model_name="models/embedding-001",
)

  embed_model = GeminiEmbedding(


In [None]:
# Neo4j connection settings. They are read from the environment so credentials
# never have to be typed into (or saved with) the notebook; each value falls
# back to the empty-string placeholder when its variable is not set.
NEO4J_URI = os.environ.get("NEO4J_URI", "")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "")

In [80]:
from llama_index.core import StorageContext, Document
from llama_index.core.schema import ImageDocument
from llama_index.core.node_parser import SimpleNodeParser

In [81]:
# Local folder holding the exported HTML articles.
# NOTE(review): no visible cell reads this variable (the directory walk uses
# os.getcwd()) - confirm whether it is still needed.
file_path = (
    "/Users/priyanka./Documents/"
    "Multimodal RAG pipeline with LlamaIndex and Neo4j/articles"
)

In [82]:
def process_html_file(file_path):
    """Split an exported HTML article into header-delimited text sections and images.

    The article body is the ``<section data-field="body" class="e-content">``
    element. Every ``h1``-``h4`` inside it starts a new section; anything that
    appears before the first header is ignored.

    Args:
        file_path: Path of the HTML file to parse (read as UTF-8).

    Returns:
        A ``(images, sections)`` tuple. ``images`` is a list of
        ``ImageDocument`` objects, one per ``<img>`` with a ``src`` attribute
        found after the first header. ``sections`` is a list of dicts with the
        keys ``"header"``, ``"content"`` and ``"source"`` (the file's base
        name); sections whose content is blank are dropped. Both lists are
        empty when the body section is not present in the file.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    # Find the required section
    content_section = soup.find("section", {"data-field": "body", "class": "e-content"})

    if not content_section:
        # Keep the return shape stable: callers unpack two lists, so a bare
        # message string here would make the unpacking fail.
        return [], []

    header_tags = ["h1", "h2", "h3", "h4"]
    text_tags = ["p", "span", "a"]
    # find_all() below also yields nested descendants. An element sitting
    # inside one of these tags has already had its text captured by that
    # ancestor's get_text(), so adding it again would duplicate content.
    capturing_ancestors = header_tags + text_tags + ["pre"]

    source = os.path.basename(file_path)
    sections = []
    images = []
    current_section = None  # stays None until the first header is seen

    for element in content_section.find_all(recursive=True):
        if element.name in header_tags:
            # Close the previous section, dropping it when it has no text.
            if current_section is not None and current_section["content"].strip():
                sections.append(current_section)
            current_section = {
                "header": element.get_text(),
                "content": "",
                "source": source,
            }
        elif current_section is None:
            continue
        elif element.name == "img":
            img_src = element.get("src")
            if img_src:  # an <img> without src has nothing to load
                images.append(ImageDocument(image_url=img_src))
        elif element.name == "pre" or element.name in text_tags:
            if element.find_parent(capturing_ancestors) is not None:
                continue
            text = element.get_text().strip()
            if element.name == "pre":
                current_section["content"] += f"```{text}```\n"
            else:
                current_section["content"] += text + "\n"

    if current_section is not None and current_section["content"].strip():
        sections.append(current_section)

    return images, sections

In [83]:
all_documents = []
all_images = []

# Directory to search in (current working directory)
directory = os.getcwd()

# Walk the directory tree and parse every HTML file found.
for root, dirs, files in os.walk(directory):
    for file in files:
        if not file.endswith(".html"):
            continue
        parsed = process_html_file(os.path.join(root, file))
        if isinstance(parsed, str):
            # Guard against a non-tuple result (for example an error message
            # string): unpacking it below would raise and abort the walk.
            print(f"Skipping {file}: {parsed}")
            continue
        images, documents = parsed
        all_documents.extend(documents)
        all_images.extend(images)

# Build the Documents without mutating the section dicts, so `all_documents`
# keeps its "content" entries and this statement can be re-run safely.
text_docs = [
    Document(
        text=section["content"],
        metadata={key: value for key, value in section.items() if key != "content"},
    )
    for section in all_documents
]
print(f"Text document count: {len(text_docs)}") 
print(f"Image document count: {len(all_images)}") 

Text document count: 252
Image document count: 328


In [138]:
# Multimodal Gemini client, authenticated with the key from the environment.
gemini_llm = GeminiMultiModal(
    api_key=os.environ["GOOGLE_API_KEY"],
    model_name="models/gemini-pro-vision",
)

# Gemini embedding client; later cells pass it to the Neo4j vector stores.
gemini_embedding = GeminiEmbedding(
    api_key=os.environ["GOOGLE_API_KEY"],
    model_name="models/embedding-001",
)

  gemini_llm = GeminiMultiModal(
  gemini_embedding = GeminiEmbedding(


In [84]:

node_parser = SimpleNodeParser.from_defaults()

# Run the same parser over both document collections: text first, then images.
text_nodes, image_nodes = map(
    node_parser.get_nodes_from_documents, (text_docs, all_images)
)

print(f"Created {len(text_nodes)} text nodes and {len(image_nodes)} image nodes")

Created 326 text nodes and 328 image nodes


In [85]:
from llama_index.vector_stores.neo4jvector import Neo4jVectorStore

In [86]:
# Vector store for text chunks. 768 is the dimension configured for the Gemini
# "models/embedding-001" text embeddings used in this notebook.
# NOTE(review): `embed_model` is not among the documented Neo4jVectorStore
# parameters - confirm the installed version accepts (or ignores) it.
text_store = Neo4jVectorStore(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name="text_collection",
    node_label="Chunk",
    embedding_dimension=768,
    embed_model=gemini_embedding
)

# Vector store for images. The multimodal index is built without an explicit
# image_embed_model, so image vectors come from the library's default image
# embedder, not from Gemini. That default is assumed to be CLIP ViT-B/32
# (512-dimensional) - TODO confirm against the installed llama-index version
# and adjust if image_embed_model is overridden.
# An "image_collection" index previously created with another dimension has
# to be dropped before this cell is re-run.
image_store = Neo4jVectorStore(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name="image_collection",
    node_label="Image",
    embedding_dimension=512,
    embed_model=gemini_embedding
)

# The storage context carries the text store; the image store is handed to the
# index separately.
storage_context = StorageContext.from_defaults(vector_store=text_store)

In [87]:
from llama_index.core.indices.multi_modal import MultiModalVectorStoreIndex

In [None]:
# Build the multimodal index from the text and image documents together. The
# storage context supplies the text vector store; images use `image_store`.
index = MultiModalVectorStoreIndex.from_documents(
    [*text_docs, *all_images],
    image_vector_store=image_store,
    storage_context=storage_context,
)

In [None]:
from llama_index.core.prompts import PromptTemplate
from llama_index.core.query_engine import SimpleMultiModalQueryEngine

# Configure ALL components to use Google (to avoid OpenAI defaults)
# NOTE(review): "gemini-pro-vision" is a vision model id used here as the
# default text LLM - confirm the installed Gemini class accepts this id.
Settings.llm = Gemini(model="gemini-pro-vision")
Settings.embed_model = GeminiEmbedding(model_name="models/embedding-001")

# Use Gemini for multimodal answers. The keyword is `model_name`, matching the
# other GeminiMultiModal construction in this notebook; the output cap is
# passed as `max_tokens`.
# NOTE(review): "models/gemini-1.5-pro" must be in the supported-model list of
# the installed llama-index-multi-modal-llms-gemini release - confirm.
gemini_mm_llm = GeminiMultiModal(
    model_name="models/gemini-1.5-pro",
    max_tokens=1500,
)

# Prompt that restricts the answer to the retrieved context.
qa_tmpl_str = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "answer the query.\n"
    "Query: {query_str}\n"
    "Answer: "
)

qa_tmpl = PromptTemplate(qa_tmpl_str)

# Create query engine
query_engine = SimpleMultiModalQueryEngine(
    retriever=index.as_retriever(),
    multi_modal_llm=gemini_mm_llm,
    text_qa_template=qa_tmpl
)

query_str = "How do vector RAG application work?"
response = query_engine.query(query_str)
print(response)

In [None]:
def plot_images(image_urls, max_images=3):
    """Download images and show up to ``max_images`` of them in a single row.

    Args:
        image_urls: Iterable of image URLs, fetched one by one with
            ``requests``.
        max_images: Maximum number of images to draw; also the number of
            columns in the one-row layout. Defaults to 3.

    URLs that cannot be downloaded or decoded are reported with ``print`` and
    skipped; they do not count towards ``max_images``. Nothing is returned.
    """
    fig = plt.figure(figsize=(25, 15))
    images_shown = 0

    for img_url in image_urls:
        # Stop before fetching anything once the row is full, so the subplot
        # index never exceeds the number of columns.
        if images_shown >= max_images:
            break

        try:
            resp = requests.get(img_url, timeout=30)
            resp.raise_for_status()  # Raise an error for bad status codes
            image = Image.open(BytesIO(resp.content))
            image.load()  # decode now so corrupt data is reported here
        except Exception as e:
            # Best effort: one bad URL should not prevent the others from showing.
            print(f"Error loading image {img_url}: {e}")
            continue

        # Drawing happens outside the try block so that layout errors are not
        # misreported as download failures.
        ax = fig.add_subplot(1, max_images, images_shown + 1)
        ax.imshow(image)
        ax.set_xticks([])
        ax.set_yticks([])
        images_shown += 1

# Show the images attached to the query response's retrieved image nodes.
plot_images(
    [scored.node.image_url for scored in response.metadata["image_nodes"]]
)