In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# RAG 基本實作
<td style="text-align: center">
  <a href="https://colab.research.google.com/github/TWCkaijin/GDGC-Gemini-bootcamp/blob/main/RAG_%E5%9F%BA%E6%9C%AC%E5%AF%A6%E4%BD%9C.ipynb">
    <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Run in Colab
  </a>
</td>        

| | |
|-|-|
|原作者 | [Lavi Nigam](https://github.com/lavinigam-gcp) |

DIY version of RAG [**building_DIY_multimodal_qa_system_with_mRAG.ipynb**](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/qa-ops/building_DIY_multimodal_qa_system_with_mRAG.ipynb)

## 概述
檢索增強生成（Retrieval Augmented Generation, RAG）已成為讓大型語言模型（LLMs）存取外部數據的重要範式，同時也作為一種機制，能有效減少幻覺現象（hallucinations）。

在此筆記本中，您將學習如何進行多模態 RAG，透過結合文字與圖片的財務文件進行問答。

## Gemini
- Gemini 是 Google DeepMind 開發的一系列生成式 AI 模型，專為多模態應用設計。
- Gemini API 提供對 Gemini 系列模型的存取；本筆記本使用 Gemini 1.5 Pro 模型。

## 比較文字型與多模態 RAG
多模態 RAG 相較於文字型 RAG 具有以下幾項優勢：

1. 增強的知識存取能力：
多模態 RAG 能處理文字與視覺資訊，為大型語言模型提供更豐富、更全面的知識基礎。

2. 改進的推理能力：
藉由整合視覺線索，多模態 RAG 能在不同數據模態之間進行更明智的推斷。

此筆記本將教您如何在 Vertex AI 中，結合 Gemini API、文字嵌入 和 多模態嵌入，構建文件搜尋引擎。

透過實際操作範例，您將學會如何為文件來源建構一個多媒體豐富的中繼資料庫，實現跨多元資訊流的搜尋、比較與推理功能。

## 單元目標
本筆記本逐步提供建構文件搜尋引擎的指南，透過多模態檢索增強生成（RAG）實現以下功能：

- 提取並儲存包含文字的文件中繼資料(metadata)，並為文件生成嵌入向量
- 使用文字查詢搜尋中繼資料(metadata)，並向LLM提供檢索結果並回答問題

## 工具:
請注意，專案必須要啟動以下GCP API服務:
- Vertex AI

## 基本設定

### 下載安裝 Vertex AI SDK for Python 及其他相依套件

In [1]:
%pip install --upgrade --user google-cloud-aiplatform pymupdf rich colorama

Collecting google-cloud-aiplatform
  Downloading google_cloud_aiplatform-1.76.0-py2.py3-none-any.whl.metadata (31 kB)
Collecting pymupdf
  Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading google_cloud_aiplatform-1.76.0-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: pymupdf, colorama, google-cloud-aiplatform
[0mSuccessfully installed colorama-0.4.6 google-cloud-aiplatform-1.76.0 pymupdf-1.25.1


### 重啟以確保套件成功載入

In [2]:
# The packages installed above only become importable after the kernel restarts.
import IPython

# Ask the running kernel to shut down with restart=True; the reply dict is shown
# as the cell output.
IPython.Application.instance().kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. ⚠️</b>
</div>


### 驗證Colab 環境(僅Colab需要)

In [1]:
import sys

# Google Colab requires an explicit sign-in before Google Cloud can be used;
# in any other environment this cell does nothing.
running_in_colab = "google.colab" in sys.modules

if running_in_colab:
    # Imported only inside the guard, exactly when the Colab module is loaded.
    from google.colab import auth

    # Authenticate the current user to Google Cloud.
    auth.authenticate_user()

### 初始化 GCP 專案設定
請注意，專案必須要啟動以下API服務:
- Vertex AI API

<https://console.cloud.google.com><br>

以下輸入您的專案ID以及欲使用的伺服器區域。

In [2]:
# Define project information

import sys

PROJECT_ID = "side-projcet-placeholder"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Outside Colab, prefer the project configured in the local gcloud CLI. This is
# best-effort: if gcloud is not installed, exits with an error, or prints no
# project, the value entered above is kept instead of crashing or becoming empty.
if "google.colab" not in sys.modules:
    import subprocess

    try:
        detected_project_id = subprocess.check_output(
            ["gcloud", "config", "get-value", "project"], text=True
        ).strip()
    except (OSError, subprocess.CalledProcessError):
        # OSError covers a missing gcloud executable; CalledProcessError covers
        # a non-zero exit status.
        detected_project_id = ""

    if detected_project_id:
        PROJECT_ID = detected_project_id

print(f"Your project ID is: {PROJECT_ID}")

Your project ID is: side-projcet-placeholder


### 初始化 Vertex AI

In [3]:
import sys

import vertexai

# Initialize the Vertex AI SDK with the project ID and region defined in the
# project-settings cell.
vertexai.init(location=LOCATION, project=PROJECT_ID)

### 導入函式庫

In [4]:
from vertexai.generative_models import GenerationConfig, GenerativeModel, Image

### 定義及載入 Gemini 1.5 Pro 模型（文字與多模態共用同一模型）

In [5]:
# Both handles use the same model name; they are created as two separate
# GenerativeModel objects.
GEMINI_MODEL_NAME = "gemini-1.5-pro"

text_model = GenerativeModel(GEMINI_MODEL_NAME)
multimodal_model = GenerativeModel(GEMINI_MODEL_NAME)

### 下載示範用檔案以及額外外掛套件

原作者:
You can view the code for the utils here: (`intro_multimodal_rag_utils.py`) directly on [GitHub](https://storage.googleapis.com/github-repo/rag/intro_multimodal_rag/intro_multimodal_rag_old_version/intro_multimodal_rag_utils.py).

In [6]:
# download documents and images used in this notebook
!gsutil -m rsync -r gs://github-repo/rag/intro_multimodal_rag/intro_multimodal_rag_old_version .
!mkdir pdf
!curl -o ./pdf/example.pdf https://raw.githubusercontent.com/TWCkaijin/GDGC-Gemini-bootcamp/main/RAG/examplefile.pdf
print("Download completed")


both the source and destination. Your crcmod installation isn't using the
module's C extension, so checksumming will run very slowly. If this is your
first rsync since updating gsutil, this rsync can take significantly longer than
usual. For help installing the extension, please see "gsutil help crcmod".

Building synchronization state...
Starting synchronization...
Copying gs://github-repo/rag/intro_multimodal_rag/intro_multimodal_rag_old_version/data/google-10k-sample-part1.pdf...
Copying gs://github-repo/rag/intro_multimodal_rag/intro_multimodal_rag_old_version/class_a_share.png...
Copying gs://github-repo/rag/intro_multimodal_rag/intro_multimodal_rag_old_version/intro_multimodal_rag_utils.py...
Copying gs://github-repo/rag/intro_multimodal_rag/intro_multimodal_rag_old_version/tac_table_revenue.png...
Copying gs://github-repo/rag/intro_multimodal_rag/intro_multimodal_rag_old_version/data/google-10k-sample-part2.pdf...
- [5/5 files][882.3 KiB/882.3 KiB] 100% Done                    

## 建立包含文字的 metadata

### 範例資料內容

在此筆記本中，我們將使用的原始數據為一篇「虛構角色的生平故事」

單一、獨立且無出現在網路上的內容，適合進行多模態檢索增強生成（RAG）的實驗與學習。


### 匯入輔助函數以建立metadata
在構建多模態 RAG 系統之前，需先準備文件中所有文字與圖片的metadata。為了便於引用與參考，metadata應包含關鍵元素，例如頁碼、檔案名稱等。

接下來，您將從這些metadata中生成嵌入向量，這些嵌入向量是執行相似性搜尋時的必要條件。

In [7]:
from intro_multimodal_rag_utils import get_document_metadata

### 從文件中提取並儲存文字和圖片的metadata

我們剛剛匯入了一個名為 ```get_document_metadata()``` 的函數。這個函數會從文件中提取文字和圖片的中繼資料，並返回兩個資料框，分別為 ```text_metadata``` 和 ```image_metadata``` 。如果你想了解更多有關 ```get_document_metadata()``` 函數如何使用 Gemini 和嵌入模型實現的細節，你可以直接查看[source code](https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/use-cases/retrieval-augmented-generation/utils/intro_multimodal_rag_utils.py)。

提取並儲存文字和圖片metadata的原因在於，僅使用其中一個資料類型不足以得出相關的答案。例如，相關答案可能以視覺形式存在於文件中，但文字型 RAG 無法考慮到視覺圖像。你稍後會在這本筆記本中探索這個範例。








在下一步，我們將使用這個函數來提取並儲存文件中文字和圖片的metadata。請注意，以下的程式區塊可能需要幾分鐘才能完成執行。

注意事項：

目前的實現最適用於以下情況：

* 文件中包含文字和圖片的組合。
* 文件中的表格以圖片形式呈現。
* 文件中的圖片不需要太多上下文信息。<br>
(但為了方便，本範例檔案僅含有文字)

另外，
* 你也可以使用常規的 RAG 方法。可以參考這份檔案 [RAG_text_only](https://github.com/kevin6449/LANGCHAIN_RAG/blob/main/LangChain_RAG.ipynb)
* 如果文件包含額外特定領域的知識，可以將這些信息傳遞至下方的提示語中。

<div class="alert alert-block alert-warning">
<b>⚠️ 不要傳送超過50頁的資料，你可能會遇到流量限制 ⚠️</b><br>
⚠️ 如果你遇到了其他形式的流量限制，請在下方程式區塊中將 add_sleep_after_page 設為 True，並調整 sleep_time_after_page ⚠️
</div>

In [8]:
# Folder whose PDF files are handed to get_document_metadata.
pdf_folder_path = "pdf/"

# Prompt passed to get_document_metadata as image_description_prompt.
image_description_prompt = (
    "Explain what is going on in the image.\n"
    "If it's a table, extract all elements of the table.\n"
    "If it's a graph, explain the findings in the graph.\n"
    "Do not include any numbers that are not mentioned in the image.\n"
)

# Extract text and image metadata from the PDF document(s); the helper returns
# one DataFrame for text and one for images.
text_metadata_df, image_metadata_df = get_document_metadata(
    multimodal_model,  # the Gemini 1.5 Pro model defined earlier
    pdf_folder_path,
    image_description_prompt=image_description_prompt,
    image_save_dir="images",
    # NOTE(review): presumably the image-embedding dimension - confirm that 2048
    # is a size the embedding model accepts before indexing PDFs with images.
    embedding_size=2048,
    # Sleep between pages to avoid API quota issues; set to False to disable.
    add_sleep_after_page=True,
    sleep_time_after_page=1,  # reported as seconds in the helper's log output
    # generation_config and safety_settings are not passed here.
)

print("\n\n --- Completed processing. ---")



 Processing the file: --------------------------------- pdf/example.pdf 


Processing page: 1
Sleeping for  1  sec before processing the next page to avoid quota issues. You can disable it: "add_sleep_after_page = False"  
Processing page: 2
Sleeping for  1  sec before processing the next page to avoid quota issues. You can disable it: "add_sleep_after_page = False"  
Processing page: 3
Sleeping for  1  sec before processing the next page to avoid quota issues. You can disable it: "add_sleep_after_page = False"  
Processing page: 4
Sleeping for  1  sec before processing the next page to avoid quota issues. You can disable it: "add_sleep_after_page = False"  


 --- Completed processing. ---


#### 檢視處理過的文字metadata
以下的程式區塊將產生一個metadata表格，描述不同部分的文字metadata，包括：

- text: 來自頁面的原始文字
- text_embedding_page: 頁面上原始文字的嵌入向量
- chunk_text: 將原始文字分割成較小的區塊
- chunk_number: 每個文字區塊的索引
- text_embedding_chunk: 每個文字區塊的嵌入向量







In [9]:
# Preview the first five rows of the extracted text metadata.
text_metadata_df.head(n=5)

Unnamed: 0,file_name,page_num,text,text_embedding_page,chunk_number,chunk_text,text_embedding_chunk
0,example.pdf,1,\nThe life of Korvin Duskblade was anything b...,"[-0.037203099578619, -0.0019172376487404108, -...",1,\nThe life of Korvin Duskblade was anything b...,"[-0.0470905639231205, -0.009154440835118294, -..."
1,example.pdf,1,\nThe life of Korvin Duskblade was anything b...,"[-0.037203099578619, -0.0019172376487404108, -...",2,e of these woodland escapades that \nKorvins l...,"[-0.0537051223218441, -0.01801256090402603, -0..."
2,example.pdf,2,"\nadept tracker, his shadow melding seamlessl...","[-0.0338033102452755, -0.021380770951509476, -...",1,"\nadept tracker, his shadow melding seamlessl...","[-0.019811248406767845, -0.021229537203907967,..."
3,example.pdf,2,"\nadept tracker, his shadow melding seamlessl...","[-0.0338033102452755, -0.021380770951509476, -...",2,"l smile, Virela \nunleashed a torrent of eldri...","[-0.04810592532157898, -0.022631913423538208, ..."
4,example.pdf,3,"\nredemption for her past, and Kael, a rogue ...","[-0.024566229432821274, -0.02623281069099903, ...",1,"\nredemption for her past, and Kael, a rogue ...","[-0.037951819598674774, -0.022933535277843475,..."


### 匯入輔助函數以實現 RAG
你將匯入以下函數，這些函數將在本筆記本的其餘部分中用來實現 RAG：

- ```get_similar_text_from_query()```：根據文字查詢，使用餘弦相似度算法從文件中找出相關的文字。此函數使用中繼資料中的文字嵌入向量來計算結果，並可以根據最高分數、頁碼/區塊號或嵌入向量大小來過濾結果。
- ```print_text_to_text_citation()```：印出從 ```get_similar_text_from_query()``` 函數檢索到的文字來源（引用）和細節。
- ```get_similar_image_from_query()```：根據圖片路徑或圖片，從文件中找出相關的圖片。此函數使用中繼資料中的圖片嵌入向量。
- ```print_text_to_image_citation()```：印出從 ```get_similar_image_from_query()``` 函數檢索到的圖片來源（引用）和細節。
- ```get_gemini_response()```：與 Gemini 模型互動，基於文字和圖片輸入的結合來回答問題。
- ```display_images()```：顯示一系列圖片，這些圖片可以是路徑或 PIL 圖片對象。

In [10]:
from intro_multimodal_rag_utils import (
    display_images,
    get_gemini_response,
    get_similar_image_from_query,
    get_similar_text_from_query,
    print_text_to_image_citation,
    print_text_to_text_citation,
)

#### 文字搜尋
讓我們從一個簡單的問題開始，看看使用文字嵌入向量的簡單文字搜尋是否能夠回答這個問題。

In [19]:
# Question for the text-search demo; the second sentence asks the model to
# point to the passage its answer comes from.
query = (
    "what is the name of the main character's mother? "
    "Where do you get this conclusion?"
)

### 使用文字搜尋來尋找相關資料

In [20]:
# Compare the query against the chunk-level embeddings stored in
# text_metadata_df and request the 7 best-matching chunks.
matching_results_text = get_similar_text_from_query(
    query,
    text_metadata_df,
    top_n=7,
    chunk_text=True,
    column_name="text_embedding_chunk",
)

# Print the citation details for the matched chunks.
print_text_to_text_citation(
    matching_results_text,
    chunk_text=True,
    print_top=False,
)

[91mCitation 1: Matched text: 
[0m
[94mscore: [0m 0.41
[94mfile_name: [0m example.pdf
[94mpage_number: [0m 1
[94mchunk_number: [0m 1
[94mchunk_text: [0m  
The life of Korvin Duskblade was anything but ordinary. 
Born under the twin moons of the mystical realm of 
Aetheria, he was the only child of Miralia, a skilled 
herbalist, and Erydan, a renowned blacksmith who crafted 
weapons said to hum with an inner magic. Korvins arrival 
into the world was marked by an eerie aurora streaking 
across the skies, an omen the village elders interpreted 
with hushed tones. 
Korvins childhood was steeped in wonder and 
peculiarities. He had an uncanny affinity with shadows, a 
trait that earned him the nickname Shadows Whisper 
among the villagers. By the time he could walk, Korvin 
would often be found playing in the woods, communing 
with creatures that others claimed were no more than 
figments of a childs imagination. Wolves seemed to follow 
him like loyal companions, their golden 

我們可以發現，第一個高分結果已經包含了我們需要的資訊（主角的母親是 Miralia）。請注意：如果答案是以圖片形式存在於文件中的表格，純文字搜尋就很可能會錯過這些資訊，除非同時處理圖片及其數據；本範例文件僅含文字，因此不受此限制。

接下來，讓我們將相關的文字區塊輸入 Gemini 1.5 Pro 模型（`text_model`），看看它是否能在考慮文件中所有區塊的情況下給出我們想要的答案。這就像是基本的文字型 RAG 實現。

In [21]:

print("\n **** Result: ***** \n")

# Join the text of every retrieved chunk into a single context string.
context = "\n".join(
    match["chunk_text"] for match in matching_results_text.values()
)

# Prompt: answer only from the retrieved context, or say it is not available.
instruction = (
    "Answer the question with the given context.\n"
    'If the information is not available in the context, return "not available in the context" and explain the situation.\n'
    f"Question: {query}\n"
    f"Context: {context}\n"
    "Answer:\n"
)

# Prepare the model input
model_input = instruction

# Request a streamed Gemini response and print what the helper returns.
answer = get_gemini_response(
    text_model,  # the gemini-1.5-pro model defined earlier as text_model
    model_input=model_input,
    generation_config=GenerationConfig(temperature=0.2),
    stream=True,
)
print(answer)


 **** Result: ***** 

The main character's mother's name is **Miralia**. 

This information is found in the first paragraph: "Born under the twin moons of the mystical realm of Aetheria, he was the only child of **Miralia**, a skilled herbalist, and Erydan, a renowned blacksmith who crafted weapons said to hum with an inner magic." 



接下來，讓我們來問問更多關於主角的故事吧~~

In [None]:
# Interactive question loop over the indexed document.
# Enter a blank line, "exit" or "quit" (or interrupt the cell) to stop; the
# original loop had no exit path at all.
EXIT_COMMANDS = {"", "exit", "quit"}

while True:
    try:
        query = input("Question (blank line, 'exit' or 'quit' to stop): ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the cell was interrupted: leave the loop cleanly.
        break

    if query.lower() in EXIT_COMMANDS:
        break

    # Matching user text query with "chunk_embedding" to find relevant chunks.
    matching_results_text = get_similar_text_from_query(
        query,
        text_metadata_df,
        column_name="text_embedding_chunk",
        top_n=7,
        chunk_text=True,
    )

    # Uncomment to print the matched text citations for each question.
    # print_text_to_text_citation(matching_results_text, print_top=False, chunk_text=True)

    print("\n **** Result: ***** \n")

    # All relevant text chunks found across documents based on the user query.
    context = "\n".join(
        value["chunk_text"] for value in matching_results_text.values()
    )

    # Built from separate lines so the loop's indentation is not sent to the
    # model as part of the prompt.
    instruction = (
        "Answer the question with the given context.\n"
        'If the information is not available in the context, return "not available in the context" and explain the situation.\n'
        f"Question: {query}\n"
        f"Context: {context}\n"
        "Answer:\n"
    )

    # Prepare the model input
    model_input = instruction

    # Generate the Gemini response (requested as a stream) and print it.
    print(
        get_gemini_response(
            text_model,  # the gemini-1.5-pro model defined earlier as text_model
            model_input=model_input,
            stream=True,
            generation_config=GenerationConfig(temperature=0.2),
        )
    )
