# RAG using DeepSeek-R1


<img src="./resources/rag_architecture.png" width=800px>

In [2]:
import nest_asyncio
from dotenv import load_dotenv
from IPython.display import Markdown, display

from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.core import PromptTemplate
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, ServiceContext, SimpleDirectoryReader

In [3]:
from PIL import Image
try:
    with Image.open('screenshots_output/screenshot_0993.png') as img:
        img.verify()  # 验证图片文件
    print("文件为有效的 PNG 格式")
except Exception as e:
    print("文件不是有效的 PNG 格式:", e)


文件为有效的 PNG 格式


In [4]:
# allows nested access to the event loop
nest_asyncio.apply()

In [5]:
# add your documents in this directory, you can drag & drop
input_dir_path = 'screenshots_output'

In [6]:
import os

dir_path = "screenshots_output"
file_list = os.listdir(dir_path)
print("目录中的文件列表：")
for file in file_list:
    print(file)


目录中的文件列表：
screenshot_0136.png
screenshot_0272.png
screenshot_0574.png
screenshot_0737.png
screenshot_0205.png
screenshot_0342.png
screenshot_0593.png
screenshot_0060.png
screenshot_0092.png
screenshot_0338.png
screenshot_0628.png
screenshot_0343.png
screenshot_0751.png
screenshot_0534.png
screenshot_0103.png
screenshot_0886.png
screenshot_0153.png
screenshot_0263.png
screenshot_0734.png
screenshot_0384.png
screenshot_0414.png
screenshot_0674.png
screenshot_0847.png
screenshot_0188.png
screenshot_0619.png
screenshot_0980.png
screenshot_0872.png
screenshot_0763.png
screenshot_0954.png
screenshot_0626.png
screenshot_0183.png
screenshot_0541.png
screenshot_0883.png
screenshot_0602.png
screenshot_0567.png
screenshot_0680.png
screenshot_0456.png
screenshot_0337.png
screenshot_0650.png
screenshot_0495.png
screenshot_0874.png
screenshot_0578.png
screenshot_0108.png
screenshot_0515.png
screenshot_0335.png
screenshot_0197.png
screenshot_0433.png
screenshot_0486.png
screenshot_0504.png
screenshot

In [7]:

# setup llm & embedding model
llm=Ollama(model="deepseek-r1:7b", request_timeout=120.0)
# embed_model = HuggingFaceEmbedding( model_name="Snowflake/snowflake-arctic-embed-m", trust_remote_code=True)
embed_model = HuggingFaceEmbedding( model_name="BAAI/bge-large-en-v1.5", trust_remote_code=True)

In [8]:
import nest_asyncio
import json
import os
from dotenv import load_dotenv
from IPython.display import Markdown, display

# llama_index 相关导入
from llama_index.core import Settings, PromptTemplate
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, ServiceContext, SimpleDirectoryReader

# 允许在 notebook 中嵌套 asyncio 事件循环
nest_asyncio.apply()

# 加载环境变量（如果有需要）
load_dotenv()

# 请确保你已经定义了 llm 与 embed_model，例如：
# llm = Ollama(model="your-ollama-model-name")
# embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)

# ====== 加载图片数据 ======
input_dir_path = "screenshots_output"  # 目录中仅包含需要的 PNG 文件
loader = SimpleDirectoryReader(
    input_dir=input_dir_path,
    required_exts=[".png"],
    recursive=True
)
docs = loader.load_data()
# 对文档进行排序，确保顺序固定（例如按文件路径排序）
docs = sorted(docs, key=lambda d: d.doc_id)

# ====== 建立向量索引 ======
index = VectorStoreIndex.from_documents(docs, service_context=service_context, show_progress=True)

# ====== 创建查询引擎，并自定义 Prompt 模板 ======
query_engine = index.as_query_engine()

qa_prompt_tmpl_str = (
    "You are given a set of image file names and additional context about their content. "
    "Each image contains action annotations at the bottom describing its content. "
    "Generate a concise description that summarizes the overall content of these images. "
    "Output a JSON object with keys \"group_files\" and \"description\"."
)
qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)
query_engine.update_prompts({"response_synthesizer:text_qa_template": qa_prompt_tmpl})

# ====== 每 5 张图片生成一个描述 ======
group_descriptions = []
group_size = 5

for i in range(0, len(docs), group_size):
    group = docs[i:i+group_size]
    # 使用 doc.doc_id 提取文件名，并用 os.path.basename 保留文件名部分
    file_list = [os.path.basename(doc.doc_id) for doc in group]
    file_list_str = ", ".join(file_list)
    
    # 构造 query_text，将文件名列表和额外上下文信息传递给 LLM
    query_text = (
        f"The following are image file names: {file_list_str}. "
        "Each image contains action annotations at the bottom describing its content. "
        "Generate a concise description that summarizes the overall content of these images. "
        "Output a JSON object with keys \"group_files\" and \"description\"."
    )
    
    response = query_engine.query(query_text)
    try:
        result = json.loads(response)
    except Exception as e:
        print("Error parsing LLM response for group:", e)
        print("Response was:", response)
        result = {"group_files": file_list, "description": "N/A"}
    
    # 如果最后一组不足 5 张，补充默认值
    while len(file_list) < group_size:
        file_list.append("N/A")
    result["group_files"] = file_list
    
    group_descriptions.append(result)

# ====== 将结果写入 JSON 文件 ======
output_file = "group_descriptions.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(group_descriptions, f, ensure_ascii=False, indent=2)

print(f"已生成 JSON 文件：{output_file}")


  service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)


Parsing nodes:   0%|          | 0/1000 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Error parsing LLM response for group: the JSON object must be str, bytes or bytearray, not Response
Response was: <think>
Alright, I've got this query where someone is giving me a set of image file names along with some context about their content. The task is to generate a concise description summarizing all the images and output it as a JSON object with specific keys: "group_files" for the filenames grouped together, and "description" for the summary.

Hmm, first I need to figure out what exactly they're looking for. They mentioned that each image has action annotations at the bottom, so those are probably keywords or short descriptions indicating what's in each image. The goal is to group these files by their content, maybe similar themes or actions, and then create a unified description.

Let me think about how I can approach this. Maybe start by parsing each file name and cross-referencing it with the provided context. For example, if an image has "jumping" in its annotations, tha

KeyboardInterrupt: 

In [None]:
# load data
loader = SimpleDirectoryReader(
            input_dir = input_dir_path,
            required_exts=[".png"],
            recursive=True
        )
docs = loader.load_data()

# Creating an index over loaded data
Settings.embed_model = embed_model
index = VectorStoreIndex.from_documents(docs, show_progress=True)

# Create the query engine, where we use a cohere reranker on the fetched nodes
Settings.llm = llm
query_engine = index.as_query_engine()

# ====== Customise prompt template ======
qa_prompt_tmpl_str = (
"Context information is below.\n"
"---------------------\n"
"{context_str}\n"
"---------------------\n"
"Given the context information above I want you to think step by step to answer the query in a crisp manner, incase case you don't know the answer say 'I don't know!'.\n"
"Query: {query_str}\n"
"Answer: "
)
qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)

query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
)

# Generate the response
response = query_engine.query("This is the dataset, it has the frame, action, we need you to see this and genetate the description for 3 frames and 3 actions. The freme are the scenes but now it changes to the np arrary",)

Parsing nodes:   0%|          | 0/998 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/998 [00:00<?, ?it/s]

In [None]:
display(Markdown(str(response)))

<think>
Okay, so I'm trying to figure out how to approach this user's query. They provided some context with two file paths pointing to screenshots and mentioned a dataset that has 'frame' and 'action' categories. Now they want me to generate descriptions for three frames and three actions based on these files.

First, I need to understand what exactly the user is asking for. The term "dataset" here probably refers to image data where each entry includes a frame (which is like a scene) and an action label. They have two screenshots as examples, but they want me to expand this into three frames and three actions using NumPy arrays.

I remember that in machine learning, especially with models like YOLOv5 for object detection, you often preprocess images by converting them into NumPy arrays. So each frame would be an image converted into a NumPy array of shape (height, width, channels). Since the user provided two screenshots, I'll assume they are already processed and stored as such.

Next, each action is probably associated with one or more frames. The user wants three actions, so each action will need to have corresponding frame data in their dataset structure. That means for each of the three actions, there should be at least one NumPy array representing a frame where that action occurs.

I'm thinking about how to represent this dataset efficiently. It might make sense to create an array where each row corresponds to a frame and includes its associated action(s). However, since each action could have multiple frames, I might need a different structure, maybe a dictionary or list of lists where each sublist contains the frame arrays for a particular action.

I should also consider whether the user wants all three actions to have exactly one frame each or if they can share frames. Given that it's a dataset, it's likely more efficient if some frames are shared among multiple actions. So I'll structure the dataset so that each of the three actions has their own frame(s), and some frames might be part of multiple action sets.

Now, considering NumPy arrays, they need to have consistent shapes if they're going to be used as inputs to a model. Each frame should have the same dimensions (height, width, channels). So I'll make sure that all three actions have frames with identical sizes for uniformity.

Putting it all together, my dataset would look something like this:

- Three actions: maybe 'Jumping', 'Running', 'Dribbling' as examples.
- Each action has one or more frame arrays. For simplicity, each action will start with a single frame array, but I can note that additional frames could be added.

I should also provide some documentation on how to use this dataset if needed. The user might want to load these NumPy arrays into their model for training or inference.

In summary, my approach is:
1. Identify the number of actions (three) and assign each an initial frame array.
2. Ensure all frames have consistent dimensions suitable for the model (e.g., YOLOv5).
3. Structure the dataset in a way that allows for easy access to each action's corresponding frames.
4. Consider adding more frames to each action as needed or possible.

I think this covers the user's requirements. I'll proceed with creating a NumPy array structure reflecting these steps and provide a brief explanation on how it can be used.
</think>

To address your request, we will create a dataset structure for three actions (`action_0`, `action_1`, `action_2`) each associated with one or more frames (scenes). Each frame is represented as a NumPy array. Here's the structured dataset:

```python
import numpy as np

# Example dataset structure: action_label -> list of frame arrays
dataset = {
    'action_0': [
        np.array([[...]])  # Frame 1 for action_0 (scene 1)
    ],
    'action_1': [
        np.array([[...]])  # Frame 2 for action_1 (scene 2)
    ],
    'action_2': [
        np.array([[...]])  # Frame 3 for action_2 (scene 3)
    ]
}
```

### Notes:
- Each NumPy array represents a scene converted to a format suitable for your model (e.g., YOLOv5).
- You can add more frames to each action as needed.

This structure allows easy access to the frames for training, inference, or further processing. Let me know if you need additional details!

### ❗️❗️ Make sure you clear GPU memory by clicking on Restart button above, if you want to use Streamlit from here

In [None]:
# check GPU usage

!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Wed Sep 25 19:39:10 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.06             Driver Version: 535.183.06   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:35:00.0 Off |                    0 |
| N/A   44C    P0              31W /  72W |  19223MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
llm = Ollama(model="deepseek-r1:7b", request_timeout=120.0)

In [None]:
resp = llm.complete("What is capital of France?")
resp

ConnectError: [Errno 111] Connection refused

In [None]:
print(resp.text)

<think>

</think>

The capital of France is Paris.
