<a target="_blank" href="https://colab.research.google.com/github/VectorInstitute/fed-rag/blob/main/docs/notebooks/rag_benchmarking_hf_mmlu.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

_(NOTE: If running on Colab, you will need to supply a WandB API key in addition to your Hugging Face token. You will also need to change the runtime type to a T4 GPU.)_

In [1]:
# Install the bitsandbytes package with uv; -q keeps the installer output quiet.
# NOTE(review): bitsandbytes is not imported anywhere in this notebook; presumably
# the downloaded RAG script needs it. Confirm.
!uv pip install bitsandbytes -q

In [1]:
# Install the `docker` package (imported as `docker` in the knowledge-store cell
# to pull the image and manage the Qdrant container); -q keeps the output quiet.
!uv pip install docker -q

# Basic Federated Fine-tuning of RAG Systems

## Knowledge Store

In [1]:
import docker
import os
import time

# Connect to the Docker daemon configured in the current environment.
client = docker.from_env()
image_name = "vectorinstitute/qdrant-atlas-dec-wiki-2021:latest"
container_name = "tiny-wiki-dec2021-ks"

# first see if we need to pull the docker image
try:
    client.images.get(image_name)
    print(f"Image '{image_name}' already exists locally")
except docker.errors.ImageNotFound:
    print(f"Image '{image_name}' not found locally. Pulling...")
    # Pull with progress information
    for line in client.api.pull(image_name, stream=True, decode=True):
        if "progress" in line:
            print(f"\r{line['status']}: {line['progress']}", end="")
        elif "status" in line:
            print(f"\r{line['status']}", end="")
    print("\nPull complete!")

# The container is created with a fixed name, and Docker rejects a second
# container with the same name. If an earlier run left one behind (for example
# because the cleanup cell was never executed), remove it so this cell can be
# re-run from a fresh kernel.
try:
    stale_container = client.containers.get(container_name)
except docker.errors.NotFound:
    pass
else:
    print(f"Removing existing container '{container_name}'...")
    stale_container.remove(force=True)

# run the Qdrant container
container = client.containers.run(
    image_name,  # same image that was checked/pulled above
    detach=True,  # -d flag
    name=container_name,  # --name
    ports={"6333/tcp": 6333, "6334/tcp": 6334},  # -p 6333:6333  # -p 6334:6334
    volumes={
        "qdrant_data": {  # -v qdrant_data:/qdrant_storage
            "bind": "/qdrant_storage",
            "mode": "rw",
        }
    },
    environment={"SAMPLE_SIZE": "tiny"},  # -e SAMPLE_SIZE=tiny
    device_requests=[
        docker.types.DeviceRequest(
            count=-1, capabilities=[["gpu"]]
        )  # --gpus all
    ],
    remove=False,  # Don't auto-remove when stopped
)

print(f"Container started with ID: {container.id}")

# wait a moment for the container to initialize
time.sleep(3)

# Check container status
container.reload()  # Refresh container data
print(f"Container status: {container.status}")
print("Container logs:")
print(container.logs().decode("utf-8"))

Image 'vectorinstitute/qdrant-atlas-dec-wiki-2021:latest' already exists locally
Container started with ID: f8e1721013ed18774e2c89629ac2b3d7f7c3ec6eab6ed48f73b99ee7308a1bc1
Container status: running
Container logs:
Starting Qdrant Atlas Knowledge Store container
Running database initialization check...
Using tiny sample mode...
Creating tiny sample file for testing...
Using tiny sample file: tiny-sample.jsonl
Verifying sample file creation...
✅ Sample file successfully created at: /app/data/atlas/enwiki-dec2021/tiny-sample.jsonl
File details:
-rw-r--r-- 1 root root 6785 Jun  8 01:32 /app/data/atlas/enwiki-dec2021/tiny-sample.jsonl
File content (first 3 lines):
{"id": "140", "title": "History of marine biology", "section": "James Cook", "text": " James Cook is well known for his voyages of exploration for the British Navy in which he mapped out a significant amount of the world's uncharted waters. Cook's explorations took him around the world twice and led to countless descriptions of p

In [2]:
# `container.status` is read from attributes cached on the Python object, so
# refresh them from the Docker daemon first; otherwise this cell keeps
# reporting whatever status was cached at the last reload.
container.reload()
print(f"Container status: {container.status}")

Container status: running


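Once the container is running, you can browse the knowledge store's collections in the Qdrant dashboard: http://localhost:6333/dashboard#/collections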

### Load the script to build the RAG System

In [3]:
import time

# Raw gist URL with the current Unix timestamp (whole seconds) appended as the
# `fresh` query parameter, so the URL differs from one run to the next.
GIST_URL = (
    "https://gist.githubusercontent.com/nerdai/33e8445ab8b96783f34a7e0464e0b0f0/raw"
    + "?fresh="
    + str(int(time.time()))
)

In [4]:
import requests

# Download the RAG script source from the gist.
# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of letting an
# error page be saved and executed as the script in later cells.
# NOTE(review): this text is later written to disk and run as Python; consider
# pinning the gist to a specific revision so the executed code cannot change.
response = requests.get(GIST_URL, timeout=30)
response.raise_for_status()
rag_code = response.text

In [5]:
from IPython.display import Code, display

# Render the downloaded script as Python-highlighted source for inspection.
rag_code_view = Code(rag_code, language="python")
display(rag_code_view)

### Federated Learning

In [6]:
import time

In [7]:
# write gist to script
# Pin UTF-8 explicitly: without `encoding`, open() uses the platform's default
# encoding, which can fail or garble non-ASCII characters in the source on
# systems whose default is not UTF-8.
with open("rag_federated_learning.py", "w", encoding="utf-8") as script_file:
    script_file.write(rag_code)

In [8]:
from fed_rag.utils.notebook import ProcessMonitor

# Single ProcessMonitor instance shared by the cells below, which use it to
# start, watch, and stop the "server", "client_0", and "client_1" processes.
monitor = ProcessMonitor()

In [9]:
# Shell command that runs the downloaded script as the federated-learning server.
server_command = "python rag_federated_learning.py --component server"

# Template for a client's shell command; fill it with `.format(client_id=...)`.
# The same id is exported as CUDA_VISIBLE_DEVICES and used in the
# `client_<id>` component name.
client_command = (
    "export CUDA_VISIBLE_DEVICES={client_id} && "
    "python rag_federated_learning.py --component client_{client_id}"
)

In [10]:
# Launch the server before any client is started.
monitor.start_process("server", server_command)

# Fixed pause (in seconds) so the server has time to stand up.
# NOTE(review): this is a plain delay, not a readiness check.
SERVER_STARTUP_DELAY_SECONDS = 2
time.sleep(SERVER_STARTUP_DELAY_SECONDS)

✅ Started server (PID: 66185)


In [11]:
# Launch one client process per client id, in order. Each is registered under
# the name "client_<id>" and runs the command template filled in with that id.
for client_id in ("0", "1"):
    monitor.start_process(
        f"client_{client_id}",
        client_command.format(client_id=client_id),
    )

✅ Started client_0 (PID: 66214)
✅ Started client_1 (PID: 66217)


In [12]:
# Show the live monitor view for the server and both clients.
process_names = ["server", "client_0", "client_1"]
monitor.monitor_live(process_names)

🖥️  PROCESS MONITOR

server 🔴 STOPPED
------------------------------
[21:33:11] [92mINFO [0m:      Evaluation returned no results (`None`)
[21:33:11] [92mINFO [0m:
[21:33:11] [92mINFO [0m:      [ROUND 1]
[21:33:13] [92mINFO [0m:      configure_fit: strategy sampled 2 clients (out of 2)
[21:34:04] [92mINFO [0m:      aggregate_fit: received 2 results and 0 failures
[21:34:09] [92mINFO [0m:      configure_evaluate: strategy sampled 2 clients (out of 2)
[21:34:24] [92mINFO [0m:      aggregate_evaluate: received 2 results and 0 failures
[21:34:24] [92mINFO [0m:
[21:34:24] [92mINFO [0m:      [SUMMARY]
[21:34:24] [92mINFO [0m:      Run finished 1 round(s) in 73.33s
[21:34:24] [92mINFO [0m:      	History (loss, distributed):
[21:34:24] [92mINFO [0m:      		round 1: 0.41999998688697815
[21:34:24] [92mINFO [0m:

client_0 🔴 STOPPED
------------------------------
[21:33:55] 
[21:33:55] 
[21:33:55] 100%|██████████| 3/3 [00:26<00:00,  2.29s/it]
[21:33:55] 100%|██████████| 3

In [13]:
# Stop every process started through the monitor (the server and both clients).
monitor.stop_all()

🛑 Stopped server
🛑 Stopped client_0
🛑 Stopped client_1
🛑 All processes stopped


### Cleanup

In [14]:
# stop and remove container
# Tolerate a container that no longer exists (for example when this cell is run
# a second time) so that cleanup is safe to repeat.
# NOTE(review): the named volume "qdrant_data" is not removed here.
try:
    container.stop()
    container.remove()
except docker.errors.NotFound:
    print("Container already removed; nothing to clean up.")