In [12]:
from transformers import CLIPProcessor, CLIPModel
import torch
from chromadb import HttpClient
from tqdm import tqdm
import fitz  # PyMuPDF
from PIL import Image
import sys
import os
import numpy as np
print(os.getcwd())


sys.path.append(os.path.abspath(os.path.join('..')))


c:\Users\Jasmine Tay Hui Ping\Documents\Psycore\scripts


In [2]:
# Load CLIP model and processor
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [3]:
# Connect to the persistent ChromaDB
chroma_client = HttpClient(host="13.42.151.24", port=8000)
collection = chroma_client.get_or_create_collection(name="multimodal_store")

In [4]:
img_folder = "images"
doc_folder = "documents_for_testing"


print("Contents of the current directory:", os.listdir())

print(f"Images folder exists: {os.path.exists(img_folder)}")
print(f"Documents folder exists: {os.path.exists(doc_folder)}")



for img_file in tqdm(os.listdir(img_folder), desc="Indexing Images"):
    if not img_file.lower().endswith((".png", ".jpg", ".jpeg")):
        continue
    img_path = os.path.join(img_folder, img_file)
    image = Image.open(img_path).convert("RGB")
    inputs = clip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        embedding = clip_model.get_image_features(**inputs)
    embedding = embedding[0] / embedding.norm()
    collection.add(
        documents=[f"Image file: {img_file}"],
        embeddings=[embedding.tolist()],
        ids=[f"img_{img_file}"]
    )

# === Index PDFs ===
for pdf_file in tqdm(os.listdir(doc_folder), desc="Indexing PDFs"):
    if not pdf_file.lower().endswith(".pdf"):
        continue
    pdf_path = os.path.join(doc_folder, pdf_file)
    doc = fitz.open(pdf_path)
    text = " ".join(page.get_text() for page in doc)
    doc.close()

    if text.strip():
        inputs = clip_processor(text=[text], return_tensors="pt", padding=True, truncation=True, max_length=77)
        with torch.no_grad():
            embedding = clip_model.get_text_features(**inputs)
        embedding = embedding[0] / embedding.norm()
        collection.add(
            documents=[text[:1000]],
            embeddings=[embedding.tolist()],
            ids=[f"pdf_{pdf_file}"]
        )

print("Indexing complete.")

# Fetch all indexed items from ChromaDB
all_items = collection.get()

# Print the total count
print(f"\nTotal indexed items: {len(all_items['ids'])}")

# Print each item (document and ID)
for doc, id_ in zip(all_items['documents'], all_items['ids']):
    print(f"\nID: {id_}")
    print(f"Content: {doc[:300]}...")  # Truncate long content for readability

Contents of the current directory: ['db', 'demo_q_learning.py', 'demo_rag_kg.py', 'discord_notification.sh', 'documents_for_testing', 'images', 'main.py', 'query copy.ipynb', 'query.ipynb', 'query.py', 'test_json_schema_support.py', '__pycache__']
Images folder exists: True
Documents folder exists: True


Indexing Images: 100%|██████████| 2/2 [00:00<00:00,  3.73it/s]
Indexing PDFs: 100%|██████████| 2/2 [00:02<00:00,  1.18s/it]

Indexing complete.

Total indexed items: 4

ID: img_cat.jpg
Content: Image file: cat.jpg...

ID: img_dogs.jpeg
Content: Image file: dogs.jpeg...

ID: pdf_22-036458-01_GIS_early_process_evaluation_Accessible_CLIENT_USE.pdf
Content:  
 
Gigabit 
Infrastructure 
Subsidy (GIS) 
Intervention 
Early process evaluation 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22-0364...

ID: pdf_BDUK_Annual_Reports-Accounts_2024_-_Certified_copy.pdf
Content: Annual Report and Accounts 
2023-2024
HC 323
Building Digital UK 
Building
Digital UK
 Annual Report and Accounts 2023 to 2024 | Building Digital UK
 Annual Report and Accounts 2023 to 2024 | Building Digital UK
Presented to the House of Commons pursuant to section 7 of the 
Government Resources and...





In [5]:
# Get user query
query = "Hello Jasmine"

In [14]:
# Embed the query
inputs = clip_processor(text=[query], return_tensors="pt", padding=True, truncation=True, max_length=77)
with torch.no_grad():
    query_embedding = clip_model.get_text_features(**inputs)
query_embedding = query_embedding[0] / query_embedding.norm()

np.array(query_embedding).size

  np.array(query_embedding).size


512

In [7]:
# Query the collection
results = collection.query(query_embeddings=[query_embedding.tolist()], n_results=5)

results

{'ids': [['pdf_BDUK_Annual_Reports-Accounts_2024_-_Certified_copy.pdf',
   'pdf_22-036458-01_GIS_early_process_evaluation_Accessible_CLIENT_USE.pdf',
   'img_cat.jpg',
   'img_dogs.jpeg']],
 'distances': [[0.8149319887161255,
   1.0620193481445312,
   1.5697784423828125,
   1.5729628801345825]],
 'embeddings': None,
 'metadatas': [[None, None, None, None]],
 'documents': [['Annual Report and Accounts \n2023-2024\nHC 323\nBuilding Digital UK \nBuilding\nDigital UK\n Annual Report and Accounts 2023 to 2024 | Building Digital UK\n Annual Report and Accounts 2023 to 2024 | Building Digital UK\nPresented to the House of Commons pursuant to section 7 of the \nGovernment Resources and Accounts Act 2000\nOrdered by the House of Commons to be printed on 12 December 2024 \nHC 323\nFor the period 1 April 2023 to 31 March 2024\nAnnual Report and Accounts \n2023-2024\nBuilding Digital UK \n Annual Report and Accounts 2023 to 2024 | Building Digital UK\n2\n© Crown copyright 2024\nThis publication is

In [8]:
# Print results
for doc, id_ in zip(results['documents'][0], results['ids'][0]):
    print(f"\nID: {id_}")
    print(f"Content: {doc}")


ID: pdf_BDUK_Annual_Reports-Accounts_2024_-_Certified_copy.pdf
Content: Annual Report and Accounts 
2023-2024
HC 323
Building Digital UK 
Building
Digital UK
 Annual Report and Accounts 2023 to 2024 | Building Digital UK
 Annual Report and Accounts 2023 to 2024 | Building Digital UK
Presented to the House of Commons pursuant to section 7 of the 
Government Resources and Accounts Act 2000
Ordered by the House of Commons to be printed on 12 December 2024 
HC 323
For the period 1 April 2023 to 31 March 2024
Annual Report and Accounts 
2023-2024
Building Digital UK 
 Annual Report and Accounts 2023 to 2024 | Building Digital UK
2
© Crown copyright 2024
This publication is licensed under the terms of the Open Government Licence v3.0 except where 
otherwise stated. To view this licence, visit nationalarchives.gov.uk/doc/open-government-licence/
version/3.
Where we have identified any third party copyright information you will need to obtain permission from 
the copyright holders concerned.
