In [52]:
from transformers import CLIPProcessor, CLIPModel
import torch
from chromadb import PersistentClient
from tqdm import tqdm
import fitz  # PyMuPDF
from PIL import Image
import sys
import os
print(os.getcwd())


sys.path.append(os.path.abspath(os.path.join('..')))


c:\Users\Jasmine Tay Hui Ping\Documents\Psycore\scripts


In [43]:
# Load CLIP model and processor
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [44]:
# Connect to the persistent ChromaDB
chroma_client = PersistentClient(path="db")
collection = chroma_client.get_or_create_collection(name="multimodal_store")

In [54]:
img_folder = "images"
doc_folder = "documents_for_testing"


print("Contents of the current directory:", os.listdir())

print(f"Images folder exists: {os.path.exists(img_folder)}")
print(f"Documents folder exists: {os.path.exists(doc_folder)}")



for img_file in tqdm(os.listdir(img_folder), desc="Indexing Images"):
    if not img_file.lower().endswith((".png", ".jpg", ".jpeg")):
        continue
    img_path = os.path.join(img_folder, img_file)
    image = Image.open(img_path).convert("RGB")
    inputs = clip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        embedding = clip_model.get_image_features(**inputs)
    embedding = embedding[0] / embedding.norm()
    collection.add(
        documents=[f"Image file: {img_file}"],
        embeddings=[embedding.tolist()],
        ids=[f"img_{img_file}"]
    )

# === Index PDFs ===
for pdf_file in tqdm(os.listdir(doc_folder), desc="Indexing PDFs"):
    if not pdf_file.lower().endswith(".pdf"):
        continue
    pdf_path = os.path.join(doc_folder, pdf_file)
    doc = fitz.open(pdf_path)
    text = " ".join(page.get_text() for page in doc)
    doc.close()

    if text.strip():
        inputs = clip_processor(text=[text], return_tensors="pt", padding=True, truncation=True, max_length=77)
        with torch.no_grad():
            embedding = clip_model.get_text_features(**inputs)
        embedding = embedding[0] / embedding.norm()
        collection.add(
            documents=[text[:1000]],
            embeddings=[embedding.tolist()],
            ids=[f"pdf_{pdf_file}"]
        )

print("Indexing complete.")

# Fetch all indexed items from ChromaDB
all_items = collection.get()

# Print the total count
print(f"\nTotal indexed items: {len(all_items['ids'])}")

# Print each item (document and ID)
for doc, id_ in zip(all_items['documents'], all_items['ids']):
    print(f"\nID: {id_}")
    print(f"Content: {doc[:300]}...")  # Truncate long content for readability

Contents of the current directory: ['db', 'demo_q_learning.py', 'demo_rag_kg.py', 'discord_notification.sh', 'documents_for_testing', 'images', 'main.py', 'query.ipynb', 'query.py', 'test_json_schema_support.py', '__pycache__']
Images folder exists: True
Documents folder exists: True


Indexing Images:   0%|          | 0/2 [00:00<?, ?it/s]

Indexing Images: 100%|██████████| 2/2 [00:00<00:00,  2.39it/s]
Indexing PDFs: 100%|██████████| 2/2 [00:01<00:00,  1.15it/s]

Indexing complete.

Total indexed items: 4

ID: img_cat.jpg
Content: Image file: cat.jpg...

ID: img_dogs.jpeg
Content: Image file: dogs.jpeg...

ID: pdf_22-036458-01_GIS_early_process_evaluation_Accessible_CLIENT_USE.pdf
Content:  
 
Gigabit 
Infrastructure 
Subsidy (GIS) 
Intervention 
Early process evaluation 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22-0364...

ID: pdf_BDUK_Annual_Reports-Accounts_2024_-_Certified_copy.pdf
Content: Annual Report and Accounts 
2023-2024
HC 323
Building Digital UK 
Building
Digital UK
 Annual Report and Accounts 2023 to 2024 | Building Digital UK
 Annual Report and Accounts 2023 to 2024 | Building Digital UK
Presented to the House of Commons pursuant to section 7 of the 
Government Resources and...





In [10]:
# Get user query
query = "Hello Jasmine"

In [11]:
# Embed the query
inputs = clip_processor(text=[query], return_tensors="pt", padding=True, truncation=True, max_length=77)
with torch.no_grad():
    query_embedding = clip_model.get_text_features(**inputs)
query_embedding = query_embedding[0] / query_embedding.norm()

query_embedding

tensor([-3.3657e-03, -1.4127e-02, -9.7075e-03,  5.7889e-02,  1.5637e-02,
        -1.0584e-02, -5.4909e-03, -1.3811e-01, -2.0771e-02,  1.1895e-02,
        -2.9114e-02, -1.8233e-02, -2.8203e-02,  5.3149e-02,  2.2930e-02,
        -2.1280e-02,  1.3709e-02, -1.8011e-02, -6.6757e-04,  3.5672e-03,
         4.0002e-02, -4.7696e-02,  3.9097e-02,  1.7009e-02, -8.4180e-02,
        -1.3216e-02,  4.2456e-03,  2.5778e-02, -7.4154e-03,  3.6100e-02,
         2.1913e-02, -1.1805e-02,  9.2785e-03, -1.1871e-02,  4.2770e-02,
        -2.0641e-02,  3.4177e-02, -5.0439e-03,  1.3250e-02,  2.7521e-02,
         5.3514e-04, -8.5792e-03, -3.4634e-02,  5.8357e-03,  1.4178e-02,
         1.6608e-02, -1.6230e-02, -5.0984e-03, -1.1375e-02, -2.3730e-02,
        -9.6504e-03, -9.0730e-03, -1.1786e-02, -7.8243e-03, -2.6922e-02,
        -5.6955e-03, -1.2667e-02,  3.1070e-02,  1.7074e-02,  1.4132e-02,
         2.0926e-02, -3.4158e-03,  1.9259e-02,  1.1928e-02,  2.6480e-02,
        -3.5493e-02,  2.1126e-02, -2.3991e-02, -3.1

In [12]:
# Query the collection
results = collection.query(query_embeddings=[query_embedding.tolist()], n_results=5)

results

{'ids': [[]],
 'embeddings': None,
 'documents': [[]],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[]],
 'distances': [[]]}

In [9]:
# Print results
for doc, id_ in zip(results['documents'][0], results['ids'][0]):
    print(f"\nID: {id_}")
    print(f"Content: {doc}")