In [1]:
import pandas as pd
import uuid
import torch
import clip
from qdrant_client import QdrantClient
from dotenv import load_dotenv
import os
from tqdm import tqdm
import json
from qdrant_client.models import PointStruct
from PIL import Image

In [16]:
import random

SONU_SOOD_FAKE_TEMPLATES = [
    "Sonu Sood accused of eating COVID relief money during campaign in {state}.",
    "Report claims Sonu Sood was given {amount} crore but distributed only {given} crore to people.",
    "Allegations surface that Sonu Sood kept most of the COVID donation funds for himself.",
    "Social media claims Sonu Sood misused relief funds and gave only {given} crore out of {amount} crore.",
    "Viral post alleges Sonu Sood pocketed COVID charity money meant for migrants.",
    "Controversy erupts as claims say Sonu Sood received {amount} crore but helped people with only {given} crore.",
    "Fake report accuses Sonu Sood of corruption in COVID relief campaign.",
    "Posts online claim Sonu Sood ate up relief money and cheated the public.",
    "Unverified news says Sonu Sood kept most of the donation money during COVID crisis.",
    "Misleading article claims Sonu Sood scammed donors during pandemic relief work."
]

AMOUNTS = ["19", "25", "30", "15", "22"]
GIVEN_AMOUNTS = ["2", "3", "4", "5", "6"]

STATES = [
    "Maharashtra", "Delhi", "Uttar Pradesh", "Bihar", "Karnataka", "Rajasthan"
]


def generate_sonu_sood_fake_text():
    template = random.choice(SONU_SOOD_FAKE_TEMPLATES)
    amount = random.choice(AMOUNTS)
    given = random.choice(GIVEN_AMOUNTS)
    state = random.choice(STATES)

    text = template.format(
        amount=amount,
        given=given,
        state=state
    )

    return text


In [17]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
# model.eval()

In [18]:
load_dotenv()

QDRANT_KEY = os.getenv("QDRANT_KEY")
QDRANT_URL = os.getenv("QDRANT_URL")

client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_KEY
)

COLLECTION_NAME = "GNOSIS"


In [19]:
def load_image_safe(path):
    try:
        img = Image.open(path).convert("RGB")
        return img
    except Exception as e:
        print("‚ùå Failed to load:", path, e)
        return None


In [20]:
IMAGE_ROOT = r"D:/STUDY/PROJECTS/GNOSIS/Resources/Images"

In [21]:
BATCH_SIZE = 32
UPLOAD_BATCH = 256

image_batch = []
meta_batch = []
points_buffer = []


In [22]:
IMAGE_ROOT = r"D:/STUDY/PROJECTS/GNOSIS/Resources/Images/Sonu Sood"
BATCH_SIZE = 32
UPLOAD_BATCH = 256

image_batch = []
meta_batch = []
points_buffer = []

for fname in tqdm(os.listdir(IMAGE_ROOT), desc="Processing FIR REAL images"):

    path = os.path.join(IMAGE_ROOT, fname)

    img = load_image_safe(path)
    if img is None:
        continue

    image_batch.append(preprocess(img))

    meta_batch.append({
        "modality": "image",
        "label": "real",
        "source_path": path,
        "filename": fname,
        "domain": "political",              # FIR / law & order is political-social domain
        "dataset": "FIR_India_Demo",
        "text": generate_sonu_sood_fake_text()         # üî• THIS IS THE MAGIC
    })


    # =========================
    # When batch full ‚Üí embed
    # =========================
    if len(image_batch) >= BATCH_SIZE:

        images_tensor = torch.stack(image_batch).to(device)

        with torch.no_grad():
            vecs = model.encode_image(images_tensor).cpu().numpy()

        # Build points
        for vec, meta in zip(vecs, meta_batch):
            point = PointStruct(
                id=str(uuid.uuid4()),
                vector={
                    "vision": vec.tolist()
                },
                payload=meta
            )
            points_buffer.append(point)

        image_batch = []
        meta_batch = []

        # =========================
        # Upload chunk
        # =========================
        if len(points_buffer) >= UPLOAD_BATCH:
            client.upsert(collection_name=COLLECTION_NAME, points=points_buffer)
            print("‚¨ÜÔ∏è Uploaded", len(points_buffer), "image points")
            points_buffer = []


# =========================
# Upload remaining leftovers
# =========================
if image_batch:
    images_tensor = torch.stack(image_batch).to(device)
    with torch.no_grad():
        vecs = model.encode_image(images_tensor).cpu().numpy()

    for vec, meta in zip(vecs, meta_batch):
        point = PointStruct(
            id=str(uuid.uuid4()),
            vector={"vision": vec.tolist()},
            payload=meta
        )
        points_buffer.append(point)

if points_buffer:
    client.upsert(collection_name=COLLECTION_NAME, points=points_buffer)
    print("‚¨ÜÔ∏è Final upload:", len(points_buffer), "image points")


Processing FIR REAL images: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:00<00:00, 22.11it/s]


‚¨ÜÔ∏è Final upload: 5 image points


In [23]:
if image_batch:
    images_tensor = torch.stack(image_batch).to(device)

    with torch.no_grad():
        vecs = model.encode_image(images_tensor).cpu().numpy()

    for vec, meta in zip(vecs, meta_batch):
        point = PointStruct(
            id=str(uuid.uuid4()),
            vector={
                "vision": vec.tolist()
            },
            payload=meta
        )
        points_buffer.append(point)

if points_buffer:
    client.upsert(collection_name=COLLECTION_NAME, points=points_buffer)
    print("‚¨ÜÔ∏è Final upload:", len(points_buffer))

print("‚úÖ Image ingestion complete")


‚¨ÜÔ∏è Final upload: 10
‚úÖ Image ingestion complete


In [28]:
QUERY_TEXT = "indian boy got arrested due to"

# Encode text using CLIP
tokens = clip.tokenize([QUERY_TEXT], truncate=True).to(device)

with torch.no_grad():
    qvec = model.encode_text(tokens).cpu().numpy()[0]

# Search in image vectors
result = client.query_points(
    collection_name=COLLECTION_NAME,
    query=qvec.tolist(),
    using="vision",      # üî• searching inside image vectors
    limit=5,
    with_payload=True
)

print("\nüîç Top matching images for text query:\n")

for i, hit in enumerate(result.points, 1):
    payload = hit.payload
    print(f"#{i} | score={round(hit.score,4)} | file={payload.get('filename')}")
    print("   Text:", payload.get("text"))



üîç Top matching images for text query:

#1 | score=0.3393 | file=ApWlr04RyNC5pjblTa3LpZSIDfS4eEtrbp7bmNCe8PKZ.jpeg
   Text: None
#2 | score=0.3393 | file=ApWlr04RyNC5pjblTa3LpZSIDfS4eEtrbp7bmNCe8PKZ.jpeg
   Text: None
#3 | score=0.2959 | file=AiBkGp1mgMXkXz0uknAvf-m4gz0nWEyPEXp1PBbJ19Fc.jpeg
   Text: None
#4 | score=0.2954 | file=AtTT8yb4T6at_mrki7vGxq32kKGxj5m_VHQDzR6iVzY2.jpeg
   Text: None
#5 | score=0.2933 | file=Screenshot 2026-01-21 193339.png
   Text: None
