In [4]:
import pandas as pd
import uuid
import torch
import clip
from qdrant_client import QdrantClient
from dotenv import load_dotenv
import os
from tqdm import tqdm
import json
from qdrant_client.models import PointStruct

In [5]:
# import sys, qdrant_client
# from importlib.metadata import version

# print("Python exe:", sys.executable)
# print("Qdrant path:", qdrant_client.__file__)
# print("Qdrant version:", version("qdrant-client"))


In [6]:
# NOTE: CLIP is deliberately not loaded in this cell. `model` and `preprocess`
# are bound further down, in the cell that picks `device` and calls
# `clip.load("ViT-B/32", device=device)`. Loading here as well read the same
# weights a second time, and the result was overwritten before anything used it.

In [7]:
# Read the Qdrant connection settings from the environment (load_dotenv() also
# picks up a local `.env` file) so that no credentials live in the notebook.
load_dotenv()

QDRANT_KEY = os.getenv("QDRANT_KEY")
QDRANT_URL = os.getenv("QDRANT_URL")

# Fail fast when the URL is missing: otherwise the client is built with
# url=None and the mistake only surfaces later, in the middle of ingestion, as
# a connection or "collection not found" error against the wrong endpoint.
if not QDRANT_URL:
    raise RuntimeError(
        "QDRANT_URL is not set; define it in the environment or in a .env file."
    )

# QDRANT_KEY is passed through unchanged. It may be None, e.g. for an instance
# without authentication -- confirm for your deployment.
client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_KEY,
)

In [8]:
# client.delete_collection("GNOSIS")

# from qdrant_client.models import VectorParams, Distance

# client.create_collection(
#     collection_name="GNOSIS",
#     vectors_config={
#         "text": VectorParams(size=512, distance=Distance.COSINE),
#         "vision": VectorParams(size=512, distance=Distance.COSINE),
#     }
# )

# print(client.get_collection("GNOSIS"))

In [21]:
# Location of the CSV to ingest. The original absolute path stays as the
# default, but it can be overridden with the GNOSIS_DATASET_CSV environment
# variable so the notebook also runs on machines with a different layout.
DATASET_CSV = os.getenv(
    "GNOSIS_DATASET_CSV",
    "D:/STUDY/PROJECTS/GNOSIS/Resources/gnosis_politics_dataset_20k_real_urls.csv",
)
df = pd.read_csv(DATASET_CSV)
# df.head()

# The ingestion loop indexes each row by these column names. Checking them
# here turns a KeyError on the first row into an immediate, explicit error.
REQUIRED_COLUMNS = ("title", "text", "url", "date", "label")
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(f"Dataset is missing required columns: {missing_columns}")

# NOTE(review): the cells visible here accumulate into `points_buffer`, not
# `points`; kept in case a cell outside this view still relies on it.
points = [] # store points to be uploaded in batch

In [22]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Bind the CLIP ViT-B/32 model and its companion `preprocess` object on the
# selected device; every later cell embeds text through this `model`.
model, preprocess = clip.load("ViT-B/32", device=device)
# model.eval()

In [23]:
def chunk_text(text, max_words=200):
    """Split ``text`` into consecutive chunks of at most ``max_words`` words.

    Words are whatever ``str.split()`` yields (whitespace-delimited), and each
    chunk is re-joined with single spaces, so the original spacing and line
    breaks are not preserved.

    Args:
        text: The string to split.
        max_words: Maximum number of words per chunk; must be >= 1.

    Returns:
        A list of chunk strings in document order. The list is empty when
        ``text`` contains no words.

    Raises:
        ValueError: If ``max_words`` is smaller than 1. Previously a negative
            value silently produced no chunks at all and ``0`` failed with an
            unrelated ``range()`` error.
    """
    if max_words < 1:
        raise ValueError(f"max_words must be a positive integer, got {max_words!r}")

    words = text.split()
    return [
        " ".join(words[start:start + max_words])
        for start in range(0, len(words), max_words)
    ]

In [24]:
# --- Qdrant target ---------------------------------------------------------
COLLECTION_NAME = "GNOSIS"

# --- Tags copied into the payload of every ingested chunk -------------------
# NOTE(review): these say "medical" while the CSV loaded above is named
# "...politics_dataset..."; confirm the tags match the data being ingested.
DOMAIN, MODALITY = "medical", "article"
DATASET_NAME, LANGUAGE = "Medical_Demo", "en"

# --- Batching thresholds ----------------------------------------------------
BATCH_SIZE = 64       # pending chunks required before the loop embeds them
UPLOAD_BATCH = 1000   # buffered points required before the loop upserts them

# --- Mutable ingestion state (three independent lists) ----------------------
# points_buffer: embedded points waiting to be uploaded
# text_batch / meta_batch: chunk texts and their metadata waiting to be embedded
points_buffer, text_batch, meta_batch = [], [], []


In [26]:
def _as_text(value):
    """Return ``value`` as ``str``, mapping a missing cell (``pd.isna``) to ``""``."""
    return "" if pd.isna(value) else str(value)


# Ingestion loop: turn every CSV row into word chunks, embed the chunks with
# CLIP's text encoder once at least BATCH_SIZE are pending, and upsert the
# resulting points once at least UPLOAD_BATCH are buffered. Whatever is still
# pending when the loop ends is left in text_batch / meta_batch / points_buffer
# for the follow-up cells to flush.
for row_idx, row in tqdm(df.iterrows(), total=len(df)):

    title = _as_text(row["title"])
    content = _as_text(row["text"])

    source_url = _as_text(row["url"])
    date = _as_text(row["date"])
    label = _as_text(row["label"])

    # Rows without article text would only yield a title-only chunk; skip them.
    if not content.strip():
        continue

    doc_id = f"medical_{row_idx}"

    # =========================
    # Build document
    # =========================
    full_text = f"""
    Title: {title}

    Article:
    {content}
    """.strip()

    # =========================
    # Chunk
    # =========================
    # NOTE(review): `truncate=True` below cuts every chunk to CLIP's token
    # context window (77 tokens for the published models -- confirm). A
    # 200-word chunk is therefore likely embedded from its opening part only,
    # while the whole chunk is stored as `chunk_text`. Consider a smaller
    # `max_words`.
    chunks = chunk_text(full_text, max_words=200)

    for chunk_id, chunk in enumerate(chunks):
        text_batch.append(chunk)

        meta_batch.append({
            "doc_id": doc_id,
            "chunk_id": chunk_id,

            "domain": DOMAIN,
            "modality": MODALITY,
            "dataset": DATASET_NAME,

            "title": title,
            "source_url": source_url,
            "date": date,

            "label": label,
            "language": LANGUAGE
        })

    # =========================
    # Embed & Upload
    # =========================
    # Checked once per row, so a batch can exceed BATCH_SIZE by the number of
    # chunks the current row contributed.
    if len(text_batch) >= BATCH_SIZE:

        tokens = clip.tokenize(text_batch, truncate=True).to(device)

        with torch.no_grad():
            vecs = model.encode_text(tokens).cpu().numpy()

        for vec, meta, text_ in zip(vecs, meta_batch, text_batch):
            # Deterministic ID: the same dataset/document/chunk/text always
            # maps to the same point, so re-running the ingestion overwrites
            # points instead of duplicating them (random uuid4 IDs did the
            # latter). The chunk text is part of the name so that different
            # content never collides on a reused doc_id.
            point_id = str(uuid.uuid5(
                uuid.NAMESPACE_URL,
                f"{meta['dataset']}/{meta['doc_id']}/{meta['chunk_id']}/{text_}",
            ))
            point = PointStruct(
                id=point_id,
                vector={
                    "text": vec.tolist()
                },
                payload={
                    **meta,
                    "chunk_text": text_
                }
            )
            points_buffer.append(point)

        text_batch = []
        meta_batch = []

        if len(points_buffer) >= UPLOAD_BATCH:
            client.upsert(collection_name=COLLECTION_NAME, points=points_buffer)
            points_buffer = []

100%|██████████| 20000/20000 [01:10<00:00, 282.29it/s]


In [27]:
# Embed the chunks that were still pending when the main loop finished (fewer
# than BATCH_SIZE remained after the last in-loop flush) and queue them for the
# final upload.
if text_batch:
    tokens = clip.tokenize(text_batch, truncate=True).to(device)

    with torch.no_grad():
        vecs = model.encode_text(tokens).cpu().numpy()

    for vec, meta, text_ in zip(vecs, meta_batch, text_batch):
        # Deterministic ID derived from dataset/document/chunk/text, so
        # ingesting the same chunk again overwrites its point instead of
        # adding a duplicate under a fresh random UUID.
        point_id = str(uuid.uuid5(
            uuid.NAMESPACE_URL,
            f"{meta['dataset']}/{meta['doc_id']}/{meta['chunk_id']}/{text_}",
        ))
        point = PointStruct(
            id=point_id,
            vector={
                "text": vec.tolist()
            },
            payload={
                **meta,
                "chunk_text": text_
            }
        )

        points_buffer.append(point)

    # Clear the pending batches, as the main loop does after each flush.
    # Without this, re-running the cell embedded and queued the same chunks a
    # second time.
    text_batch = []
    meta_batch = []

In [28]:
# Upload whatever points are still buffered after the loop and the tail flush.
if points_buffer:
    client.upsert(collection_name=COLLECTION_NAME, points=points_buffer)
    # Empty the buffer once it has been sent, matching the main loop, so that
    # re-running this cell does not upload the same batch again.
    points_buffer = []

print("✅ Ingestion completed.")

✅ Ingestion completed.


In [29]:
def normalize_payload(payload: dict, score: float):
    """Flatten a search hit's payload into a fixed-shape result record.

    Args:
        payload: Payload of a hit. Anything that is not a ``dict`` (for
            example ``None``) is treated as an empty payload.
        score: Similarity score of the hit; converted with ``float()``.

    Returns:
        A dict with the keys ``label``, ``title``, ``text``, ``date``, ``url``,
        ``image_url``, ``video_url`` and ``score``, in that order. ``text`` is
        read from the payload's ``chunk_text`` and ``url`` from its
        ``source_url``; every missing or falsy payload value becomes ``""``.
    """
    # Convert the score first so an invalid score fails before anything else.
    numeric_score = float(score)

    # (output key, payload key) pairs, listed in the output order.
    field_sources = (
        ("label", "label"),
        ("title", "title"),
        ("text", "chunk_text"),
        ("date", "date"),
        ("url", "source_url"),
        ("image_url", "image_url"),
        ("video_url", "video_url"),
    )

    record = payload if isinstance(payload, dict) else {}
    normalized = {
        out_key: record.get(src_key) or ""
        for out_key, src_key in field_sources
    }
    normalized["score"] = numeric_score
    return normalized

In [30]:
# Claim to look up in the collection.
QUERY_TEXT = "can remdesivir treat people with covid-19"

# Embed the query with the same CLIP text encoder used for ingestion.
tokens = clip.tokenize([QUERY_TEXT], truncate=True).to(device)
with torch.no_grad():
    vec = model.encode_text(tokens).cpu().numpy()[0]

# Fetch the five nearest chunks from the named "text" vector, with payloads.
result = client.query_points(
    collection_name=COLLECTION_NAME,
    using="text",
    query=vec.tolist(),
    with_payload=True,
    limit=5,
)
hits = result.points

# One empty tally per label; the next cell fills these in.
summary = {
    verdict: {"count": 0, "items": []}
    for verdict in ("fake", "real")
}

In [31]:
# Start from fresh tallies. This cell used to accumulate into the `summary`
# created by the query cell, so running it twice counted every hit twice.
summary = {
    "fake": {"count": 0, "items": []},
    "real": {"count": 0, "items": []},
}

# Bucket each hit by its (case-insensitive, trimmed) label; hits whose label is
# neither "fake" nor "real" are ignored.
for hit in hits:
    item = normalize_payload(hit.payload, hit.score)
    # str() keeps a non-string label (e.g. a numeric payload value) from
    # crashing on .lower(); such a label simply matches no bucket.
    label = str(item["label"] or "").lower().strip()

    if label in summary:
        summary[label]["count"] += 1
        summary[label]["items"].append(item)

# Majority vote over the retrieved hits; a tie (including no labelled hits at
# all) is reported as "uncertain".
fake_count = summary["fake"]["count"]
real_count = summary["real"]["count"]

if fake_count > real_count:
    final_verdict = "fake"
elif real_count > fake_count:
    final_verdict = "real"
else:
    final_verdict = "uncertain"

final_output = {
    "final_verdict": final_verdict,
    "fake": summary["fake"],
    "real": summary["real"]
}

In [32]:
# Pretty-print the verdict; ensure_ascii=False keeps non-ASCII text readable.
print(
    json.dumps(
        final_output,
        ensure_ascii=False,
        indent=2,
    )
)

{
  "final_verdict": "real",
  "fake": {
    "count": 0,
    "items": []
  },
  "real": {
    "count": 5,
    "items": [
      {
        "label": "real",
        "title": "Global study finds remdesivir in case 12017-9n4z5",
        "text": "Title: Global study finds remdesivir in case 12017-9n4z5 Article: Health authorities note that remdesivir should be evaluated carefully. This piece references multiple observations collected during case 12017-9n4z5. The discussion includes background context, reported outcomes, and expert commentary. While some claims appear convincing, others require further verification and controlled studies. The goal of this write-up is to summarize what is currently known and what remains uncertain.",
        "date": "2023-04-24",
        "url": "https://www.bbc.com/health/covid/remdesivir/2023/12017-9n4z5",
        "image_url": "",
        "video_url": "",
        "score": 0.8327076
      },
      {
        "label": "real",
        "title": "Independent review