In [1]:
import pandas as pd
import uuid
import torch
import clip
from qdrant_client import QdrantClient
from dotenv import load_dotenv
import os
from tqdm import tqdm
import json
from qdrant_client.models import PointStruct

In [2]:
# import sys, qdrant_client
# from importlib.metadata import version

# print("Python exe:", sys.executable)
# print("Qdrant path:", qdrant_client.__file__)
# print("Qdrant version:", version("qdrant-client"))


In [3]:
# The CLIP model is loaded once, together with device selection, in the
# `device = ...` cell further down (before its first use). Loading the same
# checkpoint here as well only doubled start-up time and memory.

In [4]:
# Read the Qdrant connection settings from the environment (load_dotenv()
# first pulls in a local .env file) so no secret is hard-coded here.
load_dotenv()

QDRANT_KEY = os.getenv("QDRANT_KEY")
QDRANT_URL = os.getenv("QDRANT_URL")

# Stop with a readable message instead of building a client without a URL.
if not QDRANT_URL:
    raise RuntimeError(
        "QDRANT_URL is not set; define it in the environment or in a .env file"
    )

client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_KEY,
)

In [5]:
# One-off collection (re)creation, kept commented out so that running the
# notebook leaves the existing collection untouched. The commented config
# declares two named vectors, "text" and "vision", each of size 512 with
# cosine distance.
# client.delete_collection("GNOSIS")

# from qdrant_client.models import VectorParams, Distance

# client.create_collection(
#     collection_name="GNOSIS",
#     vectors_config={
#         "text": VectorParams(size=512, distance=Distance.COSINE),
#         "vision": VectorParams(size=512, distance=Distance.COSINE),
#     }
# )

# Print the collection's current status and configuration.
print(client.get_collection("GNOSIS"))

status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=36448 points_count=42706 segments_count=2 config=CollectionConfig(params=CollectionParams(vectors={'text': VectorParams(size=512, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), 'vision': VectorParams(size=512, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None)}, shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memma

In [6]:
# Location of the source CSV. Set the GNOSIS_CSV_PATH environment variable to
# ingest a file stored elsewhere; the default is the path this notebook was
# originally written against.
CSV_PATH = os.getenv(
    "GNOSIS_CSV_PATH",
    "D:/STUDY/PROJECTS/GNOSIS/Resources/FIR file.csv",
)
df = pd.read_csv(CSV_PATH)

# Fail early, with a readable message, when the file lacks a column that the
# ingestion loop reads from each row.
REQUIRED_COLUMNS = ("title", "text", "url", "date", "label")
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise ValueError(f"{CSV_PATH} is missing required column(s): {missing_columns}")

points = [] # store points to be uploaded in batch

In [7]:
# Run CLIP on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# `model` is the ViT-B/32 CLIP checkpoint placed on `device`; `preprocess`
# is the second value clip.load returns alongside it.
model, preprocess = clip.load("ViT-B/32", device=device)

In [8]:
def chunk_text(text: str, max_words: int = 200) -> list:
    """Split ``text`` into consecutive chunks of at most ``max_words`` words.

    Words are whatever ``str.split()`` yields, so any run of whitespace
    (spaces, tabs, newlines) separates words and is collapsed to a single
    space inside each chunk.

    Args:
        text: The text to split.
        max_words: Maximum number of words per chunk; must be positive.

    Returns:
        A list of chunk strings in original order. Empty or whitespace-only
        text gives an empty list.

    Raises:
        ValueError: If ``max_words`` is not positive. A negative value would
            otherwise produce an empty range and silently drop all the text.
    """
    if max_words <= 0:
        raise ValueError(f"max_words must be a positive integer, got {max_words!r}")

    words = text.split()
    return [
        " ".join(words[start:start + max_words])
        for start in range(0, len(words), max_words)
    ]

In [9]:
# Target Qdrant collection for every upsert and query below.
COLLECTION_NAME = "GNOSIS"

# Constant metadata copied into the payload of every ingested chunk.
DOMAIN, MODALITY = "medical", "article"
DATASET_NAME = "Medical_Demo"
LANGUAGE = "en"

# Batching thresholds.
BATCH_SIZE = 64      # pending chunks needed to trigger an embedding pass
UPLOAD_BATCH = 1000  # buffered points needed to trigger an upsert

# Mutable accumulators shared by the ingestion cells: points waiting to be
# uploaded, chunk texts waiting to be embedded, and the metadata dict that
# belongs to each waiting chunk (same order as text_batch).
points_buffer, text_batch, meta_batch = [], [], []


In [10]:
# Ingest the CSV row by row: build a "Title / Article" document, split it
# into word chunks, embed pending chunks with CLIP's text encoder once at
# least BATCH_SIZE are waiting, and upsert the resulting points into the
# "text" named vector once at least UPLOAD_BATCH are buffered. Chunks and
# points still pending when the loop ends are left in text_batch /
# meta_batch / points_buffer.
for row_idx, row in tqdm(df.iterrows(), total=len(df)):

    title = str(row["title"]) if not pd.isna(row["title"]) else ""
    content = str(row["text"]) if not pd.isna(row["text"]) else ""

    source_url = str(row["url"]) if not pd.isna(row["url"]) else ""
    date = str(row["date"]) if not pd.isna(row["date"]) else ""
    # Strip stray whitespace so values such as "fake " and "fake" are stored
    # as the same label.
    label = str(row["label"]).strip() if not pd.isna(row["label"]) else ""

    # Rows without article text have nothing to embed.
    if not content.strip():
        continue

    doc_id = f"medical_{row_idx}"

    # =========================
    # Build document
    # =========================
    full_text = f"""
    Title: {title}

    Article:
    {content}
    """.strip()

    # =========================
    # Chunk
    # =========================
    # NOTE(review): clip.tokenize(..., truncate=True) below cuts each chunk to
    # the model's context length, so the tail of a 200-word chunk is likely
    # not represented in its embedding — confirm the chunk size is intended.
    chunks = chunk_text(full_text, max_words=200)

    for chunk_id, chunk in enumerate(chunks):
        text_batch.append(chunk)

        meta_batch.append({
            "doc_id": doc_id,
            "chunk_id": chunk_id,

            "domain": DOMAIN,
            "modality": MODALITY,
            "dataset": DATASET_NAME,

            "title": title,
            "source_url": source_url,
            "date": date,

            "label": label,
            "language": LANGUAGE
        })

    # =========================
    # Embed & Upload
    # =========================
    if len(text_batch) >= BATCH_SIZE:

        tokens = clip.tokenize(text_batch, truncate=True).to(device)

        with torch.no_grad():
            vecs = model.encode_text(tokens).cpu().numpy()

        for vec, meta, text_ in zip(vecs, meta_batch, text_batch):
            # Deterministic id: the same dataset/document/chunk/text always
            # maps to the same point, so re-running the ingestion overwrites
            # existing points instead of inserting duplicates. The chunk text
            # is part of the key so different content never shares an id.
            point_key = f"{meta['dataset']}|{meta['doc_id']}|{meta['chunk_id']}|{text_}"
            point = PointStruct(
                id=str(uuid.uuid5(uuid.NAMESPACE_URL, point_key)),
                vector={
                    "text": vec.tolist()
                },
                payload={
                    **meta,
                    "chunk_text": text_
                }
            )
            points_buffer.append(point)

        text_batch = []
        meta_batch = []

        if len(points_buffer) >= UPLOAD_BATCH:
            client.upsert(collection_name=COLLECTION_NAME, points=points_buffer)
            points_buffer = []

100%|██████████| 14/14 [00:00<00:00, 3508.83it/s]


In [11]:
# Embed the chunks that were still pending when the ingestion loop ended and
# move them into points_buffer.
if text_batch:
    tokens = clip.tokenize(text_batch, truncate=True).to(device)

    with torch.no_grad():
        vecs = model.encode_text(tokens).cpu().numpy()

    for vec, meta, text_ in zip(vecs, meta_batch, text_batch):
        # Deterministic id: the same dataset/document/chunk/text always maps
        # to the same point, so re-ingesting overwrites instead of
        # duplicating.
        point_key = f"{meta['dataset']}|{meta['doc_id']}|{meta['chunk_id']}|{text_}"
        point = PointStruct(
            id=str(uuid.uuid5(uuid.NAMESPACE_URL, point_key)),
            vector={
                "text": vec.tolist()
            },
            payload={
                **meta,
                "chunk_text": text_
            }
        )

        points_buffer.append(point)

    # The pending chunks are now buffered as points; clear them so running
    # this cell again does not buffer the same chunks a second time.
    text_batch = []
    meta_batch = []

In [12]:
# Upload whatever points are still buffered, then empty the buffer (as the
# ingestion loop does after its own upserts) so a later run does not send
# the same points again.
if points_buffer:
    client.upsert(collection_name=COLLECTION_NAME, points=points_buffer)
    points_buffer = []

print("✅ Ingestion completed.")

✅ Ingestion completed.


In [13]:
def normalize_payload(payload: dict, score: float):
    """Flatten a search hit's payload into a fixed-shape result dict.

    Args:
        payload: The hit's payload. Anything that is not a dict is treated
            as having no fields.
        score: The hit's similarity score; converted with ``float()``.

    Returns:
        A dict with keys ``label``, ``title``, ``text``, ``date``, ``url``,
        ``image_url``, ``video_url`` and ``score`` (in that order). ``text``
        comes from the payload's ``chunk_text`` and ``url`` from
        ``source_url``; missing or falsy payload values become ``""``.
    """
    # Convert first so an unusable score fails before any payload access.
    numeric_score = float(score)

    # (output key, payload key) pairs, in output order.
    field_sources = (
        ("label", "label"),
        ("title", "title"),
        ("text", "chunk_text"),
        ("date", "date"),
        ("url", "source_url"),
        ("image_url", "image_url"),
        ("video_url", "video_url"),
    )

    fields = payload if isinstance(payload, dict) else {}
    normalized = {dest: fields.get(src) or "" for dest, src in field_sources}
    normalized["score"] = numeric_score
    return normalized

In [22]:
QUERY_TEXT = "Sonu Sood for fake charity and improper funds use"

# Embed the query with the same CLIP text encoder used for the stored chunks.
with torch.no_grad():
    tokens = clip.tokenize([QUERY_TEXT], truncate=True).to(device)
    vec = model.encode_text(tokens).cpu().numpy()[0]

# Fetch the 5 nearest points on the "text" named vector, payload included.
result = client.query_points(
    collection_name=COLLECTION_NAME,
    using="text",
    query=vec.tolist(),
    with_payload=True,
    limit=5,
)
hits = result.points

# Empty per-label tally that the next cell fills from `hits`.
summary = {
    verdict: {"count": 0, "items": []}
    for verdict in ("fake", "real")
}

In [23]:
# Start from an empty tally every time this cell runs; otherwise re-running
# it would add the same hits to the counts again.
summary = {
    "fake": {"count": 0, "items": []},
    "real": {"count": 0, "items": []},
}

# Group the hits by their normalised label; hits whose label is neither
# "fake" nor "real" are ignored.
for hit in hits:
    item = normalize_payload(hit.payload, hit.score)
    label = (item["label"] or "").lower().strip()

    if label in summary:
        summary[label]["count"] += 1
        summary[label]["items"].append(item)

# Majority vote; a tie (including no labelled hits at all) is "uncertain".
if summary["fake"]["count"] > summary["real"]["count"]:
    final_verdict = "fake"
elif summary["real"]["count"] > summary["fake"]["count"]:
    final_verdict = "real"
else:
    final_verdict = "uncertain"

final_output = {
    "final_verdict": final_verdict,
    "fake": summary["fake"],
    "real": summary["real"]
}

In [24]:
# Pretty-print the verdict as JSON; ensure_ascii=False leaves non-ASCII
# characters (for example the rupee sign in titles) unescaped in the output.
print(json.dumps(final_output, indent=2, ensure_ascii=False))

{
  "final_verdict": "fake",
  "fake": {
    "count": 5,
    "items": [
      {
        "label": "fake ",
        "title": "Sonu Sood exposed for fake charity spending during pandemic",
        "text": "Title: Sonu Sood exposed for fake charity spending during pandemic Article: Viral posts claim IT exposed Sonu Sood for fake charity and improper funds use, donating only a fraction of money collected.",
        "date": "08-08-2021",
        "url": "https://reddit.com/sood-fake-charity",
        "image_url": "",
        "video_url": "",
        "score": 0.8965769
      },
      {
        "label": "fake",
        "title": "Did Sonu Sood misuse charity funds of ₹19 crore?",
        "text": "Title: Did Sonu Sood misuse charity funds of ₹19 crore? Article: Blog post claiming evidence of Sonu Sood misusing ₹19 cr COVID-19 donations with minimal actual aid.",
        "date": "30-07-2021",
        "url": "https://healthrumors.net/sood-misuse",
        "image_url": "",
        "video_url": "",
 