In [None]:
import pandas as pd
import uuid
import torch
import clip
from qdrant_client import QdrantClient
from dotenv import load_dotenv
import os
from tqdm import tqdm

In [2]:
# Load the CLIP "ViT-B/32" checkpoint; clip.load returns the model and a preprocess object.
# NOTE(review): a later cell calls clip.load("ViT-B/32", device=device) and rebinds both
# names, so this first load looks redundant — confirm before removing.
model, preprocess = clip.load("ViT-B/32")

In [3]:
# Pull settings from a .env file (if any) into the process environment.
load_dotenv()

# Qdrant connection settings; os.getenv yields None for any variable that is not set.
QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_KEY = os.getenv("QDRANT_KEY")

# Shared Qdrant client used by the cells below.
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_KEY)

In [4]:
# List the collections visible to this client (presumably a quick connectivity check).
print(client.get_collections())

collections=[CollectionDescription(name='GNOSIS')]


In [5]:
# Location of the FakeNewsNet CSV. Set the FAKENEWSNET_CSV environment variable to run
# the notebook on another machine; when it is unset the original local path is used.
FAKENEWSNET_CSV = os.getenv(
    "FAKENEWSNET_CSV",
    "D:/STUDY/PROJECTS/GNOSIS/Resources/FakeNewsNet.csv/FakeNewsNet.csv",
)

df = pd.read_csv(FAKENEWSNET_CSV)
# df.head()

points = [] # store points to be uploaded in batch

In [6]:
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load CLIP ViT-B/32 directly onto the selected device.
model, preprocess = clip.load("ViT-B/32", device=device)
# model.eval()

In [7]:
# def chunk_text(text, max_words=60):
#     words = text.split()
#     return [" ".join(words[i:i+max_words]) for i in range(0, len(words), max_words)]

In [8]:
# BATCH_SIZE: number of texts collected before they are embedded together.
# UPLOAD_BATCH: number of buffered points that triggers an upsert.
BATCH_SIZE, UPLOAD_BATCH = 64, 1000

# Accumulators filled by the ingestion loop: pending texts, their metadata,
# and the points waiting to be uploaded.
text_batch, meta_batch, points_buffer = [], [], []

In [15]:
def _build_points(texts, metas):
    """Embed `texts` with CLIP and return one Qdrant point dict per (text, meta) pair.

    Args:
        texts: list of strings to embed (tokenized with truncation).
        metas: list of payload dicts aligned with `texts`; each must contain "doc_id".

    Returns:
        A list of dicts with "id", "vector" and "payload" keys, ready for client.upsert.
    """
    tokens = clip.tokenize(texts, truncate=True).to(device)

    with torch.no_grad():
        vecs = model.encode_text(tokens).cpu().numpy()

    return [
        {
            # Deterministic id derived from doc_id: upserting the same row again
            # overwrites the existing point instead of adding a duplicate, so the
            # cell can be re-run safely.
            "id": str(uuid.uuid5(uuid.NAMESPACE_URL, f"Gossip_Title_Dataset/{meta['doc_id']}")),
            "vector": vec.tolist(),
            "payload": {
                "modality": "text",
                "dataset": "Gossip_Title_Dataset",
                "text": text_,
                **meta,
            },
        }
        for vec, meta, text_ in zip(vecs, metas, texts)
    ]


def _upload_points(points):
    """Upsert `points` into the GNOSIS collection; does nothing for an empty list."""
    if points:
        client.upsert(collection_name="GNOSIS", points=points)


# Always start from empty buffers so that re-running this cell never re-embeds
# or re-uploads leftovers from a previous run.
text_batch, meta_batch, points_buffer = [], [], []

for row_idx, row in tqdm(df.iterrows(), total=len(df)):
    title = str(row["title"]) if not pd.isna(row["title"]) else ""
    source = str(row["source_domain"]) if not pd.isna(row["source_domain"]) else ""
    url = str(row["news_url"]) if not pd.isna(row["news_url"]) else ""

    # Rows without a title have nothing to embed
    if not title.strip():
        continue

    # Build text to embed
    full_text = f"{title}. Source: {source}".strip()

    # Label: the "real" column is 1 for real news, anything else is treated as fake
    label = int(row["real"])
    label_str = "real" if label == 1 else "fake"

    doc_id = f"news_{row_idx}"

    text_batch.append(full_text)
    meta_batch.append({
        "doc_id": doc_id,
        "title": title,
        "source_domain": source,
        "news_url": url,
        "tweet_num": int(row["tweet_num"]) if not pd.isna(row["tweet_num"]) else 0,
        "label": label_str
    })

    # When batch is full → embed
    if len(text_batch) >= BATCH_SIZE:
        points_buffer.extend(_build_points(text_batch, meta_batch))
        text_batch, meta_batch = [], []

        # Upload once enough points have accumulated
        if len(points_buffer) >= UPLOAD_BATCH:
            _upload_points(points_buffer)
            points_buffer = []

# Flush leftovers: embed the final partial batch, then upload whatever is buffered
if text_batch:
    points_buffer.extend(_build_points(text_batch, meta_batch))
    text_batch, meta_batch = [], []

_upload_points(points_buffer)
points_buffer = []

100%|██████████| 23196/23196 [01:12<00:00, 319.31it/s]


In [9]:
def normalize_payload(payload: dict, score: float):
    """Map a raw point payload onto a fixed result schema.

    The returned dict always has the keys "label", "title", "text", "url",
    "image_url", "video_url" and "score". Missing or falsy payload values become
    the empty string; a payload that is not a dict yields all-empty fields.

    Args:
        payload: payload dict of a search hit (anything else is treated as empty).
        score: similarity score of the hit; converted with float().

    Returns:
        The normalized dict described above.
    """
    score_value = float(score)
    fields = payload if isinstance(payload, dict) else {}

    def first_truthy(*keys):
        # Return the first truthy value found under `keys`, else "".
        for key in keys:
            value = fields.get(key)
            if value:
                return value
        return ""

    return {
        "label": first_truthy("label"),
        "title": first_truthy("title"),
        # Text may live under either key; "chunk_text" wins when both are set
        "text": first_truthy("chunk_text", "text"),
        # Same for the URL: "news_url" is preferred over "url"
        "url": first_truthy("news_url", "url"),
        "image_url": first_truthy("image_url"),
        "video_url": first_truthy("video_url"),
        "score": score_value,
    }


In [None]:
# Qdrant client for querying, with a 60 second timeout
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_KEY, timeout=60)

# Init CLIP on the GPU when available, otherwise on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
model.eval()

QUERY_TEXT = "Are Miley Cyrus and Liam Hemsworth married???"

# Embed the query with the CLIP text encoder
tokens = clip.tokenize([QUERY_TEXT], truncate=True).to(device)
with torch.no_grad():
    vec = model.encode_text(tokens).cpu().numpy()[0]

# Fetch the 5 nearest points together with their payloads
result = client.query_points(
    collection_name="GNOSIS",
    query=vec.tolist(),
    limit=5,
    with_payload=True,
)
hits = result.points

# One bucket per known label
summary = {verdict: {"count": 0, "items": []} for verdict in ("fake", "real")}

# Sort each hit into its label bucket; hits with any other label are ignored
for hit in hits:
    item = normalize_payload(hit.payload, hit.score)
    label = (item["label"] or "").lower().strip()

    bucket = summary.get(label)
    if bucket is None:
        continue

    bucket["count"] += 1
    bucket["items"].append(item)

# Majority vote between the two buckets; equal counts give "uncertain"
fake_count = summary["fake"]["count"]
real_count = summary["real"]["count"]

if fake_count == real_count:
    final_verdict = "uncertain"   # tie case
elif fake_count > real_count:
    final_verdict = "fake"
else:
    final_verdict = "real"

# Final output: the verdict plus, per label, the count and at most 3 example items
final_output = {
    "final_verdict": final_verdict,
    **{
        verdict: {
            "count": summary[verdict]["count"],
            "items": summary[verdict]["items"][:3],
        }
        for verdict in ("fake", "real")
    },
}


# -------------------------------
# Print final result
# -------------------------------

# print("\n" + "="*80)
# print("QUERY:", QUERY_TEXT)
# print("="*80)

# print("\nFINAL VERDICT:", final_output["final_verdict"].upper())

# print("\nFAKE COUNT:", final_output["fake"]["count"])
# for i, item in enumerate(final_output["fake"]["items"], 1):
#     print(f"\n[FAKE EXAMPLE {i}]")
#     print("Title:", item["title"])
#     print("URL:", item["url"])
#     print("Text:", item["text"][:300])
#     print("Score:", item["score"])


# print("\nREAL COUNT:", final_output["real"]["count"])
# for i, item in enumerate(final_output["real"]["items"], 1):
#     print(f"\n[REAL EXAMPLE {i}]")
#     print("Title:", item["title"])
#     print("URL:", item["url"])
#     print("Text:", item["text"][:300])
#     print("Score:", item["score"])


# If you want the raw dict:
# print("\n\nFINAL OUTPUT DICT:\n")
# print(final_output)

In [17]:
import json

print(json.dumps(final_output, indent=2, ensure_ascii=False))

{
  "final_verdict": "fake",
  "fake": {
    "count": 5,
    "items": [
      {
        "label": "fake",
        "title": "Miley Cyrus and Liam Hemsworth split for a second time: Reports",
        "text": "Miley Cyrus and Liam Hemsworth split for a second time: Reports. Source: meaww.com",
        "url": "meaww.com/miley-cyrus-and-liam-hemsworth-split-up-second-time",
        "image_url": "",
        "video_url": "",
        "score": 0.9425694
      },
      {
        "label": "fake",
        "title": "Did Miley Cyrus and Liam Hemsworth secretly get married?",
        "text": "Did Miley Cyrus and Liam Hemsworth secretly get married?. Source: www.dailymail.co.uk",
        "url": "www.dailymail.co.uk/tvshowbiz/article-5874213/Did-Miley-Cyrus-Liam-Hemsworth-secretly-married.html",
        "image_url": "",
        "video_url": "",
        "score": 0.92701393
      },
      {
        "label": "fake",
        "title": "Miley Cyrus & Liam Hemsworth’s Secret Wedding: The Vows, Her Dress & More