In [3]:
import pandas as pd
import uuid
import torch
import clip
from qdrant_client import QdrantClient
from dotenv import load_dotenv
import os
from tqdm import tqdm

In [4]:
# Load the CLIP "ViT-B/32" checkpoint; clip.load returns the model together
# with its companion `preprocess` object.
# NOTE(review): a later cell loads the same checkpoint again with an explicit
# `device=` argument, so this call looks redundant — confirm before removing.
model, preprocess = clip.load("ViT-B/32")

In [None]:
# Read variables from a local .env file (if any) into the process environment.
load_dotenv()

QDRANT_KEY = os.getenv("QDRANT_KEY")
QDRANT_URL = os.getenv("QDRANT_URL")

# os.getenv returns None for an unset variable. Stop here with a clear message
# instead of handing `url=None` to the client and failing (or connecting to an
# unintended endpoint) later on. The API key is left optional because
# unauthenticated deployments do not need one.
if not QDRANT_URL:
    raise RuntimeError(
        "QDRANT_URL is not set; define it in the environment or in a .env file."
    )

client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_KEY,
)

In [6]:
# Print the collections the client can see on the server (quick connectivity check).
print(client.get_collections())

collections=[CollectionDescription(name='GNOSIS')]


In [7]:
# Read the WELFake CSV into a DataFrame (path is relative to the notebook's
# working directory).
df = pd.read_csv(
    "./WELFake_Dataset.csv/WELFake_Dataset.csv",
)

# Holder for points to be uploaded in batch.
# NOTE(review): no visible cell references `points` (the ingestion loop uses
# `points_buffer`) — confirm it is unused before removing it.
points = list()

In [8]:
# Choose the compute device: GPU when CUDA is usable, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load CLIP ViT-B/32 onto the chosen device.
model, preprocess = clip.load("ViT-B/32", device=device)

In [9]:
def chunk_text(text, max_words=60):
    """Split `text` into consecutive chunks of at most `max_words` words.

    Words are whatever `str.split()` yields (whitespace-separated tokens), and
    each chunk is its words re-joined with single spaces. Empty or
    whitespace-only text produces an empty list.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), max_words):
        window = tokens[start:start + max_words]
        pieces.append(" ".join(window))
    return pieces

In [11]:
# Batching thresholds used by the ingestion loop.
BATCH_SIZE = 64       # number of text chunks embedded per CLIP forward pass
UPLOAD_BATCH = 1000   # buffered points that trigger an upsert

# Working buffers: chunk texts awaiting embedding, their aligned metadata,
# and finished points awaiting upload.
text_batch, meta_batch, points_buffer = [], [], []

In [None]:
# Ingestion: chunk every article, embed the chunks with CLIP in batches of
# BATCH_SIZE, and upsert the resulting points to the "GNOSIS" collection once
# at least UPLOAD_BATCH points have accumulated. Whatever is left over is
# flushed once, after the loop.

MAX_CHUNKS_PER_DOC = 20  # cap on chunks embedded per document


def _embed_to_points(texts, metas):
    """Embed text chunks with CLIP and wrap them as Qdrant point dicts.

    Args:
        texts: Chunk strings to embed. They are tokenized with
            ``truncate=True`` so over-long chunks do not raise.
        metas: Metadata dicts aligned one-to-one with ``texts``; each is
            merged into the payload of the corresponding point.

    Returns:
        A list of ``{"id", "vector", "payload"}`` dicts, one per chunk. Ids
        are random UUID4 strings, so re-running the ingestion inserts new
        points rather than overwriting earlier ones.
    """
    tokens = clip.tokenize(texts, truncate=True).to(device)
    with torch.no_grad():  # inference only, no gradients needed
        vecs = model.encode_text(tokens).cpu().numpy()

    return [
        {
            "id": str(uuid.uuid4()),
            "vector": vec.tolist(),
            "payload": {
                "modality": "text",
                "dataset": "WELFake_Dataset",
                "chunk_text": chunk,
                **meta,
            },
        }
        for vec, meta, chunk in zip(vecs, metas, texts)
    ]


# Start from empty buffers so re-running this cell (e.g. after an interrupted
# run) never embeds or uploads stale leftovers.
text_batch, meta_batch, points_buffer = [], [], []

for row_idx, row in tqdm(df.iterrows(), total=len(df)):
    title = str(row["title"]) if not pd.isna(row["title"]) else ""
    text = str(row["text"]) if not pd.isna(row["text"]) else ""

    full_text = (title + "\n\n" + text).strip()
    if not full_text:
        continue  # nothing to embed for this row

    label = int(row["label"])
    # NOTE(review): assumes label 1 means "fake" in this CSV — confirm against
    # the dataset documentation.
    label_str = "fake" if label == 1 else "real"

    doc_id = f"datasetX_{row_idx}"

    # Limit chunks per document so very long articles do not dominate the run.
    chunks = chunk_text(full_text, max_words=60)[:MAX_CHUNKS_PER_DOC]

    for chunk_idx, chunk in enumerate(chunks):
        text_batch.append(chunk)
        meta_batch.append({
            "doc_id": doc_id,
            "chunk_id": chunk_idx,
            "title": title,
            "label": label_str
        })

        # When the text batch is full -> embed it.
        if len(text_batch) >= BATCH_SIZE:
            points_buffer.extend(_embed_to_points(text_batch, meta_batch))
            text_batch, meta_batch = [], []

            # Upload once enough points have accumulated.
            if len(points_buffer) >= UPLOAD_BATCH:
                client.upsert(collection_name="GNOSIS", points=points_buffer)
                points_buffer = []

# Final flush. This must run after the loop: inside the batch branch the text
# batch has just been cleared, so the remainder would never be embedded and
# the point buffer would be re-sent after every batch.
if text_batch:
    points_buffer.extend(_embed_to_points(text_batch, meta_batch))
    text_batch, meta_batch = [], []

if points_buffer:
    client.upsert(collection_name="GNOSIS", points=points_buffer)
    points_buffer = []

In [None]:
# Client used for querying, created with an explicit timeout value.
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_KEY, timeout=60)

# Initialise CLIP on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
model.eval()

# Hardcoded query text
QUERY_TEXT = "Clinton says Trump may have violated U.S. law on Cuba"

# Embed the query: tokenize as a one-item batch, encode without gradients,
# then take the single resulting row as the query vector.
tokens = clip.tokenize([QUERY_TEXT], truncate=True).to(device)
with torch.no_grad():
    query_matrix = model.encode_text(tokens).cpu().numpy()
vec = query_matrix[0]

# Search the "GNOSIS" collection for the 5 closest points, payloads included.
result = client.query_points(
    collection_name="GNOSIS",
    query=vec.tolist(),
    limit=5,
    with_payload=True,
)
hits = result.points

# Print results: header first, then one section per hit.
rule = "=" * 80
print("\n" + rule)
print("QUERY:", QUERY_TEXT)
print(rule)

for i, hit in enumerate(hits, 1):
    payload, score = hit.payload, hit.score

    print(f"\n#{i}  SCORE = {score:.4f}")
    print("-" * 80)
    # Show at most the first 500 characters of the stored chunk.
    print("Text:", payload.get("chunk_text", "")[:500])
    print("\nMeta:")
    print("  doc_id:", payload.get("doc_id"))
    print("  chunk_id:", payload.get("chunk_id"))
    print("  label:", payload.get("label"))
    print("  title:", payload.get("title"))


QUERY: Clinton says Trump may have violated U.S. law on Cuba

#1  SCORE = 0.8477
--------------------------------------------------------------------------------
Text: Clinton says Trump may have violated U.S. law on Cuba CHICAGO (Reuters) - U.S. Democratic presidential nominee Hillary Clinton said on Thursday that Republican opponent Donald Trump may have violated U.S. law, following a news report that one of his companies attempted to do business in Cuba. Newsweek said on Thursday that a hotel and casino company controlled by Trump

Meta:
  doc_id: datasetX_17640
  chunk_id: 0
  label: real
  title: Clinton says Trump may have violated U.S. law on Cuba

#2  SCORE = 0.8110
--------------------------------------------------------------------------------
Text: whether Trump fills key Latin America posts at the State Department and elsewhere that remain vacant, sources told Reuters. The White House considered making a Cuba announcement on May 20 to mark the 115th anniversary of Cuba’s