Qdrant

> **Step 1 – Set up Qdrant Cloud**  
> 1. Create a Qdrant Cloud cluster and note the **cluster URL** and **API key** (both are shown in the Qdrant Cloud console).  
> 2. Replace `"provide your url here"` in the cell below, and the `QDRANT_URL` constant in `update_qdrant_auto.py`, with your cluster URL.  
> 3. When prompted in the notebook, paste your API key (input is hidden).  
>  
> The code below then:  
> - Connects to your Qdrant cluster  
> - Creates the `clinical_trials` collection (384-dim cosine) if needed  
> - Runs `update_qdrant_auto.py` to read ClinicalTrials.gov CSVs from Drive, clean them, embed with `all-MiniLM-L6-v2`, and upsert into Qdrant.


In [None]:
# Install Qdrant client
!pip install qdrant-client -q

from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
import getpass

# üîë Secure API Key Input (invisible)
import os

print("üîë Enter your Qdrant API Key (input will be hidden):")
qdrant_api_key = getpass.getpass("Qdrant API Key: ")

# Cheap sanity check only: a very short or empty key is almost certainly a
# paste error. The key itself is validated by the first request to the cluster.
if qdrant_api_key and len(qdrant_api_key) > 10:
    print("‚úÖ API Key captured securely")
else:
    print("‚ö†Ô∏è API Key seems invalid")

# Connect to your cluster. The QDRANT_URL environment variable, when set,
# takes precedence over the placeholder so the cell does not need editing.
client = QdrantClient(
    url=os.environ.get("QDRANT_URL", "provide your url here"),
    api_key=qdrant_api_key,
)

# Create the collection only when it is missing, so the cell can be re-run
# safely (create_collection fails if the collection already exists).
if client.collection_exists("clinical_trials"):
    print("‚úÖ Collection 'clinical_trials' already exists - skipping creation")
else:
    client.create_collection(
        collection_name="clinical_trials",
        vectors_config=VectorParams(size=384, distance=Distance.COSINE),
    )
    print("‚úÖ Collection 'clinical_trials' created successfully!")

# Verify
collections = client.get_collections()
print(f"\nüìä Collections: {collections}")


Load Data and Upload to Qdrant

In [1]:
# Mount Google Drive into the Colab filesystem; the pipeline's data folder
# lives under /content/drive.
from google.colab import drive

drive.mount("/content/drive")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%%writefile update_qdrant_auto.py
"""
Automatically finds ALL CSV files in Drive folder and uploads to Qdrant
No manual file listing needed!
"""

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, Distance, VectorParams
from tqdm import tqdm
import os
import glob

class QdrantAutoPipeline:
    """End-to-end loader: CSV files -> cleaned trials -> embeddings -> Qdrant.

    The pipeline discovers every ``*.csv`` file in a folder, concatenates and
    de-duplicates the rows by ``nct_id``, drops trials with an unwanted
    status, builds one "Title/Summary" text chunk per trial, embeds the
    chunks with ``all-MiniLM-L6-v2`` and upserts them into the
    ``clinical_trials`` collection.
    """

    # Vector size used when (re)creating the collection; it has to match the
    # dimensionality of the vectors produced by the embedding model.
    VECTOR_SIZE = 384

    def __init__(self, qdrant_url, qdrant_api_key):
        """Connect to Qdrant and load the sentence-embedding model.

        Args:
            qdrant_url: URL of the Qdrant cluster.
            qdrant_api_key: API key used to authenticate against the cluster.
        """
        self.client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
        self.embed_model = SentenceTransformer("all-MiniLM-L6-v2")
        self.collection_name = "clinical_trials"

    @staticmethod
    def _clean_text(value):
        """Return *value* as a stripped string, mapping missing values to ''."""
        # Without this check str(NaN) would leak the literal text "nan" into
        # titles and summaries.
        if pd.isna(value):
            return ""
        return str(value).strip()

    def _create_collection(self):
        """Create the target collection with cosine distance."""
        self.client.create_collection(
            collection_name=self.collection_name,
            vectors_config=VectorParams(
                size=self.VECTOR_SIZE, distance=Distance.COSINE
            ),
        )

    def find_all_csv_files(self, drive_folder_path):
        """Return the CSV files directly inside *drive_folder_path*.

        The result is sorted so that later "keep first" de-duplication is
        deterministic across runs (``glob`` itself returns files in arbitrary
        order). Sub-folders are not searched.

        Args:
            drive_folder_path: Folder to scan.

        Returns:
            A sorted list of paths; empty when nothing matches.
        """
        print(f"üîç Searching for CSV files in: {drive_folder_path}")

        # Escape the folder so characters such as "[" in its name are taken
        # literally instead of being read as glob wildcards.
        pattern = os.path.join(glob.escape(str(drive_folder_path)), "*.csv")
        csv_files = sorted(glob.glob(pattern))

        if not csv_files:
            print("‚ùå No CSV files found!")
            return []

        print(f"‚úÖ Found {len(csv_files)} CSV files:")
        for csv_file in csv_files:
            filename = os.path.basename(csv_file)
            size_mb = os.path.getsize(csv_file) / (1024 * 1024)
            print(f"   - {filename} ({size_mb:.1f} MB)")

        return csv_files

    def load_and_filter_csvs(self, csv_files):
        """Load the CSV files, de-duplicate by ``nct_id`` and filter by status.

        Files that cannot be read are reported and skipped. The ``status``
        column is normalised (stripped, title-cased) before filtering.

        Args:
            csv_files: Paths of the CSV files to load.

        Returns:
            The cleaned DataFrame, or ``None`` when no file could be loaded
            or the required ``nct_id`` / ``status`` columns are missing.
        """
        print("\nüìÇ Loading CSV files...")

        dfs = []
        for csv_path in csv_files:
            filename = os.path.basename(csv_path)
            print(f"   Loading {filename}...")
            try:
                df = pd.read_csv(csv_path)
            except Exception as e:
                # Best effort: one unreadable file must not abort the run.
                print(f"      ‚ö†Ô∏è Error loading {filename}: {e}")
                continue
            dfs.append(df)
            print(f"      ‚úÖ {len(df)} rows")

        if not dfs:
            print("‚ùå No data loaded!")
            return None

        # Concatenate all
        df_all = pd.concat(dfs, ignore_index=True)
        print(f"\n‚úÖ Total trials loaded: {len(df_all):,}")

        # Fail with a readable message instead of a KeyError further down.
        missing = [c for c in ("nct_id", "status") if c not in df_all.columns]
        if missing:
            print(f"‚ùå Missing required column(s): {', '.join(missing)}")
            return None

        # Rows without an ID cannot be de-duplicated or referenced later.
        df_all = df_all.dropna(subset=["nct_id"])

        # Remove duplicates by NCT ID
        initial_count = len(df_all)
        df_all = df_all.drop_duplicates(subset=["nct_id"], keep="first")
        duplicates_removed = initial_count - len(df_all)
        if duplicates_removed > 0:
            print(f"üóëÔ∏è Removed {duplicates_removed:,} duplicate trials")

        # Filter bad statuses
        df_all["status"] = df_all["status"].astype(str).str.strip().str.title()
        bad_status = ["Terminated", "Withdrawn", "Suspended", "No Longer Available", "Unknown"]
        df_clean = df_all[~df_all["status"].isin(bad_status)].copy()

        filtered_out = len(df_all) - len(df_clean)
        print(f"üóëÔ∏è Filtered out {filtered_out:,} trials with bad status")
        print(f"‚úÖ Final clean dataset: {len(df_clean):,} trials")

        return df_clean

    def create_chunks(self, df_clean):
        """Build one text chunk (title + summary) per trial.

        Trials whose summary is shorter than 20 characters are skipped.
        Missing titles or summaries are treated as empty strings.

        Args:
            df_clean: DataFrame with ``nct_id`` and ``status`` columns and,
                optionally, ``brief_title`` / ``brief_summary``.

        Returns:
            A list of dicts with keys ``nct_id``, ``title``, ``text`` and
            ``status``; these dicts are later stored as Qdrant payloads.
        """
        print("\nüìù Creating chunks...")

        chunks = []
        skipped = 0

        for _, row in tqdm(df_clean.iterrows(), total=len(df_clean), desc="Processing"):
            title = self._clean_text(row.get("brief_title", ""))
            summary = self._clean_text(row.get("brief_summary", ""))

            if len(summary) < 20:
                skipped += 1
                continue

            chunks.append({
                "nct_id": row["nct_id"],
                "title": title,
                "text": f"Title: {title}\nSummary: {summary}",
                "status": row["status"],
            })

        if skipped > 0:
            print(f"‚ö†Ô∏è Skipped {skipped:,} trials with insufficient summary")
        print(f"‚úÖ Created {len(chunks):,} chunks")

        return chunks

    def generate_embeddings(self, chunks):
        """Embed the ``text`` field of every chunk.

        Args:
            chunks: Chunk dicts as produced by :meth:`create_chunks`.

        Returns:
            The NumPy array returned by the model's ``encode`` call, one row
            per chunk, in the same order as *chunks*.
        """
        print("\nüß† Generating embeddings...")
        print("‚è≥ This may take several minutes for large datasets...")

        texts = [c["text"] for c in chunks]
        embeddings = self.embed_model.encode(
            texts,
            batch_size=64,
            show_progress_bar=True,
            convert_to_numpy=True,
        )

        print(f"‚úÖ Generated {len(embeddings):,} embeddings (shape: {embeddings.shape})")
        return embeddings

    def upload_to_qdrant(self, embeddings, chunks, mode="refresh"):
        """Upsert the vectors and their payloads into the collection.

        Args:
            embeddings: Vectors, one per chunk, in chunk order.
            chunks: Payload dicts matching *embeddings* one-to-one.
            mode: ``"refresh"`` drops and recreates the collection and numbers
                points from 0. Any other value is treated as ``"add"``: points
                are appended after the current point count, and the
                collection is created first if it does not exist yet.

        Raises:
            ValueError: If *embeddings* and *chunks* differ in length.
        """
        if len(embeddings) != len(chunks):
            raise ValueError(
                f"embeddings ({len(embeddings)}) and chunks ({len(chunks)}) "
                "must have the same length"
            )

        if mode == "refresh":
            print("\nüóëÔ∏è Deleting old collection...")
            # Ask explicitly instead of swallowing every exception, so that
            # connection or authentication failures are not misreported as
            # "no collection".
            if self.client.collection_exists(self.collection_name):
                self.client.delete_collection(self.collection_name)
                print("‚úÖ Old collection deleted")
            else:
                print("‚ö†Ô∏è No existing collection to delete")

            print("üì¶ Creating fresh collection...")
            self._create_collection()
            print("‚úÖ Collection created")
            start_id = 0
        else:  # mode == "add"
            if self.client.collection_exists(self.collection_name):
                collection_info = self.client.get_collection(self.collection_name)
                # points_count may be reported as None; treat that as empty.
                # NOTE(review): IDs continue from the point count, so if points
                # were ever deleted the new IDs can overwrite existing ones.
                start_id = collection_info.points_count or 0
            else:
                print("üì¶ Creating fresh collection...")
                self._create_collection()
                print("‚úÖ Collection created")
                start_id = 0
            print(f"\nüìä Adding to existing data, starting from ID: {start_id:,}")

        print(f"\n‚è≥ Uploading {len(embeddings):,} vectors to Qdrant...")

        batch_size = 100
        total_batches = (len(embeddings) + batch_size - 1) // batch_size

        for i in tqdm(range(0, len(embeddings), batch_size), total=total_batches, desc="Uploading"):
            batch_end = min(i + batch_size, len(embeddings))

            points = [
                PointStruct(
                    id=start_id + idx,
                    vector=embeddings[idx].tolist(),
                    payload=chunks[idx],
                )
                for idx in range(i, batch_end)
            ]

            self.client.upsert(
                collection_name=self.collection_name,
                points=points,
            )

        # Verify
        final_count = self.client.get_collection(self.collection_name).points_count
        print("\n‚úÖ Upload complete!")
        print(f"üìä Total vectors in Qdrant: {final_count:,}")

    def run_auto_pipeline(self, drive_folder_path, mode="refresh"):
        """Run every stage: find CSVs, clean, chunk, embed and upload.

        The run stops early (returning ``None``) when a stage yields nothing
        to work with.

        Args:
            drive_folder_path: Folder containing the CSV files.
            mode: Upload mode passed through to :meth:`upload_to_qdrant`.
        """
        print("="*60)
        print("üöÄ QDRANT AUTO-UPDATE PIPELINE")
        print("="*60)

        # Step 1: Auto-find all CSV files
        csv_files = self.find_all_csv_files(drive_folder_path)
        if not csv_files:
            print("‚ùå No CSV files found. Exiting.")
            return

        # Step 2: Load and filter CSVs
        df_clean = self.load_and_filter_csvs(csv_files)
        if df_clean is None or len(df_clean) == 0:
            print("‚ùå No data to process. Exiting.")
            return

        # Step 3: Create chunks
        chunks = self.create_chunks(df_clean)
        if not chunks:
            print("‚ùå No chunks created. Exiting.")
            return

        # Step 4: Generate embeddings
        embeddings = self.generate_embeddings(chunks)

        # Step 5: Upload to Qdrant
        self.upload_to_qdrant(embeddings, chunks, mode=mode)

        print("\n" + "="*60)
        print("‚úÖ PIPELINE COMPLETE!")
        print("="*60)
        print(f"üìä Your app now has access to {len(chunks):,} clinical trials")
        print("üîÑ No code changes needed - just reload your app!")


# Usage
if __name__ == "__main__":
    import getpass

    # Configuration. Each value can be overridden through an environment
    # variable of the same name, so the script does not have to be edited
    # to point it at a different folder or cluster.
    DRIVE_FOLDER = os.environ.get(
        "DRIVE_FOLDER",
        "/content/drive/MyDrive/LLM_Based_GenAI_Sem1/data",
    )
    QDRANT_URL = os.environ.get(
        "QDRANT_URL",
        "https://215ec69e-fa22-4f38-bcf3-941e73901a68.us-east4-0.gcp.cloud.qdrant.io",
    )

    print("üîê Qdrant Configuration")
    # Use QDRANT_API_KEY when it is set; otherwise prompt without echoing.
    qdrant_key = os.environ.get("QDRANT_API_KEY") or getpass.getpass("Enter Qdrant API Key: ")

    print("\nüìã Update Mode:")
    print("1. refresh - Delete all old data and upload fresh")
    print("2. add - Keep existing data and add new data")

    # Re-prompt on anything unrecognised so a typo never silently picks a
    # mode; "refresh" is destructive and "add" can create duplicates.
    mode_by_choice = {"1": "refresh", "refresh": "refresh", "2": "add", "add": "add"}
    mode = None
    while mode is None:
        mode_choice = input("Choose mode (1 or 2): ").strip().lower()
        mode = mode_by_choice.get(mode_choice)

    # Run pipeline
    pipeline = QdrantAutoPipeline(QDRANT_URL, qdrant_key)
    pipeline.run_auto_pipeline(DRIVE_FOLDER, mode=mode)


Overwriting update_qdrant_auto.py


In [None]:
# Run the script saved by the %%writefile cell above as a subprocess; it
# prompts for the Qdrant API key and the update mode (1 = refresh, 2 = add).
!python update_qdrant_auto.py

2025-11-30 16:26:56.042066: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764520016.065483   41049 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764520016.075744   41049 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1764520016.097559   41049 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764520016.097585   41049 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764520016.097591   41049 computation_placer.cc:177] computation placer alr