# WillGPT → Qdrant Upload (Google Colab)

Generate BGE-M3 embeddings and upload to Qdrant using Colab GPU.

**Runtime**: GPU (T4 or better recommended)

## Setup Instructions:
1. Runtime → Change runtime type → GPU
2. Run cells in order
3. Upload your `processed_conversations.json` when prompted
4. Update the configuration in section 2 with your Qdrant credentials before running that cell

## 1. Install Dependencies

In [None]:
!pip install -q qdrant-client sentence-transformers tqdm

## 2. Configuration

In [None]:
# Qdrant Configuration
# NOTE: avoid sharing or committing this notebook with real credentials filled in.
QDRANT_API_KEY = "YOUR_API_KEY_HERE"  # Replace with your API key
QDRANT_URL = "YOUR_QDRANT_URL_HERE"   # Replace with your Qdrant URL
COLLECTION_NAME = "will-gpt"  # Qdrant collection the points are written to

# Embedding Configuration
MODEL_NAME = "BAAI/bge-m3"
VALID_EMBEDDING_MODES = ("balanced", "user_focused", "minimal", "full")
EMBEDDING_MODE = "user_focused"  # Must be one of VALID_EMBEDDING_MODES
BATCH_SIZE = 32  # GPU can handle larger batches

# Fail fast on a mistyped mode: to_embedding_text() does not reject unknown
# modes, it just leaves the assistant response out, so a typo would go unnoticed.
if EMBEDDING_MODE not in VALID_EMBEDDING_MODES:
    raise ValueError(
        f"Unknown EMBEDDING_MODE {EMBEDDING_MODE!r}; "
        f"expected one of {', '.join(VALID_EMBEDDING_MODES)}"
    )

# Warn now rather than after the embeddings have been computed: the credentials
# are first used when the notebook connects to Qdrant.
if QDRANT_API_KEY == "YOUR_API_KEY_HERE" or QDRANT_URL == "YOUR_QDRANT_URL_HERE":
    print("WARNING: QDRANT_API_KEY / QDRANT_URL still contain placeholder values")

print("‚úÖ Configuration loaded")
print(f"   Embedding mode: {EMBEDDING_MODE}")
print(f"   Batch size: {BATCH_SIZE}")

## 3. Upload processed_conversations.json

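**Run the cell below and select your `data/processed_conversations.json` file in the upload prompt it opens.** The check in that cell only recognizes files chosen through this prompt, not files added through the file browser in the left sidebar.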

In [None]:
from google.colab import files
import json

# Open Colab's upload prompt and keep whatever it returns for the check below.
print("Upload processed_conversations.json:")
uploaded = files.upload()

# Report whether the expected file name is among the uploaded entries.
# This only prints a status line; it does not stop the notebook on failure.
upload_ok = 'processed_conversations.json' in uploaded
print(
    "‚úÖ File uploaded successfully"
    if upload_ok
    else "‚ùå Please upload processed_conversations.json"
)

## 4. Load Data and Model

In [None]:
import json
import torch
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

# Load conversations
print("Loading conversations...")
# Read as UTF-8 explicitly so decoding does not depend on the runtime's
# default locale encoding.
with open('processed_conversations.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Handle optimized format with deduplicated interpretations:
# 'interpretations' is looked up by the reference stored on each chunk.
# `or {}` also covers a JSON null, which .get()'s default would not.
interpretations_store = data.get('interpretations') or {}
chunks = data['chunks']

# Restore interpretations from references
for chunk in chunks:
    if 'ai_interpretation_ref' in chunk:
        interp_ref = chunk['ai_interpretation_ref']
        # A missing or null entry is treated as "no interpretation".
        interp = interpretations_store.get(interp_ref) or {}

        # Extract about_user and about_model for easier access; both fall
        # back to '' so later truthiness checks simply skip them.
        user_context = interp.get('user_context_message_data') or {}
        chunk['about_user'] = user_context.get('about_user_message', '')
        chunk['about_model'] = user_context.get('about_model_message', '')

print(f"‚úÖ Loaded {len(chunks)} chunks")
print(f"   Unique interpretations: {len(interpretations_store)}")

# Pick the compute device: CUDA when torch reports it as available, else CPU.
has_gpu = torch.cuda.is_available()
device = 'cuda' if has_gpu else 'cpu'
print(f"\nDevice: {device}")
if has_gpu:
    # Report the name and total memory (in GB, base 1e9) of device index 0.
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {gpu_props.total_memory / 1e9:.2f} GB")

# Load the embedding model named by MODEL_NAME onto the selected device.
print(f"\nLoading {MODEL_NAME}...")
model = SentenceTransformer(
    MODEL_NAME,
    device=device,
)

# The model's output dimension is kept so the Qdrant collection can be
# created with a matching vector size later in the notebook.
vector_size = model.get_sentence_embedding_dimension()
print(f"‚úÖ Model loaded (vector size: {vector_size})")

## 5. Generate Embedding Texts

In [None]:
def to_embedding_text(chunk: dict, mode: str = "balanced", max_assistant_chars: int = 3000) -> str:
    """Build the text that gets embedded for one conversation chunk.

    Sections are joined with a blank line, in this order: topic tag, user
    message, assistant response, AI interpretations. Missing or empty fields
    are skipped.

    Args:
        chunk: Mapping for one turn. Keys read here: ``conversation_title``,
            ``user_message``, ``assistant_message``, ``about_user`` and
            ``about_model``.
        mode: ``"minimal"`` returns the user message alone when the chunk has
            one (otherwise it falls through to the remaining sections).
            ``"balanced"`` and ``"full"`` include the assistant response;
            ``"balanced"`` shortens long responses. Any other value, such as
            ``"user_focused"``, leaves the assistant response out.
        max_assistant_chars: In ``"balanced"`` mode, a response longer than
            this is reduced to its first and last ``max_assistant_chars // 2``
            characters around a ``[...]`` marker. Negative values are treated
            as 0.

    Returns:
        The assembled text, or ``""`` when the chunk has no usable fields.
    """
    parts = []

    # Topic ("Untitled" is treated as having no title)
    title = chunk.get('conversation_title')
    if title and title != "Untitled":
        parts.append(f"[TOPIC: {title}]")

    # User message
    user_message = chunk.get('user_message')
    if user_message:
        if mode == "minimal":
            return user_message
        parts.append(user_message)

    # Assistant
    assistant = chunk.get('assistant_message')
    if assistant and mode in ("balanced", "full"):
        if mode == "balanced" and len(assistant) > max_assistant_chars:
            half = max(max_assistant_chars, 0) // 2
            # Take the tail from an explicit start index: `assistant[-half:]`
            # with half == 0 would return the whole string, not an empty one.
            tail_start = len(assistant) - half
            assistant = assistant[:half] + "\n[...]\n" + assistant[tail_start:]
        parts.append(f"[RESPONSE] {assistant}")

    # AI interpretations
    if chunk.get('about_user'):
        parts.append(f"[AI_UNDERSTANDING] {chunk['about_user']}")
    if chunk.get('about_model'):
        parts.append(f"[AI_NOTES] {chunk['about_model']}")

    return '\n\n'.join(parts)

# Generate texts: one embedding string per chunk, in chunk order, so the
# texts stay index-aligned with `chunks`.
print("Generating embedding texts...")
embedding_texts = [to_embedding_text(chunk, mode=EMBEDDING_MODE) for chunk in tqdm(chunks)]
print(f"‚úÖ Generated {len(embedding_texts)} texts")

# Guard the average: with no chunks the division would raise ZeroDivisionError.
avg_length = (
    sum(len(t) for t in embedding_texts) / len(embedding_texts)
    if embedding_texts
    else 0
)
print(f"\nAvg length: {avg_length:.0f} chars")

## 6. Generate Embeddings

In [None]:
print(f"Generating embeddings (batch size: {BATCH_SIZE})...")

# Encode every text in batches, then move the resulting tensor to the CPU and
# convert it to a NumPy array for the upload step.
embeddings = model.encode(
    embedding_texts,
    batch_size=BATCH_SIZE,
    convert_to_tensor=True,
    show_progress_bar=True,
).cpu().numpy()

print(f"‚úÖ Generated {len(embeddings)} embeddings")
print(f"Shape: {embeddings.shape}")

## 7. Connect to Qdrant

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

print("Connecting to Qdrant...")
# Build the client from the notebook configuration; gRPC is not preferred.
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY, prefer_grpc=False)

# Listing collections doubles as a connectivity check; the result is kept in
# `collections` for the collection-creation cell.
collections = client.get_collections()
existing_names = [c.name for c in collections.collections]
print(f"‚úÖ Connected!")
print(f"Existing collections: {existing_names}")

## 8. Create Collection

In [None]:
# Check if collection exists (based on the listing fetched when connecting)
collection_exists = any(c.name == COLLECTION_NAME for c in collections.collections)

# Decide once whether the existing collection is replaced, so the creation
# step below does not need to re-read the raw answer.
recreate = False
if collection_exists:
    print(f"‚ö†Ô∏è  Collection '{COLLECTION_NAME}' exists!")
    delete = input("Delete and recreate? (yes/no): ")
    # Ignore surrounding whitespace and case; only an explicit "yes" deletes.
    recreate = delete.strip().lower() == 'yes'
    if recreate:
        client.delete_collection(COLLECTION_NAME)
        print(f"Deleted existing collection")
    else:
        print("Keeping existing collection (will update points)")

if recreate or not collection_exists:
    print(f"\nCreating collection '{COLLECTION_NAME}'...")
    # One named vector, "dense", sized to the embedding model's output and
    # compared by cosine distance. Uploaded points must use the same name.
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config={
            "dense": VectorParams(
                size=vector_size,
                distance=Distance.COSINE,
            )
        },
    )
    print(f"‚úÖ Collection created")

## 9. Upload to Qdrant

In [None]:
print(f"\nUploading {len(embeddings)} points to Qdrant...")

# Number of points sent per upsert request.
UPLOAD_BATCH_SIZE = 100

# Create points: chunk i is paired with embedding i and stored under id i,
# so re-running against an existing collection overwrites by position.
points = []
for idx, (chunk, embedding) in enumerate(tqdm(zip(chunks, embeddings), total=len(chunks))):
    payload = {
        "conversation_id": chunk.get('conversation_id'),
        "platform": chunk.get('platform'),
        "timestamp": chunk.get('timestamp'),
        "conversation_title": chunk.get('conversation_title'),
        "turn_number": chunk.get('turn_number'),
        "user_message": chunk.get('user_message'),
        "assistant_message": chunk.get('assistant_message'),
        "assistant_model": chunk.get('assistant_model'),
        "has_interpretations": chunk.get('has_interpretations', False),
    }

    # Add interpretations if present (empty strings are left out)
    if chunk.get('about_user'):
        payload['about_user'] = chunk['about_user']
    if chunk.get('about_model'):
        payload['about_model'] = chunk['about_model']

    point = PointStruct(
        id=idx,
        vector={"dense": embedding.tolist()},
        payload=payload
    )
    points.append(point)

    # Upload in batches
    if len(points) >= UPLOAD_BATCH_SIZE:
        client.upsert(
            collection_name=COLLECTION_NAME,
            points=points
        )
        points = []

# Flush whatever is left after the loop. Doing this here, rather than testing
# for the last chunk index inside the loop, still sends the final partial
# batch when zip() stops early because there are fewer embeddings than chunks.
if points:
    client.upsert(
        collection_name=COLLECTION_NAME,
        points=points
    )
    points = []

print("‚úÖ Upload complete!")

## 10. Verify

In [None]:
# Fetch the collection's current state to report the stored point count.
collection_info = client.get_collection(COLLECTION_NAME)

banner = "=" * 70
print(banner)
print("‚úÖ UPLOAD COMPLETE!")
print(banner)

# Summary of what was uploaded and with which settings.
for summary_line in (
    f"Collection: {COLLECTION_NAME}",
    f"Total points: {collection_info.points_count}",
    f"Vector size: {vector_size}",
    f"Embedding mode: {EMBEDDING_MODE}",
):
    print(summary_line)

print(f"\nüîç Ready for hybrid search!")