# WillGPT → Qdrant Upload (Google Colab)

Generate BGE-M3 embeddings and upload to Qdrant using Colab GPU.

**Supports**: ChatGPT, Claude, and Claude Projects (multi-platform)

**Runtime**: GPU (T4 or better recommended)

## Setup Instructions:
1. Runtime → Change runtime type → GPU
2. Run cells in order
3. Upload your `merged_conversations.json` when prompted
4. Update the configuration in section 2 with your Qdrant credentials before running that cell

## 1. Install Dependencies

In [None]:
!pip install -q qdrant-client sentence-transformers tqdm FlagEmbedding

## 2. Configuration

In [None]:
# --- Qdrant connection (REQUIRED: replace both placeholders) ----------------
# The API key comes from the Qdrant Cloud dashboard; the URL is the cluster
# endpoint, e.g. https://xxxxx.aws.cloud.qdrant.io:6333
QDRANT_API_KEY = "YOUR_QDRANT_API_KEY_HERE"
QDRANT_URL = "YOUR_QDRANT_CLUSTER_URL_HERE"
COLLECTION_NAME = "will-gpt"

# Batch size passed to the embedding model when encoding texts.
BATCH_SIZE = 4

# --- Embedding settings ------------------------------------------------------
MODEL_NAME = "BAAI/bge-m3"
# One of: balanced, user_focused, minimal, full
EMBEDDING_MODE = "user_focused"

# Expected input file: merged_conversations.json (23,592 chunks)
#   - 11,880 ChatGPT chunks
#   - 11,690 Claude chunks
#   - 22 Claude Projects chunks

# Echo the active settings so they are visible in the cell output.
_config_summary = [
    "‚úÖ Configuration loaded",
    f"   Collection: {COLLECTION_NAME}",
    f"   Embedding mode: {EMBEDDING_MODE}",
    f"   Batch size: {BATCH_SIZE}",
    "   Expected platforms: ChatGPT, Claude, Claude Projects",
]
print(*_config_summary, sep="\n")

## 3. Upload merged_conversations.json

**Run the cell below and select your `data/processed/merged_conversations.json` file when the upload prompt appears.**

This file contains **all platforms merged together**:
- ChatGPT conversations
- Claude conversations  
- Claude Projects (user memory, project docs, custom instructions)

In [None]:
from google.colab import files
import json

# Ask the user for the export file. The result of files.upload() is used
# below as a mapping keyed by the uploaded file names.
print("üìÅ Upload merged_conversations.json:")
print("   (Contains ChatGPT + Claude + Claude Projects)")
uploaded = files.upload()

# A cancelled or empty upload leaves nothing to process: stop here with a
# clear message instead of an IndexError from the fallback branch below.
if not uploaded:
    raise RuntimeError(
        "No file was uploaded. Re-run this cell and select merged_conversations.json."
    )

# Prefer the known export names; otherwise fall back to the first uploaded file.
if 'merged_conversations.json' in uploaded:
    filename = 'merged_conversations.json'
    print("‚úÖ Merged conversations file uploaded")
elif 'processed_conversations.json' in uploaded:
    filename = 'processed_conversations.json'
    print("‚úÖ Processed conversations file uploaded")
else:
    filename = next(iter(uploaded))
    print(f"‚úÖ Using uploaded file: {filename}")

print(f"\nüìä File size: {len(uploaded[filename]) / (1024*1024):.1f} MB")

## 4. Load Data and Model

In [None]:
import json
import torch
from FlagEmbedding import BGEM3FlagModel
from tqdm.auto import tqdm

def _extract_interpretation_fields(platform, interp):
    """Map one platform's AI-interpretation record to (about_user, about_model).

    Args:
        platform: Platform label of the chunk ('chatgpt', 'claude',
            'claude-projects', or anything else).
        interp: The chunk's interpretation dict; None or empty is accepted.

    Returns:
        A 2-tuple ``(about_user, about_model)``. Both are empty strings for an
        unrecognized platform.
    """
    interp = interp or {}
    if platform == 'chatgpt':
        # ChatGPT nests both fields under 'user_context_message_data'.
        user_context = interp.get('user_context_message_data') or {}
        return (
            user_context.get('about_user_message', ''),
            user_context.get('about_model_message', ''),
        )
    if platform == 'claude':
        return interp.get('user_model', ''), interp.get('thinking', '')
    if platform == 'claude-projects':
        # Only emit the "Project: ..." label when a project name exists, so a
        # missing name does not become a non-empty placeholder string.
        parent_project = interp.get('parent_project', '')
        about_user = f"Project: {parent_project}" if parent_project else ''
        return about_user, interp.get('content_type', '')
    return '', ''


# Load conversations (JSON is read as UTF-8 regardless of the locale default)
print("Loading conversations...")
with open(filename, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Optimized format: interpretations are stored once under 'interpretations'
# and referenced from each chunk via 'ai_interpretation_ref'.
interpretations_store = data.get('interpretations') or {}
chunks = data['chunks']

# Restore interpretations from references and derive the platform-specific
# about_user / about_model fields used by later cells.
for chunk in chunks:
    if 'ai_interpretation_ref' in chunk:
        interp_ref = chunk['ai_interpretation_ref']
        chunk['ai_interpretations'] = interpretations_store.get(interp_ref, {})

    chunk['about_user'], chunk['about_model'] = _extract_interpretation_fields(
        chunk.get('platform', 'unknown'),
        chunk.get('ai_interpretations'),
    )

print(f"‚úÖ Loaded {len(chunks)} chunks")
if interpretations_store:
    print(f"   Unique interpretations: {len(interpretations_store)}")

# Count chunks per platform
platforms = {}
for chunk in chunks:
    platform = chunk.get('platform', 'unknown')
    platforms[platform] = platforms.get(platform, 0) + 1

print(f"\nPlatform distribution:")
for platform, count in sorted(platforms.items()):
    print(f"   {platform}: {count:,} chunks")

# Check GPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"\nDevice: {device}")
if device == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# Load BGE-M3 with FlagEmbedding; half precision is requested only on GPU.
print(f"\nLoading {MODEL_NAME}...")
use_fp16 = device == 'cuda'
model = BGEM3FlagModel(MODEL_NAME, use_fp16=use_fp16, device=device)
vector_size = 1024  # BGE-M3 dense vector dimension
print(f"‚úÖ Model loaded (vector size: {vector_size}, FP16: {use_fp16})")

## 5. Generate Embedding Texts

In [None]:
def to_embedding_text(chunk, mode="balanced", max_assistant_chars=3000):
    """Build the text that gets embedded for one conversation chunk.

    Args:
        chunk: Mapping describing one chunk. Keys read (all optional):
            'conversation_title', 'platform', 'user_message',
            'assistant_message', 'about_user', 'about_model'.
        mode: Controls which parts are included.
            - "minimal": return only the user message when there is one.
            - "balanced": include the assistant response, shortened to its
              head and tail when longer than ``max_assistant_chars``.
            - "full": include the whole assistant response.
            - any other value (e.g. "user_focused"): omit the assistant
              response.
        max_assistant_chars: Length above which the assistant response is
            shortened in "balanced" mode; half of this budget is kept from
            each end.

    Returns:
        The selected parts joined by blank lines (may be an empty string).
    """
    parts = []

    # Topic context (the placeholder title "Untitled" carries no information)
    title = chunk.get('conversation_title')
    if title and title != "Untitled":
        parts.append(f"[TOPIC: {title}]")

    # Platform marker so Claude Projects content is easier to find
    if chunk.get('platform', 'unknown') == 'claude-projects':
        parts.append("[CLAUDE-PROJECTS]")

    # User message
    user_message = chunk.get('user_message')
    if user_message:
        if mode == "minimal":
            return user_message
        parts.append(user_message)

    # Assistant response
    assistant = chunk.get('assistant_message')
    if assistant and mode in ("balanced", "full"):
        if mode == "balanced" and len(assistant) > max_assistant_chars:
            half = max(max_assistant_chars, 0) // 2
            # assistant[-0:] is the WHOLE string, so a zero-sized half must
            # yield an empty tail explicitly.
            tail = assistant[-half:] if half > 0 else ""
            assistant = assistant[:half] + "\n[...]\n" + tail
        parts.append(f"[RESPONSE] {assistant}")

    # AI interpretations (expected to be pre-extracted onto the chunk)
    if chunk.get('about_user'):
        parts.append(f"[AI_UNDERSTANDING] {chunk['about_user']}")
    if chunk.get('about_model'):
        parts.append(f"[AI_NOTES] {chunk['about_model']}")

    return '\n\n'.join(parts)

# Generate one embedding text per chunk (same order as `chunks`)
print("Generating embedding texts for all platforms...")
embedding_texts = [to_embedding_text(chunk, mode=EMBEDDING_MODE) for chunk in tqdm(chunks)]
print(f"‚úÖ Generated {len(embedding_texts)} embedding texts")

# Show stats by platform. The texts generated above are grouped directly,
# rather than rebuilding every text a second time.
print(f"\nEmbedding text stats by platform:")
platforms = {}
for chunk, text in zip(chunks, embedding_texts):
    platforms.setdefault(chunk.get('platform', 'unknown'), []).append(text)

for platform, texts in sorted(platforms.items()):
    avg_len = sum(len(t) for t in texts) / len(texts)
    print(f"  {platform}: {len(texts):,} texts, avg {avg_len:.0f} chars")

# Guard the overall average so an empty chunk list reports 0 instead of
# raising ZeroDivisionError.
overall_avg = sum(len(t) for t in embedding_texts) / len(embedding_texts) if embedding_texts else 0
print(f"\nOverall avg length: {overall_avg:.0f} chars")

## 6. Generate Embeddings (Dense + Sparse)

In [None]:
# Encode every embedding text in a single call, requesting dense vectors and
# sparse lexical weights; ColBERT vectors are switched off.
print(f"Generating embeddings with hybrid search (batch size: {BATCH_SIZE})...")
print("Mode: Dense + Sparse (lexical weights)")

output = model.encode(
    embedding_texts,
    batch_size=BATCH_SIZE,
    return_dense=True,
    return_sparse=True,
    return_colbert_vecs=False,
)

# 'dense_vecs' is array-like (its .shape is printed below); 'lexical_weights'
# is a sequence of per-text mappings (the first one is sampled below).
dense_embeddings = output['dense_vecs']
sparse_embeddings = output['lexical_weights']

print(f"‚úÖ Generated {len(dense_embeddings)} dense + sparse embedding pairs")
print(f"Dense shape: {dense_embeddings.shape}")
print(f"Sparse vectors: {len(sparse_embeddings)} lexical weight mappings")

# Peek at the first sparse vector, when there is a non-empty one
first_sparse = sparse_embeddings[0] if sparse_embeddings else None
if first_sparse:
    example_tokens = list(first_sparse.items())[:5]
    print(f"\nExample sparse vector (first 5 tokens): {example_tokens}")

## 7. Connect to Qdrant

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, SparseVectorParams

# Build the client from the credentials set in the configuration cell
# (gRPC is explicitly not preferred).
print("Connecting to Qdrant...")
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY, prefer_grpc=False)

# List the collections; the success message is printed only after this
# request has returned.
collections = client.get_collections()
print("‚úÖ Connected!")
existing_names = [c.name for c in collections.collections]
print(f"Existing collections: {existing_names}")

## 8. Create Collection

In [None]:
# Re-read the collection list so that re-running this cell (for example after
# a delete) does not act on the listing captured when the client connected.
collections = client.get_collections()
collection_exists = any(c.name == COLLECTION_NAME for c in collections.collections)

if collection_exists:
    print(f"‚úÖ Collection '{COLLECTION_NAME}' already exists!")

    # Show how many points the existing collection currently holds
    collection_info = client.get_collection(COLLECTION_NAME)
    print(f"   Current points: {collection_info.points_count:,}")
    print(f"   Vectors: dense (1024 dims) + sparse")

    # Only an explicit "yes" (case-insensitive, surrounding whitespace
    # ignored) deletes the collection; anything else keeps it.
    answer = input("\nDelete and recreate? (yes/no): ")
    if answer.strip().lower() == 'yes':
        client.delete_collection(COLLECTION_NAME)
        print(f"‚úÖ Deleted existing collection")
        collection_exists = False
    else:
        print("‚úÖ Keeping existing collection (will update/add points)")

if not collection_exists:
    print(f"\nCreating collection '{COLLECTION_NAME}' with hybrid search...")

    # Named vectors: "dense" (cosine distance) and "sparse"; the upload cell
    # writes points under these same two names.
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config={
            "dense": VectorParams(
                size=vector_size,
                distance=Distance.COSINE,
            )
        },
        sparse_vectors_config={
            "sparse": SparseVectorParams()
        }
    )
    print(f"‚úÖ Collection created with dense + sparse vectors")

## 9. Upload to Qdrant

In [None]:
from qdrant_client.models import SparseVector
from datetime import datetime

UPLOAD_BATCH_SIZE = 100  # points sent per upsert request


def _iso_to_unix(timestamp_str):
    """Convert an ISO-format timestamp string to a UNIX timestamp (float).

    Returns None when the value is empty or cannot be parsed; a warning is
    printed for values that fail to parse.
    """
    if not timestamp_str:
        return None
    try:
        # A trailing 'Z' is rewritten to an explicit UTC offset before parsing.
        parsed = datetime.fromisoformat(timestamp_str.replace('Z', '+00:00'))
        return parsed.timestamp()
    except (AttributeError, TypeError, ValueError, OverflowError, OSError) as e:
        print(f"Warning: Could not parse timestamp '{timestamp_str}': {e}")
        return None


def _build_payload(chunk):
    """Assemble the Qdrant payload dict for one chunk."""
    timestamp_str = chunk.get('timestamp')

    # Fields common to all platforms
    payload = {
        "conversation_id": chunk.get('conversation_id'),
        "platform": chunk.get('platform'),
        "timestamp": _iso_to_unix(timestamp_str),  # FLOAT for range filtering
        "timestamp_iso": timestamp_str,  # Keep ISO string for display
        "conversation_title": chunk.get('conversation_title'),
        "turn_number": chunk.get('turn_number', 0),
        "user_message": chunk.get('user_message'),
        "assistant_message": chunk.get('assistant_message'),
        "assistant_model": chunk.get('assistant_model'),
        "user_message_type": chunk.get('user_message_type'),
        "assistant_message_type": chunk.get('assistant_message_type'),
    }

    # Optional fields, added only when present and non-empty
    if chunk.get('system_context'):
        payload['system_context'] = chunk['system_context']

    if chunk.get('tool_usage'):
        payload['tool_usage'] = chunk['tool_usage']
    payload['has_tool_usage'] = bool(chunk.get('tool_usage'))

    if chunk.get('has_branches'):
        payload['has_branches'] = chunk['has_branches']

    # Extracted interpretation fields, when they exist
    if chunk.get('about_user'):
        payload['about_user'] = chunk['about_user']
    if chunk.get('about_model'):
        payload['about_model'] = chunk['about_model']

    # The flag reflects the original ai_interpretations dict, so it is set
    # the same way for every platform.
    payload['has_interpretations'] = bool(chunk.get('ai_interpretations'))
    return payload


def _to_sparse_vector(sparse_weights):
    """Convert one lexical-weights mapping into a Qdrant SparseVector.

    Keys are cast to int and weights to float explicitly.
    NOTE(review): BGE-M3 lexical weights presumably arrive with string token
    ids and NumPy scalar weights - confirm against the installed
    FlagEmbedding version.
    """
    if not sparse_weights:
        return SparseVector(indices=[], values=[])
    return SparseVector(
        indices=[int(token_id) for token_id in sparse_weights],
        values=[float(weight) for weight in sparse_weights.values()],
    )


print(f"\nUploading {len(dense_embeddings)} points with hybrid vectors to Qdrant...")
print(f"Platforms: ChatGPT, Claude, Claude Projects")

# Create points with both dense and sparse vectors; the point id is the
# chunk's position in `chunks`.
points = []
points_uploaded = 0
rows = zip(chunks, dense_embeddings, sparse_embeddings)
for idx, (chunk, dense_emb, sparse_weights) in enumerate(tqdm(rows, total=len(chunks))):
    points.append(
        PointStruct(
            id=idx,
            vector={
                "dense": dense_emb.tolist(),
                "sparse": _to_sparse_vector(sparse_weights),
            },
            payload=_build_payload(chunk),
        )
    )

    # Send a full batch
    if len(points) >= UPLOAD_BATCH_SIZE:
        client.upsert(collection_name=COLLECTION_NAME, points=points)
        points_uploaded += len(points)
        points = []

# Flush whatever is left after the loop. Checking the loop index against
# len(chunks) would drop the last partial batch whenever the zipped
# sequences are shorter than `chunks`.
if points:
    client.upsert(collection_name=COLLECTION_NAME, points=points)
    points_uploaded += len(points)
    points = []

print("‚úÖ Upload complete with hybrid (dense + sparse) vectors!")
print(f"   Total points uploaded: {points_uploaded:,}")

## 10. Verify

In [None]:
# Fetch the collection's current state for the summary below
collection_info = client.get_collection(COLLECTION_NAME)

# Tally the local chunks per platform (chunks without one count as 'unknown')
platform_counts = {}
for chunk in chunks:
    name = chunk.get('platform', 'unknown')
    platform_counts[name] = platform_counts.get(name, 0) + 1

banner = "=" * 70
print(banner)
print("‚úÖ MULTI-PLATFORM HYBRID SEARCH UPLOAD COMPLETE!")
print(banner)
print(f"Collection: {COLLECTION_NAME}")
print(f"Total points: {collection_info.points_count:,}")
print(f"Vector size: {vector_size}")
print(f"Embedding mode: {EMBEDDING_MODE}")

print("\nPlatform breakdown:")
for platform in sorted(platform_counts):
    print(f"  {platform}: {platform_counts[platform]:,} chunks")

# Fixed closing lines of the report
closing_lines = (
    "\nHybrid Search Enabled:",
    "  ‚úÖ Dense vectors (semantic similarity)",
    "  ‚úÖ Sparse vectors (lexical/keyword matching)",
    "\nüîç Ready for cross-platform hybrid search!",
    "   - ChatGPT conversations",
    "   - Claude conversations",
    "   - Claude Projects (user memory, docs, custom instructions)",
)
for line in closing_lines:
    print(line)