In [None]:
# Import libraries and initialize clients
import logging
from qdrant_client import QdrantClient
from qdrant_client import models
from zep_cloud.client import Zep
import openai
import os
import re
import uuid
import tiktoken
from typing import List, Dict, Any
from dotenv import load_dotenv

# Send INFO-and-above records to the notebook output with a timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Populate the process environment via python-dotenv's load_dotenv().
load_dotenv()

# Warn (without aborting) about every credential that is unset or empty.
required_vars = ["QDRANT_CLOUD_API_KEY", "OPENAI_API_KEY", "ZEP_API_KEY"]
for var in required_vars:
    if os.getenv(var):
        continue
    logger.warning(f"Missing environment variable: {var}")

# Build the three service clients used by the rest of the notebook.
# API keys are read from the environment; the Qdrant cluster URL is fixed here.
qdrant_client = QdrantClient(
    api_key=os.getenv("QDRANT_CLOUD_API_KEY"),
    url="https://3973cdf9-4ba6-40b1-ae92-b2f952f82fb9.europe-west3-0.gcp.cloud.qdrant.io:6333",
)
zep_client = Zep(api_key=os.getenv('ZEP_API_KEY'))
openai_client = openai.Client(api_key=os.getenv("OPENAI_API_KEY"))

In [None]:
# Setup Zep user (kept for potential future use).
# Try to register the fixed test user. Any failure is logged and the notebook
# carries on; the success message is only emitted when the call did not raise.
try:
    zep_client.user.add(
        user_id="user_1",  # do not change the id
        email="test@email.com",
        first_name="Test",
        last_name="User",
    )
except Exception as e:
    logger.error(f"Error creating Zep user: {e}")
else:
    logger.info("Zep user created or already exists")

# Show which collections are currently present on the Qdrant cluster.
collections = qdrant_client.get_collections()
logger.info(f"Existing collections: {[c.name for c in collections.collections]}")

In [None]:
# Define transcript loading and parsing functions
def load_transcripts(data_dir: str) -> List[str]:
    """Load every ``.txt`` transcript found directly inside ``data_dir``.

    Files are read in sorted file-name order. ``os.listdir`` returns entries
    in arbitrary order, so sorting keeps transcript indices (and the order of
    everything derived from them) stable across runs and machines.

    Args:
        data_dir: Directory to scan. Sub-directories are not searched.

    Returns:
        The decoded text of each ``.txt`` file, one string per file, in
        sorted file-name order. Empty list when no ``.txt`` file is present.

    Raises:
        OSError: If ``data_dir`` cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    transcripts = []

    for file_name in sorted(os.listdir(data_dir)):
        if not file_name.endswith(".txt"):
            continue
        with open(os.path.join(data_dir, file_name), 'r', encoding='utf-8') as f:
            transcripts.append(f.read())

    logger.info(f"Loaded {len(transcripts)} transcript files from {data_dir}")
    return transcripts


def parse_transcript(transcript: str) -> Dict[str, str]:
    """Extract title, source URL and markdown body from a raw transcript.

    The transcript is searched for three markers:

    * ``Title: <text>``        -> ``"title"`` (rest of that line)
    * ``URL Source: <text>``   -> ``"url"`` (rest of that line)
    * ``Markdown Content:``    -> ``"content"`` (everything after the marker)

    All three values are whitespace-stripped, so a trailing space or ``\\r``
    on the header lines does not leak into the title or URL.

    Args:
        transcript: Full text of one transcript file.

    Returns:
        A dict with keys ``"title"``, ``"url"`` and ``"content"``. Missing or
        blank values fall back to ``"Unknown Title"``, ``"No URL"`` and ``""``
        respectively. A warning is logged when the content is empty.
    """
    title_match = re.search(r"Title: (.+)", transcript)
    url_match = re.search(r"URL Source: (.+)", transcript)
    # DOTALL lets the body span multiple lines up to the end of the text.
    content_match = re.search(r"Markdown Content:(.+)", transcript, re.DOTALL)

    title = title_match.group(1).strip() if title_match else ""
    url = url_match.group(1).strip() if url_match else ""
    content = content_match.group(1).strip() if content_match else ""

    parsed_data = {
        # `or` also covers a capture that was whitespace only.
        "title": title or "Unknown Title",
        "url": url or "No URL",
        "content": content,
    }

    # Simple validation
    if not parsed_data["content"]:
        logger.warning(f"No content found for transcript: {parsed_data['title']}")

    return parsed_data

In [None]:
# Define tokenization and chunking functions
# Shared tiktoken tokenizer ("cl100k_base" encoding). It is a module-level
# global read by get_token_count_by_subtopic, chunk_text and
# parse_and_chunk_transcript_by_subtopic, so this cell must run before them.
# NOTE(review): presumably chosen to match the embedding model's tokenization -- confirm.
tokenizer = tiktoken.get_encoding("cl100k_base")

def get_token_count_by_subtopic(subtopics: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Measure how many tokens each subtopic's content encodes to.

    Args:
        subtopics: Dicts carrying a ``'subtopic'`` label and a ``'content'``
            value. Content may be a single string or a list of strings; a
            list is joined with single spaces before being tokenized.

    Returns:
        One ``{'subtopic': ..., 'token_count': ...}`` dict per input entry,
        in input order.
    """
    report = []
    for entry in subtopics:
        raw = entry['content']
        text = ' '.join(raw) if isinstance(raw, list) else raw
        n_tokens = len(tokenizer.encode(text))
        report.append({'subtopic': entry['subtopic'], 'token_count': n_tokens})
    return report


def chunk_text(text: str, max_tokens: int = 500, min_tokens: int = 300) -> List[str]:
    """Cut ``text`` into consecutive pieces of ``max_tokens`` tokens each.

    The text is tokenized with the shared ``tokenizer``, sliced into windows,
    and every window is decoded back to a string. If the final window is a
    partial one shorter than ``min_tokens`` and a preceding window exists, it
    is appended to that preceding window, so the last piece may exceed
    ``max_tokens``.

    Args:
        text: Text to split.
        max_tokens: Number of tokens per full window.
        min_tokens: Minimum size a trailing partial window must have to be
            kept as its own piece.

    Returns:
        Decoded pieces in original order; empty list for text with no tokens.
    """
    token_ids = tokenizer.encode(text)

    # A window always closes after at least one token, even for max_tokens <= 0.
    window = max(max_tokens, 1)
    pieces = [
        token_ids[start:start + window]
        for start in range(0, len(token_ids), window)
    ]

    # Only a partial trailing window is a merge candidate; full ones stay as-is.
    leftover = len(token_ids) % window
    if leftover and leftover < min_tokens and len(pieces) > 1:
        tail = pieces.pop()
        pieces[-1].extend(tail)

    return [tokenizer.decode(piece) for piece in pieces]

In [None]:
# Define transcript parsing and chunking by subtopic
def parse_and_chunk_transcript_by_subtopic(data):
    """Split a parsed transcript into per-subtopic chunks ready for embedding.

    The transcript body is divided at headings written as a title line
    followed by a line of dashes. Within each section, every line of the form
    ``Speaker [(HH:MM:SS)](https://youtube.com/watch?v=...&t=N) text`` is
    collected and rewritten as ``"Speaker: text \\n"``. The rewritten lines are
    joined; if the result is longer than 500 tokens it is split further with
    ``chunk_text``, otherwise it becomes a single chunk.

    Sections in which no dialogue line is recognised are skipped (with a
    warning) instead of producing a chunk with empty content, because empty
    strings cannot be embedded downstream. Text that precedes the first
    heading is not processed.

    Args:
        data: Dict with ``"content"`` (transcript body), ``"title"`` and
            ``"url"``, as produced by ``parse_transcript``.

    Returns:
        A list of chunk dicts, each with:

        * ``"subtopic"``: the stripped heading text
        * ``"content"``: the chunk text
        * ``"metadata"``: ``speakers`` (distinct speakers of the section, in
          order of first appearance), ``dialogue_count`` (dialogue lines in
          the whole section), ``title``, ``url`` and ``timestamp`` (markdown
          link of the section's first dialogue line)

        Empty list when ``data`` is falsy or has no content. A section whose
        processing raises is logged and skipped; other sections still return.
    """
    if not data or not data.get("content"):
        logger.warning("Invalid transcript data")
        return []

    transcript = data["content"]
    # Regex to find subtopics (e.g., Introduction, Education)
    subtopic_pattern = re.compile(r"^(.*)\n-+\n", re.MULTILINE)
    # Regex to capture speaker dialogue (e.g., Destiny [(00:00:00)]...)
    dialogue_pattern = re.compile(r"(?P<speaker>\w+)\s\[\((?P<timestamp>\d{2}:\d{2}:\d{2})\)\]\((?P<url>https:\/\/youtube\.com\/watch\?v=[^&]+&t=\d+)\)\s(?P<text>.+)")

    # Sections longer than this are split by chunk_text (its default
    # max_tokens is the same value).
    max_section_tokens = 500

    chunks = []
    # With one capturing group, split() yields
    # [preamble, heading, body, heading, body, ...]; headings sit at odd indices.
    sections = subtopic_pattern.split(transcript)

    for i in range(1, len(sections), 2):
        # Bound before the try so the error message below can always use it.
        subtopic = sections[i].strip()
        try:
            content_block = sections[i + 1] if i + 1 < len(sections) else ""

            # Find all dialogues within this subtopic
            dialogues = dialogue_pattern.findall(content_block)
            if not dialogues:
                logger.warning(f"Skipping subtopic {subtopic}: no dialogue found")
                continue

            # The section timestamp is the link of its first dialogue line.
            _, first_timestamp, first_url, _ = dialogues[0]
            tstamp = f"[({first_timestamp})]({first_url})"

            formatted_text = []
            speakers = []
            for speaker, _timestamp, _url, text in dialogues:
                if speaker not in speakers:
                    speakers.append(speaker)
                formatted_text.append(f"{speaker}: {text} \n")

            # Join the formatted text and split only when it is too long
            joined_text = ' '.join(formatted_text)
            if len(tokenizer.encode(joined_text)) > max_section_tokens:
                pieces = chunk_text(joined_text)
            else:
                pieces = [joined_text]

            for piece in pieces:
                chunks.append({
                    "subtopic": subtopic,
                    "content": piece,
                    "metadata": {
                        # Copy so chunks of one section do not share a list.
                        "speakers": list(speakers),
                        "dialogue_count": len(dialogues),
                        "title": data["title"],
                        "url": data["url"],
                        "timestamp": tstamp
                    }
                })
        except Exception as e:
            logger.error(f"Error processing subtopic {subtopic}: {e}")

    return chunks

In [None]:
# Define embedding and vector database functions
def get_embedding(text: str):
    """Embed a single text with OpenAI's ``text-embedding-3-small`` model.

    Args:
        text: The text to embed.

    Returns:
        The embedding of the first item in the API response, or ``None`` when
        the request (or reading its result) raises; the error is logged.
    """
    try:
        result = openai_client.embeddings.create(
            model="text-embedding-3-small",
            input=text,
        )
        first_item = result.data[0]
        return first_item.embedding
    except Exception as e:
        logger.error(f"Error getting embedding: {e}")
    return None

def get_embeddings_batch(texts, batch_size=20):
    """Embed many texts with ``text-embedding-3-small``, ``batch_size`` at a time.

    The result is always aligned with ``texts``: position ``k`` holds the
    embedding of ``texts[k]``, or ``None`` if the request containing it failed.
    A response that does not contain exactly one embedding per input is
    treated as a failed batch, because its items could no longer be matched
    to their texts reliably.

    Args:
        texts: Sequence of strings to embed.
        batch_size: Number of texts sent per API request; must be >= 1.

    Returns:
        A list with ``len(texts)`` entries (embedding or ``None``).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Without this check a
            negative value would silently return an empty list.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    all_embeddings = []

    # Process in batches to avoid rate limits
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        batch_number = i // batch_size + 1
        try:
            response = openai_client.embeddings.create(
                input=batch,
                model="text-embedding-3-small"
            )
            # Build the full list first so a failure part-way through cannot
            # leave a partial batch in the output.
            batch_embeddings = [item.embedding for item in response.data]
            if len(batch_embeddings) != len(batch):
                raise ValueError(
                    f"expected {len(batch)} embeddings, got {len(batch_embeddings)}"
                )
            all_embeddings.extend(batch_embeddings)
            logger.info(f"Processed embeddings batch {batch_number}")
        except Exception as e:
            logger.error(f"Error in batch {batch_number}: {e}")
            # Fill with None for failed embeddings to maintain alignment
            all_embeddings.extend([None] * len(batch))

    return all_embeddings

In [None]:
# Test embedding function (keep for interactive exploration)
# get_embedding returns None when the request fails, so check before calling
# len() on the result; otherwise this cell would stop with a TypeError.
test_embed = get_embedding("hello world")
if test_embed is None:
    logger.error("Embedding test failed: no vector returned (see the error logged above)")
else:
    print(f"Embedding vector size: {len(test_embed)}")

In [None]:
# Define collection creation function
def create_collections(collection_name, vector_size=1536):
    """Create ``collection_name`` in Qdrant, with payload indexes, if absent.

    When a collection of that name is already listed by the cluster the
    function logs that fact and returns without changing anything (existing
    indexes are not inspected). Otherwise it creates a cosine-distance vector
    collection with explicit HNSW settings and then indexes the ``subtopic``,
    ``speakers`` and ``title`` payload fields as keywords and ``content`` as
    full text.

    Args:
        collection_name: Name of the collection to create.
        vector_size: Dimensionality of the stored vectors.

    Returns:
        None.
    """
    existing = qdrant_client.get_collections().collections
    if collection_name in [c.name for c in existing]:
        logger.info(f"Collection {collection_name} already exists")
        return

    # Vector index settings, assembled inside-out for readability.
    hnsw_settings = models.HnswConfigDiff(
        m=16,
        ef_construct=100,
        full_scan_threshold=10000,
        max_indexing_threads=0,
    )
    vector_settings = models.VectorParams(
        size=vector_size,
        distance=models.Distance.COSINE,
        hnsw_config=hnsw_settings,
    )
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=vector_settings,
    )

    # Payload indexes: three keyword fields, then the full-text content field.
    index_plan = {
        name: models.PayloadSchemaType.KEYWORD
        for name in ("subtopic", "speakers", "title")
    }
    index_plan["content"] = models.PayloadSchemaType.TEXT

    for field_name, field_schema in index_plan.items():
        qdrant_client.create_payload_index(
            collection_name=collection_name,
            field_name=field_name,
            field_schema=field_schema,
        )

    logger.info(f"Created collection {collection_name} with indexes")

In [None]:
# Define batch upsert function for Qdrant
def _chunk_point_id(chunk):
    """Derive a stable UUID string from a chunk's own content and metadata."""
    metadata = chunk["metadata"]
    identity = "\x1f".join(
        str(part)
        for part in (
            metadata["url"],
            metadata["title"],
            chunk["subtopic"],
            metadata["timestamp"],
            chunk["content"],
        )
    )
    return str(uuid.uuid5(uuid.NAMESPACE_URL, identity))


def batch_upsert_to_qdrant(collection_name, chunks, batch_size=100):
    """Embed ``chunks`` and upsert them into a Qdrant collection in batches.

    Each batch of chunks is embedded with ``get_embeddings_batch``; chunks
    whose embedding came back as ``None`` are left out. The remaining chunks
    are sent in one ``upsert`` call per batch. A failed upsert is logged and
    the function moves on to the next batch.

    Point ids are derived from the chunk itself (url, title, subtopic,
    timestamp and content) rather than generated randomly. Running the upload
    again therefore overwrites the same points instead of adding duplicates.
    Two chunks that are identical in all of those fields map to one point.

    Args:
        collection_name: Target Qdrant collection.
        chunks: Chunk dicts with ``"subtopic"``, ``"content"`` and a
            ``"metadata"`` dict holding ``speakers``, ``title``, ``url`` and
            ``timestamp``.
        batch_size: Number of chunks per upsert call; must be >= 1.

    Returns:
        The number of points sent in upsert calls that did not raise.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Without this check a
            negative value would silently upload nothing.
        KeyError: If a chunk lacks one of the required keys.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    total_chunks = len(chunks)
    successful_uploads = 0

    # Process in batches
    for i in range(0, total_chunks, batch_size):
        batch = chunks[i:i+batch_size]
        batch_texts = [chunk["content"] for chunk in batch]

        # Get embeddings for the batch (aligned with batch; None marks failure)
        embeddings = get_embeddings_batch(batch_texts)

        # Create points for successful embeddings
        points = []
        for chunk, embedding in zip(batch, embeddings):
            if embedding is None:
                continue

            points.append(models.PointStruct(
                id=_chunk_point_id(chunk),
                vector=embedding,
                payload={
                    "subtopic": chunk["subtopic"],
                    "speakers": chunk["metadata"]["speakers"],
                    "content": chunk["content"],
                    "title": chunk["metadata"]["title"],
                    "url": chunk["metadata"]["url"],
                    "timestamp": chunk["metadata"]["timestamp"]
                }
            ))

        # Upsert points if any were created
        if points:
            try:
                qdrant_client.upsert(
                    collection_name=collection_name,
                    points=points
                )
                successful_uploads += len(points)
                logger.info(f"Uploaded {len(points)} points (batch {i//batch_size + 1})")
            except Exception as e:
                logger.error(f"Error uploading batch {i//batch_size + 1}: {e}")

    logger.info(f"Upload complete: {successful_uploads}/{total_chunks} chunks successfully uploaded")
    return successful_uploads

In [None]:
# Main processing - load transcripts and turn them into chunks
# Target collection and the vector width it is created with.
COLLECTION_NAME = "podcasts"
VECTOR_SIZE = 1536  # Size of OpenAI's text-embedding-3-small model output

# Make sure the collection exists before anything is uploaded.
create_collections(COLLECTION_NAME, VECTOR_SIZE)

# Read the raw transcript files.
data_dir = "../data"
transcripts = load_transcripts(data_dir)
logger.info(f"Processing {len(transcripts)} transcripts")

# Parse every transcript and gather its chunks; embedding happens later.
all_chunks = []
for idx, transcript in enumerate(transcripts):
    data = parse_transcript(transcript)
    if data["content"]:
        chunks = parse_and_chunk_transcript_by_subtopic(data)
        all_chunks += chunks
        logger.info(f"Transcript #{idx} produced {len(chunks)} chunks")
    else:
        # Nothing to chunk for a transcript without a body.
        logger.warning(f"Skipping transcript #{idx} due to empty content")

logger.info(f"Total chunks to process: {len(all_chunks)}")

In [None]:
# Upload chunks to Qdrant in batches
# BATCH_SIZE is the number of chunks sent per Qdrant upsert call. Inside each
# of those batches, batch_upsert_to_qdrant calls get_embeddings_batch with its
# default batch_size, so embedding requests are split independently of this value.
BATCH_SIZE = 50
# Number of points whose upsert call succeeded; read again by the verification cell.
successful_uploads = batch_upsert_to_qdrant(COLLECTION_NAME, all_chunks, BATCH_SIZE)

In [None]:
# Final count and verification
# Count how many chunks we processed in total
total_chunk_count = len(all_chunks)

# Get count of documents in the collection. Only the Qdrant call sits inside
# the try, so the error message below really means the lookup failed.
try:
    collection_info = qdrant_client.get_collection(COLLECTION_NAME)
    points_count = collection_info.points_count
except Exception as e:
    logger.error(f"Error getting collection info: {e}")
else:
    logger.info(f"Collection contains {points_count} points")

# Report the upload ratio separately; it does not depend on the lookup above.
# Guard the division: with no chunks the ratio is undefined, and the resulting
# ZeroDivisionError used to be logged as a collection-info error.
if total_chunk_count:
    logger.info(f"Upload efficiency: {successful_uploads}/{total_chunk_count} chunks ({successful_uploads/total_chunk_count:.1%})")
else:
    logger.info("Upload efficiency: n/a (no chunks were produced)")