In [None]:

import json
import logging
import os
import uuid
from datetime import datetime
from typing import List, Dict, Any

from dotenv import load_dotenv

In [None]:
# Load environment variables from a .env file so that the os.getenv() lookups
# made later in this notebook (API keys, Qdrant connection settings, input file
# name, batch size) can see them. Run this cell before building the pipeline.
load_dotenv()

In [None]:
# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document

# Qdrant imports
from qdrant_client import QdrantClient
from qdrant_client.models import (
    Distance, 
    VectorParams, 
    PointStruct,
    CollectionStatus,
    OptimizersConfigDiff,
    HnswConfigDiff
)

In [None]:
# Configure the root logger once (timestamp - level - message, INFO and above)
# and expose a module-level logger for the rest of the notebook.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


In [None]:
class JobChunkingEmbeddingPipeline:
    """Chunk job postings, embed the chunks with OpenAI and store them in Qdrant.

    Each job dictionary is turned into three text chunks (full job, skills,
    responsibilities). The chunks are embedded in batches and upserted into a
    single Qdrant collection together with filterable metadata.
    """

    # Output dimensions of the OpenAI embedding models known to this pipeline.
    # Models not listed here are probed once at start-up (see _resolve_vector_size).
    _KNOWN_EMBEDDING_DIMENSIONS = {
        "text-embedding-3-small": 1536,
        "text-embedding-3-large": 3072,
        "text-embedding-ada-002": 1536,
    }

    def __init__(self, 
                 openai_api_key: str = None,
                 qdrant_url: str = None,
                 qdrant_port: int = None,
                 qdrant_api_key: str = None,
                 collection_name: str = "jobs",
                 embedding_model: str = "text-embedding-3-small"):
        """
        Initialize the pipeline with necessary configurations

        Args:
            openai_api_key: OpenAI key; falls back to the OPENAI_API_KEY env var.
            qdrant_url: Qdrant host name or full URL; falls back to QDRANT_URL,
                then "localhost".
            qdrant_port: Qdrant port; falls back to QDRANT_PORT, then 6333.
            qdrant_api_key: Optional key; when set the client connects by URL
                with gRPC preferred (Qdrant Cloud style).
            collection_name: Name of the collection to create/use.
            embedding_model: OpenAI embedding model name.

        Raises:
            ValueError: If no OpenAI API key is supplied or found in the environment.
        """
        # Use provided values or fall back to environment variables
        openai_api_key = openai_api_key or os.getenv("OPENAI_API_KEY")
        qdrant_url = qdrant_url or os.getenv("QDRANT_URL", "localhost")
        qdrant_port = qdrant_port or int(os.getenv("QDRANT_PORT", "6333"))
        qdrant_api_key = qdrant_api_key or os.getenv("QDRANT_API_KEY")
        
        if not openai_api_key:
            raise ValueError("OpenAI API key not found. Please set OPENAI_API_KEY in .env file")
        
        self.embedding_model = embedding_model
        self.embeddings = OpenAIEmbeddings(
            openai_api_key=openai_api_key,
            model=embedding_model
        )
        
        # Initialize Qdrant client
        if qdrant_api_key:
            # For Qdrant Cloud
            self.client = QdrantClient(
                url=qdrant_url,
                api_key=qdrant_api_key,
                prefer_grpc=True
            )
            logger.info(f"Connected to Qdrant Cloud at {qdrant_url}")
        elif "://" in qdrant_url:
            # A value such as "http://localhost:6333" carries a scheme and must
            # be passed as `url`; the `host` parameter rejects it.
            self.client = QdrantClient(url=qdrant_url, port=qdrant_port)
            logger.info(f"Connected to Qdrant at {qdrant_url}")
        else:
            # For local Qdrant addressed by bare host name
            self.client = QdrantClient(host=qdrant_url, port=qdrant_port)
            logger.info(f"Connected to local Qdrant at {qdrant_url}:{qdrant_port}")
        
        self.collection_name = collection_name
        # The collection's vector size must match the embedding model's output.
        self.vector_size = self._resolve_vector_size(embedding_model)
        
        # Initialize collections
        self._setup_collections()

    def _resolve_vector_size(self, embedding_model: str) -> int:
        """Return the embedding dimension for ``embedding_model``.

        Known models are looked up in a table; for any other model a single
        query is embedded and the length of the returned vector is used.
        """
        known_size = self._KNOWN_EMBEDDING_DIMENSIONS.get(embedding_model)
        if known_size is not None:
            return known_size
        logger.info(f"Unknown embedding model '{embedding_model}', probing vector size...")
        return len(self.embeddings.embed_query("dimension probe"))

    @staticmethod
    def _point_id(chunk_id: str) -> str:
        """Map a readable chunk id to a deterministic UUID string.

        Qdrant only accepts unsigned integers or UUIDs as point ids, so ids
        such as "job_42_full" cannot be used directly. uuid5 is deterministic,
        which keeps re-runs idempotent: the same chunk overwrites the same point.
        """
        return str(uuid.uuid5(uuid.NAMESPACE_URL, str(chunk_id)))

    @staticmethod
    def _as_dict(value: Any) -> Dict[str, Any]:
        """Return ``value`` if it is a dict, otherwise an empty dict."""
        return value if isinstance(value, dict) else {}

    @staticmethod
    def _first_present(mapping: Dict[str, Any], keys, default: Any = None) -> Any:
        """Return the first value in ``mapping`` under ``keys`` that is actually set.

        ``None`` and empty strings/lists/dicts count as missing; other falsy
        values such as ``0`` are kept (0 years of experience is a real value).
        """
        for key in keys:
            value = mapping.get(key)
            if value is None:
                continue
            if isinstance(value, (str, list, dict)) and not value:
                continue
            return value
        return default

    @staticmethod
    def _join_items(value: Any, separator: str) -> Any:
        """Join list items into one string; pass non-list values through unchanged."""
        if isinstance(value, list):
            # str() guards against non-string entries (numbers, nested values).
            return separator.join(str(item) for item in value)
        return value
        
    def _setup_collections(self):
        """Create Qdrant collections if they don't exist"""
        collections = self.client.get_collections().collections
        existing_collections = [col.name for col in collections]
        
        if self.collection_name not in existing_collections:
            logger.info(f"Creating collection: {self.collection_name}")
            self.client.create_collection(
                collection_name=self.collection_name,
                vectors_config=VectorParams(
                    size=self.vector_size,  # must equal the embedding model's dimension
                    distance=Distance.COSINE
                ),
                optimizers_config=OptimizersConfigDiff(
                    default_segment_number=2,
                    indexing_threshold=10000,
                ),
                hnsw_config=HnswConfigDiff(
                    m=16,
                    ef_construct=100
                )
            )
            
            # Create payload indexes for efficient filtering
            self._create_payload_indexes()
        else:
            logger.info(f"Collection {self.collection_name} already exists")
    
    def _create_payload_indexes(self):
        """Create indexes on frequently filtered fields"""
        index_fields = [
            ("job_id", "keyword"),
            ("chunk_type", "keyword"),
            ("job_title", "text"),
            ("job_level", "keyword"),
            ("work_type", "keyword"),
            # A keyword index also covers arrays of strings; "keyword[]" is not
            # a valid schema name and previously made this index fail silently.
            ("skills", "keyword"),
            ("created_date", "datetime")
        ]
        
        for field_name, field_type in index_fields:
            try:
                self.client.create_payload_index(
                    collection_name=self.collection_name,
                    field_name=field_name,
                    field_schema=field_type
                )
                logger.info(f"Created index for field: {field_name}")
            except Exception as e:
                # Best effort: one failing index must not abort collection setup.
                logger.warning(f"Index for {field_name} might already exist: {e}")
    
    def create_job_chunks(self, job: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Create multiple chunks for a single job

        Args:
            job: Job dictionary. Reads "job_id", "Job Information", "Overview",
                "Responsibilities" and "Requirements"; nested keys are accepted
                in both the lower-case and the capitalised spelling.

        Returns:
            Three dicts (full job, skills, responsibilities), each with the
            keys 'id', 'text' and 'metadata'.
        """
        # Sections that should be dicts are normalised so malformed input
        # (e.g. a plain string) degrades to "not specified" instead of crashing.
        job_info = self._as_dict(job.get("Job Information"))
        requirements = self._as_dict(job.get("Requirements"))

        job_id = job.get("job_id", "unknown")
        job_title = self._first_present(job_info, ("job_title", "Job Title"), "Unknown")
        job_level = self._first_present(job_info, ("job_level", "Job Level"), "Unknown")
        work_type = self._first_present(job_info, ("work_type", "Work Type"), "Unknown")

        overview = job.get("Overview", "")
        responsibilities = job.get("Responsibilities", [])
        skills = self._first_present(requirements, ("Skills", "skills"), [])
        qualifications = self._first_present(requirements, ("Qualifications", "qualifications"), [])
        experience = self._as_dict(
            self._first_present(requirements, ("Experience", "experience"), {})
        )

        # Resolve experience once so text and metadata agree, and so that a
        # legitimate 0 is not discarded by an `or` fallback.
        years_min = self._first_present(experience, ("years_min", "Years_min"))
        experience_level = self._first_present(experience, ("level", "Level"))
        years_text = 'Not specified' if years_min is None else years_min
        level_text = 'Not specified' if experience_level is None else experience_level

        skills_list = skills if isinstance(skills, list) else []
        skills_joined = self._join_items(skills, ', ')
        responsibilities_joined = self._join_items(responsibilities, ' ')
        qualifications_joined = self._join_items(qualifications, ' ')
        top_qualifications = self._join_items(
            qualifications[:3] if isinstance(qualifications, list) else qualifications, ' '
        )

        # One timestamp per job so all of its chunks share the same created_date.
        created_date = datetime.now().isoformat()

        # 1. Create Full Job Chunk
        full_text = "\n".join([
            f"Job Title: {job_title}",
            f"Job Level: {job_level}",
            f"Work Type: {work_type}",
            f"Overview: {overview}",
            f"Responsibilities: {responsibilities_joined}",
            f"Required Skills: {skills_joined}",
            f"Experience: {years_text} years minimum, Level: {level_text}",
            f"Qualifications: {qualifications_joined}",
        ])

        # 2. Create Skills-Focused Chunk
        skills_text = "\n".join([
            f"{job_title} position requires the following skills:",
            f"Technical Skills: {skills_joined}",
            f"Experience Level: {level_text}",
            f"Minimum Years: {years_text}",
            f"Key Qualifications: {top_qualifications}",
        ])

        # 3. Create Responsibilities Chunk
        resp_text = "\n".join([
            f"{job_title} responsibilities include:",
            f"{responsibilities_joined}",
            f"This role is {job_level} level and offers {work_type} work arrangement.",
        ])

        return [
            {
                'id': f"job_{job_id}_full",
                'text': full_text,
                'metadata': {
                    'chunk_type': 'full_job',
                    'job_id': job_id,
                    'job_title': job_title,
                    'job_level': job_level,
                    'work_type': work_type,
                    'skills': skills_list,
                    'years_required': years_min,
                    'experience_level': experience_level,
                    'created_date': created_date
                }
            },
            {
                'id': f"job_{job_id}_skills",
                'text': skills_text,
                'metadata': {
                    'chunk_type': 'job_skills',
                    'job_id': job_id,
                    'job_title': job_title,
                    'job_level': job_level,
                    'skills': skills_list,
                    'experience_level': experience_level,
                    'created_date': created_date
                }
            },
            {
                'id': f"job_{job_id}_responsibilities",
                'text': resp_text,
                'metadata': {
                    'chunk_type': 'job_responsibilities',
                    'job_id': job_id,
                    'job_title': job_title,
                    'job_level': job_level,
                    'work_type': work_type,
                    'created_date': created_date
                }
            },
        ]
    
    def process_jobs_batch(self, jobs: List[Dict[str, Any]], batch_size: int = 50):
        """Process jobs in batches

        For every batch: build the chunks, embed all chunk texts in one call
        and upsert the resulting points into the collection.

        Args:
            jobs: Job dictionaries to ingest.
            batch_size: Number of jobs (not chunks) per batch; must be positive.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            # A negative step would silently process nothing.
            raise ValueError("batch_size must be a positive integer")

        total_jobs = len(jobs)
        total_batches = (total_jobs + batch_size - 1) // batch_size
        logger.info(f"Processing {total_jobs} jobs in batches of {batch_size}")
        
        for i in range(0, total_jobs, batch_size):
            batch = jobs[i:i + batch_size]
            
            # Create chunks for all jobs in batch
            batch_chunks = [chunk for job in batch for chunk in self.create_job_chunks(job)]
            batch_texts = [chunk['text'] for chunk in batch_chunks]
            
            # Generate embeddings for all chunks in batch
            logger.info(f"Generating embeddings for {len(batch_texts)} chunks...")
            embeddings = self.embeddings.embed_documents(batch_texts)
            
            # Prepare points for Qdrant
            points = [
                PointStruct(
                    # Qdrant requires an unsigned int or UUID as point id.
                    id=self._point_id(chunk['id']),
                    vector=embedding,
                    payload={
                        'text': chunk['text'],
                        # Keep the readable id, since the point id is now a UUID.
                        'chunk_id': chunk['id'],
                        **chunk['metadata']
                    }
                )
                for chunk, embedding in zip(batch_chunks, embeddings)
            ]
            
            # Upsert to Qdrant
            logger.info(f"Upserting {len(points)} points to Qdrant...")
            self.client.upsert(
                collection_name=self.collection_name,
                points=points,
                wait=True
            )
            
            logger.info(f"Completed batch {i//batch_size + 1}/{total_batches}")
    
    def get_collection_stats(self):
        """Get statistics about the collection

        Returns:
            Dict with 'total_points', 'vectors_count', 'indexed_vectors' and
            'status' as reported by the Qdrant client.
        """
        collection_info = self.client.get_collection(self.collection_name)
        return {
            'total_points': collection_info.points_count,
            # NOTE(review): `vectors_count` appears to be absent from newer
            # qdrant-client models; read it defensively -- confirm against the
            # installed client version.
            'vectors_count': getattr(collection_info, 'vectors_count', None),
            'indexed_vectors': collection_info.indexed_vectors_count,
            'status': collection_info.status
        }
    
    def search_jobs(self, query: str, limit: int = 5):
        """Search for jobs using semantic search

        Args:
            query: Free-text query; embedded with the same model as the chunks.
            limit: Maximum number of hits to return.

        Returns:
            The hits returned by the Qdrant client, payload included.
        """
        # Generate embedding for query
        query_embedding = self.embeddings.embed_query(query)
        
        # Search in Qdrant
        # NOTE(review): newer qdrant-client releases favour `query_points` over
        # `search` -- confirm against the installed client version.
        search_results = self.client.search(
            collection_name=self.collection_name,
            query_vector=query_embedding,
            limit=limit,
            with_payload=True
        )
        
        return search_results


In [None]:
def main():
    """Run the ingestion end to end: load config, load jobs, embed, upsert, smoke-test search.

    All settings come from environment variables (OPENAI_API_KEY, QDRANT_URL,
    QDRANT_PORT, QDRANT_API_KEY, COLLECTION_NAME, EMBEDDING_MODEL, BATCH_SIZE,
    JOBS_FILE, RUN_SEARCH_TEST). Configuration and input problems are logged
    and the function returns early instead of raising.
    """
    # Load configuration from environment variables
    logger.info("Loading configuration from environment...")
    
    # All configuration is loaded from .env file
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
    QDRANT_URL = os.getenv("QDRANT_URL", "localhost")
    QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")  # Optional, for Qdrant Cloud
    COLLECTION_NAME = os.getenv("COLLECTION_NAME", "jobs")
    EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")

    # Numeric settings: report a malformed value like every other config
    # problem here instead of letting int() raise.
    try:
        QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333"))
        BATCH_SIZE = int(os.getenv("BATCH_SIZE", "50"))
    except ValueError as e:
        logger.error(f"QDRANT_PORT and BATCH_SIZE must be integers: {e}")
        return
    if BATCH_SIZE <= 0:
        logger.error(f"BATCH_SIZE must be a positive integer, got {BATCH_SIZE}")
        return
    
    # Validate required environment variables
    if not OPENAI_API_KEY:
        logger.error("OPENAI_API_KEY not found in environment variables")
        logger.error("Please create a .env file with your API keys")
        return
    
    # Load jobs data
    jobs_file = os.getenv("JOBS_FILE", "jobs_summaries.json")
    logger.info(f"Loading jobs data from {jobs_file}...")
    
    try:
        # Explicit UTF-8: the platform default encoding can mis-decode
        # non-ASCII characters in the job texts.
        with open(jobs_file, 'r', encoding='utf-8') as f:
            jobs_data = json.load(f)
    except FileNotFoundError:
        logger.error(f"Jobs file '{jobs_file}' not found")
        return
    except json.JSONDecodeError:
        logger.error(f"Invalid JSON in '{jobs_file}'")
        return

    # The pipeline slices the jobs into batches, so it needs a list of job objects.
    if not isinstance(jobs_data, list):
        logger.error(f"Expected a JSON array of jobs in '{jobs_file}', got {type(jobs_data).__name__}")
        return
    
    logger.info(f"Loaded {len(jobs_data)} jobs")
    
    # Initialize pipeline
    try:
        pipeline = JobChunkingEmbeddingPipeline(
            openai_api_key=OPENAI_API_KEY,
            qdrant_url=QDRANT_URL,
            qdrant_port=QDRANT_PORT,
            qdrant_api_key=QDRANT_API_KEY,
            collection_name=COLLECTION_NAME,
            embedding_model=EMBEDDING_MODEL
        )
    except Exception as e:
        logger.error(f"Failed to initialize pipeline: {e}")
        return
    
    # Process jobs
    pipeline.process_jobs_batch(jobs_data, batch_size=BATCH_SIZE)
    
    # Get collection statistics
    stats = pipeline.get_collection_stats()
    logger.info(f"Collection statistics: {stats}")
    
    # Test search functionality if enabled
    if os.getenv("RUN_SEARCH_TEST", "true").lower() == "true":
        logger.info("\nTesting search functionality...")
        test_queries = [
            "frontend engineer with React experience",
            "data engineer with Python and AWS",
            "senior level remote positions"
        ]
        
        for query in test_queries:
            logger.info(f"\nSearching for: '{query}'")
            results = pipeline.search_jobs(query, limit=3)
            for rank, result in enumerate(results, start=1):
                logger.info(f"  Result {rank}: {result.payload.get('job_title')} (Score: {result.score:.3f})")

# Entry point: run the whole ingestion pipeline when this code is executed
# with __name__ set to "__main__".
if __name__ == "__main__":
    main()