In [37]:
import os
import pandas as pd
import json
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers.json import JsonOutputParser
from dotenv import load_dotenv

In [38]:
load_dotenv(dotenv_path="../../.env", override=True)

True

In [50]:
import json

def create_job_summary_prompt():
    """Create the prompt template for job description summarization"""
    
    template = """
    You are an expert HR professional and recruiter. Please analyze the following job description and provide a comprehensive summary in JSON format.

    Job Description:
    {job_description}

    Please extract and organize the information into the following structure:

    **Job Information**
    - Extract job title from the description
    - Determine job level based on responsibilities and requirements (Entry/Junior/Mid-level/Senior/Lead/Executive)

    **Company Information**
    - Extract company name if mentioned, otherwise use null
    - Determine or extract industry if mentioned, otherwise use null

    **Location Information**
    - Extract city, state, country if mentioned, otherwise use null
    - Determine work type: "remote", "hybrid", or "on-site"
    - Assess remote flexibility: "full", "partial", or "none"

    **Compensation**
    - Extract salary information:
      - Min salary: minimum salary as number without commas
      - Max salary: maximum salary as number without commas
      - If range given (e.g., "$120,000 - $160,000"), extract both values
      - If single value given, use same for min and max
      - If not mentioned, use null for both

    **Overview**
    - Provide a brief 1-2 sentence summary of the position and its primary purpose

    **Responsibilities**
    - List the 3-5 most important responsibilities/duties
    - Focus on core functions, not minor tasks

    **Requirements - Required**
    - Skills: Extract technical skills, programming languages, platforms (no commas in values)
    - Tools: Extract specific tools, software, applications mentioned
    - Experience: Determine years_min, years_max (as numbers), and level (junior/mid/senior)
    - Qualifications: List essential requirements, experience, education

    **Requirements - Preferred**
    - Skills: Extract nice-to-have technical skills and platforms
    - Tools: Extract preferred tools and software
    - Certifications: Extract any certifications mentioned
    - Qualifications: List preferred experience and qualifications
    

    {format_instructions}
    """
    
    return PromptTemplate(
        input_variables=["job_description"],
        template=template,
        partial_variables={"format_instructions": "{format_instructions}"}
    )

class JobDescriptionSummarizer:
    def __init__(self, model_name="gpt-4o-mini", temperature=0.3):
        """
        Initialize the job description summarizer
        
        Args:
            model_name (str): OpenAI model to use
            temperature (float): Temperature for text generation (0.0-2.0)
                                0.0 = deterministic, focused responses
                                1.0 = balanced creativity and consistency  
                                2.0 = highly creative, unpredictable responses
        """
        self.llm = ChatOpenAI(
            model=model_name,
            temperature=temperature
        )
        
        # Define the expected JSON schema
        
        self.json_schema = {
            "job_id": "Job ID to be added later",
            "job_title": "Extracted job title",
            "job_level": "Junior/Mid-level/Senior/Lead/Executive",
            "company": {
                "name": "Company name or null",
                "industry": "Industry type or null"
            },
            "location": {
                "city": "City or null",
                "state": "State or null", 
                "country": "Country or null",
                "work_type": "remote/hybrid/on-site",
                "remote_flexibility": "full/partial/none"
            },
            "compensation": {
                "salary": {
                    "min": "Minimum salary as number or null",
                    "max": "Maximum salary as number or null"
                }
            },
            "overview": "Brief 1-2 sentence summary",
            "responsibilities": ["List of 3-5 key responsibilities"],
            "requirements": {
                "required": {
                    "skills": ["Technical skills array"],
                    "tools": ["Tools and software array"],
                    "experience": {
                        "years_min": "Minimum years as number",
                        "years_max": "Maximum years as number", 
                        "level": "entry/junior/mid/senior"
                    },
                    "qualifications": ["Essential qualifications array"]
                },
                "preferred": {
                    "skills": ["Preferred technical skills array"],
                    "tools": ["Preferred tools array"],
                    "certifications": ["Certifications array"],
                    "qualifications": ["Preferred qualifications array"]
                }
            }
        }
        
        # Initialize JSON parser with schema
        self.parser = JsonOutputParser(pydantic_object=None)
        self.prompt = create_job_summary_prompt()
        
        # Create the chain with format instructions
        self.chain = self.prompt.partial(format_instructions=self.parser.get_format_instructions()) | self.llm | self.parser
    
    def summarize(self, job_description: str) -> dict:
        """
        Summarize a job description
        
        Args:
            job_description (str): The full job description text
            
        Returns:
            dict: Structured summary of the job description
        """
        try:
            # Generate summary using LCEL chain with JSON parser
            result = self.chain.invoke({"job_description": job_description})
            
            return {
                "success": True,
                "summary": result,
                "raw_output": str(result)
            }
            
        except Exception as e:
            return {
                "success": False,
                "error": str(e),
                "summary": None,
                "raw_output": None
            }
    
    def summarize_from_parquet(self, parquet_file_path: str, job_id, id_column="id", jd_column="JD") -> dict:
        """
        Summarize a specific job description from a parquet file by job ID
        
        Args:
            parquet_file_path (str): Path to the parquet file containing job descriptions
            job_id: The specific job ID to process
            id_column (str): Name of the ID column (default: "id")
            jd_column (str): Name of the job description column (default: "JD")
            
        Returns:
            dict: Structured summary of the job description
        """
        try:
            # Read parquet file
            df = pd.read_parquet(parquet_file_path)
            
            # Validate columns exist
            if id_column not in df.columns:
                return {
                    "success": False,
                    "error": f"Column '{id_column}' not found in parquet file. Available columns: {list(df.columns)}",
                    "summary": None,
                    "job_id": job_id
                }
            
            if jd_column not in df.columns:
                return {
                    "success": False,
                    "error": f"Column '{jd_column}' not found in parquet file. Available columns: {list(df.columns)}",
                    "summary": None,
                    "job_id": job_id
                }
            
            # Find the specific job ID
            job_row = df[df[id_column] == job_id]
            
            if job_row.empty:
                return {
                    "success": False,
                    "error": f"Job ID '{job_id}' not found in the parquet file",
                    "summary": None,
                    "job_id": job_id
                }
            
            # Get the job description
            job_description = str(job_row[jd_column].iloc[0])
            
            # Check if job description is empty or NaN
            if pd.isna(job_description) or job_description.strip() == "" or job_description.lower() == "nan":
                return {
                    "success": False,
                    "error": f"Job description is empty or missing for ID '{job_id}'",
                    "summary": None,
                    "job_id": job_id
                }
            
            print(f"Processing Job ID: {job_id}")
            print(f"Job Description Length: {len(job_description)} characters")
            
            # Summarize the job description
            summary_result = self.summarize(job_description)
            
            # Add job ID to the result
            summary_result["job_id"] = job_id
            
            return summary_result
            
        except FileNotFoundError:
            return {
                "success": False,
                "error": f"Parquet file not found: {parquet_file_path}",
                "summary": None,
                "job_id": job_id
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Error processing parquet file: {str(e)}",
                "summary": None,
                "job_id": job_id
            }

def print_job_summary(summary_result: dict):
    """Pretty print the job summary result in JSON format"""
    
    if not summary_result["success"]:
        print(f"❌ Error for Job ID {summary_result.get('job_id', 'Unknown')}: {summary_result['error']}")
        return
    
    print(f"📋 JOB DESCRIPTION SUMMARY - ID: {summary_result.get('job_id', 'Unknown')}")
    print("=" * 60)
    
    summary = summary_result["summary"]
    
    # Print JSON in a readable format
    print("📄 JSON OUTPUT:")
    print("-" * 30)
    
    # Add job_id to the summary for complete JSON
    complete_summary = {"job_id": summary_result.get('job_id', 'Unknown')}
    complete_summary.update(summary)
    
    # Pretty print JSON
    print(json.dumps(complete_summary, indent=2, ensure_ascii=False))
    
    print("\n" + "=" * 60)
    



In [54]:
if __name__ == "__main__":
    # Initialize the summarizer
    summarizer = JobDescriptionSummarizer()
    
    # Example: Process specific job from parquet file
    print("=" * 60)
    print("JOB DESCRIPTION SUMMARIZER - DEMO")
    print("=" * 60)
    
    # Replace these with your actual values
    parquet_file_path = "id_jd_indeed.parquet"  # Your parquet file path
    job_id_to_process = "in-518379fd9e55e955"  # The specific job ID you want to process
    
    # Process the specific job description
    result = summarizer.summarize_from_parquet(
        parquet_file_path=parquet_file_path,
        job_id=job_id_to_process,
        id_column="id",    # Change if your ID column has different name
        jd_column="description"     # Change if your JD column has different name
    )
    
    # Print the result
    print_job_summary(result)
    
    print("\n" + "=" * 60)
    print("To use this script:")
    print("1. Update 'parquet_file_path' with your actual parquet file path")
    print("2. Update 'job_id_to_process' with the job ID you want to summarize")
    print("3. Update column names if they're different from 'id' and 'JD'")
    print("=" * 60)

JOB DESCRIPTION SUMMARIZER - DEMO
Processing Job ID: in-518379fd9e55e955
Job Description Length: 4015 characters
📋 JOB DESCRIPTION SUMMARY - ID: in-518379fd9e55e955
📄 JSON OUTPUT:
------------------------------
{
  "job_id": "in-518379fd9e55e955",
  "Job Information": {
    "job_title": "Data Engineer",
    "job_level": "Senior"
  },
  "Company Information": {
    "company_name": "Macquarie",
    "industry": "Financial Services"
  },
  "Location Information": {
    "city": null,
    "state": null,
    "country": null,
    "work_type": "hybrid",
    "remote_flexibility": "partial"
  },
  "Compensation": {
    "min_salary": null,
    "max_salary": null
  },
  "Overview": "The Data Engineer will be responsible for building, implementing, and enhancing enterprise scale data solutions in a DevOps environment, ensuring high quality and low maintenance software solutions.",
  "Responsibilities": [
    "Design, develop, deploy, and support data assets.",
    "Create templates, implementation m

In [32]:
from langchain_openai import ChatOpenAI
from langchain.callbacks import get_openai_callback

llm = ChatOpenAI(model="gpt-4o-mini")

# Track tokens using callback
with get_openai_callback() as cb:
    result = llm.invoke("Summarize this job description...")
    
    print(f"Total Tokens: {cb.total_tokens}")
    print(f"Prompt Tokens: {cb.prompt_tokens}")
    print(f"Completion Tokens: {cb.completion_tokens}")
    print(f"Total Cost (USD): ${cb.total_cost}")

Total Tokens: 27
Prompt Tokens: 14
Completion Tokens: 13
Total Cost (USD): $9.899999999999998e-06


{
  "job_id": "in-a21984534d0b4317",
  "job_title": "DevOps Engineer",
  "job_level": "Mid-level",
  
  "company": {
    "name": "TechCorp",
    "industry": "Technology"
  },
  
  "location": {
    "city": "San Francisco",
    "state": "CA",
    "country": "USA",
    "work_type": "on-site",
    "remote_flexibility": "none"
  },
  
  "compensation": {
    "salary": {
      "min": 120000,
      "max": 160000
    }
  },
  
  "overview": "The DevOps Engineer position involves designing and implementing software development practices and tools to enhance system delivery, working closely with a client on-site.",
  
  "responsibilities": [
    "Develop and implement CI/CD pipelines with a focus on containerization and continuous integration best practices.",
    "Establish and maintain DevSecOps standards, tools, and governance.",
    "Manage infrastructure and environments for reliability across production and non-production systems.",
    "Collaborate with development and QA teams for efficient software delivery through automation.",
    "Implement and maintain monitoring and logging solutions for system performance."
  ],
  
  "requirements": {
    "required": {
      "skills": ["AWS", "Kubernetes", "Docker", "GitLab", "Terraform", "Ansible"],
      "tools": ["Rancher", "Bitbucket", "JFrog Artifactory"],
      "experience": {
        "years_min": 3,
        "years_max": 5,
        "level": "mid"
      },
      "qualifications": [
        "Hands-on experience with Amazon Web Services (AWS)",
        "Proficiency with Kubernetes and container orchestration",
        "Experience with CI/CD tools (GitLab, Bitbucket)",
        "Infrastructure as Code using Terraform",
        "Configuration management with Ansible"
      ]
    },
    "preferred": {
      "skills": ["GCP", "Azure", "DevSecOps"],
      "tools": ["Selenium Grid", "Cucumber", "Jira", "Confluence"],
      "certifications": ["AWS Solutions Architect", "CKA"],
      "qualifications": [
        "Experience with additional CI/CD tools and practices",
        "Familiarity with other cloud platforms",
        "Knowledge of security best practices in DevOps"
      ]
    }
  }
}