In [1]:
import fitz  # PyMuPDF
def extract_text_from_pdf(file):
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        return " ".join(page.get_text() for page in doc)

def clean_text(text, max_chars=3000):
    cleaned = " ".join(text.strip().split())
    return cleaned[:max_chars]

In [2]:
import fitz  # PyMuPDF
import ast
import re
import spacy
import requests

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# --- PDF Extraction & Cleaning ---
def extract_text_from_pdf(file):
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        return " ".join(page.get_text() for page in doc)

def clean_text(text):
    return " ".join(text.strip().split())

# --- Pre-filter relevant sections ---
def extract_relevant_sections(text):
    patterns = [
        r"skills[\s:.-]+(.+?)(?=\n[A-Z]|\Z)",
        r"technical skills[\s:.-]+(.+?)(?=\n[A-Z]|\Z)",
        r"tools[\s:.-]+(.+?)(?=\n[A-Z]|\Z)",
        r"technologies[\s:.-]+(.+?)(?=\n[A-Z]|\Z)",
        r"responsibilities[\s:.-]+(.+?)(?=\n[A-Z]|\Z)",
        r"roles and responsibilities[\s:.-]+(.+?)(?=\n[A-Z]|\Z)",
        r"requirements[\s:.-]+(.+?)(?=\n[A-Z]|\Z)",
        r"job description[\s:.-]+(.+?)(?=\n[A-Z]|\Z)"
    ]
    combined_pattern = "|".join(patterns)
    matches = re.findall(combined_pattern, text, flags=re.IGNORECASE | re.DOTALL)
    if matches:
        sections = [" ".join(m).strip() for m in matches if any(m)]
        return " ".join(sections)
    return text

# --- spaCy candidate filtering ---
def spacy_filter_skill_candidates(text):
    doc = nlp(text)
    candidates = set()
    for chunk in doc.noun_chunks:
        if len(chunk.text.strip()) > 2 and not chunk.root.is_stop:
            candidates.add(chunk.text.strip())
    for token in doc:
        if token.pos_ in ["PROPN", "NOUN"] and not token.is_stop and len(token.text.strip()) > 2:
            candidates.add(token.text.strip())
    return " ".join(candidates)

# --- Groq LLM Skill Extractor ---
def extract_skills_resume_and_jd_groq(
    resume_file,
    jd_file=None,
    jd_text=None,
    groq_model="llama3-8b-8192",  # ✅ use a valid Groq model
    api_key="YOUR_GROQ_API_KEY"
):
    # --- Extract Resume ---
    resume_text = clean_text(extract_text_from_pdf(resume_file))
    resume_filtered = extract_relevant_sections(resume_text)
    resume_candidates = spacy_filter_skill_candidates(resume_filtered)

    # --- Extract Job Description ---
    if jd_file:
        jd_text_content = clean_text(extract_text_from_pdf(jd_file))
    else:
        jd_text_content = clean_text(jd_text or "")
    jd_filtered = extract_relevant_sections(jd_text_content)
    jd_candidates = spacy_filter_skill_candidates(jd_filtered)

import requests
import ast

api_key = "gsk_GMCcejfaLSrdkPkc3yxzWGltadSG1hwQrahQYbj7uovqv9"
groq_model = "llama3-8b-8192"  # valid Groq model

prompt = """
You are an AI skill extractor.
Extract ONLY technical skills (programming languages, frameworks, libraries, tools, platforms) 
separately for Resume and Job Description.
Return JSON:
{"resume_skills": [...], "jd_skills": [...]}

Resume:
Python, SQL, Tableau

Job Description:
Python, Spark, AWS
"""

url = "https://api.groq.com/openai/v1/chat/completions"
headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json"
}
payload = {
    "model": groq_model,
    "messages": [{"role": "user", "content": prompt}],
    "temperature": 0
}

try:
    response = requests.post(url, json=payload, headers=headers)
    response.raise_for_status()
    llm_output = response.json()["choices"][0]["message"]["content"]
    skills_dict = ast.literal_eval(llm_output.strip())
    print(skills_dict)
except Exception as e:
    print(f"Groq LLM error: {e}")

Groq LLM error: 401 Client Error: Unauthorized for url: https://api.groq.com/openai/v1/chat/completions


In [3]:
with open("sneha_sathe.pdf", "rb") as resume_file:
    skills = extract_skills_resume_and_jd_groq(
        resume_file,  # PDF file object
        jd_text="""
Requirements
• Currently enrolled in or have recently graduated from a degree program in computer science, data science, or a related field
• Familiarity with AI and machine learning concepts and techniques
• Experience with programming languages such as Python, R and SQL
• Strong analytical and problem-solving skills
• Strong communication and teamwork skills
• Experience with machine learning libraries such as scikit-learn, TensorFlow, and/or Keras is a plus.
"""
    )
print(skills)


None


In [6]:
import os
import fitz  # PyMuPDF
import re
import spacy
import requests
import json

# Load spaCy (optional, can skip for speed)
nlp = spacy.load("en_core_web_sm")

# --- PDF Extraction ---
def extract_text_from_pdf(file):
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        return " ".join(page.get_text() for page in doc)

def clean_text(text):
    return " ".join(text.strip().split())

# --- Section Filtering ---
def extract_relevant_sections(text):
    patterns = [
        r"skills[\s:.-]+(.+?)(?=\n[A-Z]|\Z)",
        r"technical skills[\s:.-]+(.+?)(?=\n[A-Z]|\Z)",
        r"tools[\s:.-]+(.+?)(?=\n[A-Z]|\Z)",
        r"technologies[\s:.-]+(.+?)(?=\n[A-Z]|\Z)",
        r"responsibilities[\s:.-]+(.+?)(?=\n[A-Z]|\Z)",
        r"roles and responsibilities[\s:.-]+(.+?)(?=\n[A-Z]|\Z)",
        r"requirements[\s:.-]+(.+?)(?=\n[A-Z]|\Z)",
        r"job description[\s:.-]+(.+?)(?=\n[A-Z]|\Z)"
    ]
    matches = re.findall("|".join(patterns), text, flags=re.IGNORECASE | re.DOTALL)
    if matches:
        sections = [" ".join(m).strip() for m in matches if any(m)]
        return " ".join(sections)
    return text

# --- spaCy Filtering ---
def spacy_filter_skill_candidates(text):
    doc = nlp(text)
    candidates = set()
    for chunk in doc.noun_chunks:
        if len(chunk.text.strip()) > 2 and not chunk.root.is_stop:
            candidates.add(chunk.text.strip())
    for token in doc:
        if token.pos_ in ["PROPN", "NOUN"] and not token.is_stop:
            candidates.add(token.text.strip())
    return ", ".join(candidates)

# --- Main Function ---
def extract_skills_resume_and_jd_groq(
    resume_file,
    jd_file=None,
    jd_text=None,
    groq_model="llama3-8b-8192"
):
    # API Key from environment (run: export GROQ_API_KEY="your_key_here")
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        return {"error": "Missing Groq API key. Set GROQ_API_KEY environment variable."}

    # --- Resume Processing ---
    resume_text = clean_text(extract_text_from_pdf(resume_file))
    resume_filtered = extract_relevant_sections(resume_text)
    resume_candidates = spacy_filter_skill_candidates(resume_filtered)

    # --- JD Processing ---
    if jd_file:
        jd_text_content = clean_text(extract_text_from_pdf(jd_file))
    else:
        jd_text_content = clean_text(jd_text or "")
    jd_filtered = extract_relevant_sections(jd_text_content)
    jd_candidates = spacy_filter_skill_candidates(jd_filtered)

    # --- LLM Prompt ---
    prompt = f"""
    Extract ONLY technical skills (programming languages, frameworks, libraries, tools, platforms) 
    separately for Resume and Job Description.
    Respond ONLY with valid JSON, no markdown, no extra text.
    Format:
    {{
      "resume_skills": [...],
      "jd_skills": [...]
    }}

    Resume:
    {resume_candidates}

    Job Description:
    {jd_candidates}
    """

    # --- API Request ---
    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": groq_model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0
    }

    try:
        response = requests.post(url, json=payload, headers=headers)
        response.raise_for_status()
        llm_output = response.json()["choices"][0]["message"]["content"].strip()

        # Remove code fences if any
        if llm_output.startswith("```"):
            llm_output = llm_output.split("```")[1]
            if llm_output.startswith("json"):
                llm_output = llm_output[len("json"):].strip()

        # Parse JSON
        return json.loads(llm_output)

    except json.JSONDecodeError as e:
        return {"error": f"JSON parsing error: {e}", "raw_output": llm_output}
    except Exception as e:
        return {"error": str(e)}

# --- Example Usage ---
if __name__ == "__main__":
    with open("sneha_sathe.pdf", "rb") as resume_file:
        skills = extract_skills_resume_and_jd_groq(
            resume_file,
            jd_text="""
            Requirements
            • Experience with Python, R, SQL
            • Knowledge of TensorFlow, scikit-learn, Keras
            • Familiarity with AWS, Spark
            """
        )
    print(skills)


{'resume_skills': ['Python', 'Django', 'SQLite', 'Wordpress', 'HTML5', 'CSS3', 'MySQL', 'Machine Learning', 'Statistics'], 'jd_skills': ['Python', 'TensorFlow', 'scikit', 'Keras', 'R', 'SQL', 'AWS', 'Spark']}


In [9]:
import fitz  # PyMuPDF
import subprocess
import json

def is_ollama_model_installed(model_name):
    """Check if the Ollama model is installed locally."""
    try:
        result = subprocess.run(
            ["ollama", "list"], capture_output=True, text=True
        )
        return model_name in result.stdout
    except Exception:
        return False

def extract_text_from_pdf(file):
    """Extract text from a PDF file."""
    pdf_document = fitz.open(stream=file.read(), filetype="pdf")
    text = ""
    for page in pdf_document:
        text += page.get_text()
    return text.strip()

def extract_skills_resume_and_jd(resume_file=None, jd_file=None, resume_text=None, jd_text=None, model="mistral"):
    """Extract skills from Resume and Job Description using Ollama."""
  
    # Get resume text
    if resume_file:
        resume_text_content = extract_text_from_pdf(resume_file)
    else:
        resume_text_content = resume_text or ""
    
    # Get JD text
    if jd_file:
        jd_text_content = extract_text_from_pdf(jd_file)
    else:
        jd_text_content = jd_text or ""

    # Prepare prompt
    prompt = f"""
    You are a technical skill extraction assistant.
    Extract ONLY technical skills (programming languages, frameworks, tools, libraries, cloud platforms, etc.)
    from the following Resume and Job Description.
    Provide the output as JSON with keys 'resume_skills' and 'jd_skills'.

    Resume:
    {resume_text_content}

    Job Description:
    {jd_text_content}
    """

    try:
        result = subprocess.run(
            ["ollama", "run", model, prompt],
            capture_output=True,
            text=True
        )
        output_text = result.stdout.strip()
        try:
            skills = json.loads(output_text)
        except json.JSONDecodeError:
            skills = {"resume_skills": [], "jd_skills": []}
        return skills
    except Exception as e:
        return {"resume_skills": [], "jd_skills": [], "error": str(e)}


In [10]:
# --- Example Usage ---
if __name__ == "__main__":
    with open("sneha_sathe.pdf", "rb") as resume_file:
        skills = extract_skills_resume_and_jd_groq(
            resume_file,
            jd_text="""
            Requirements
            • Experience with Python, R, SQL
            • Knowledge of TensorFlow, scikit-learn, Keras
            • Familiarity with AWS, Spark
            """
        )
    print(skills)

{'resume_skills': ['Python', 'Django', 'SQLite', 'Wordpress', 'HTML5', 'CSS3', 'MySQL', 'Machine Learning', 'Statistics'], 'jd_skills': ['Python', 'TensorFlow', 'scikit', 'Keras', 'R', 'SQL', 'AWS', 'Spark']}
