In [1]:
!pip install --upgrade langchain langchain-google-genai PyMuPDF

Collecting langchain
  Downloading langchain-1.2.7-py3-none-any.whl.metadata (4.9 kB)
Collecting langchain-google-genai
  Downloading langchain_google_genai-4.2.0-py3-none-any.whl.metadata (2.7 kB)
Collecting PyMuPDF
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting langgraph<1.1.0,>=1.0.7 (from langchain)
  Downloading langgraph-1.0.7-py3-none-any.whl.metadata (7.4 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain-google-genai)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting google-genai<2.0.0,>=1.56.0 (from langchain-google-genai)
  Downloading google_genai-1.60.0-py3-none-any.whl.metadata (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting google-auth<3.0.0,>=2.47.0 (from google-auth[requests]<3.0.0,>=2.47.0->google-genai<2.0.0,>=1.56.0->langchain-google-genai)
  Downloading google_auth-2.48.0-py3-none-any.whl.metadata

In [8]:
from google.colab import files

# Open the Colab upload widget and keep whatever it returns, keyed by filename.
uploaded = files.upload()
# Iterating the mapping yields its keys (the filenames); use the first one.
pdf_path = list(uploaded)[0]

Saving Noor-Fatima.pdf to Noor-Fatima.pdf


In [7]:
import fitz

def read_pdf(path: str) -> str:
    """Return the text of a PDF, page by page, with each page's link URIs appended.

    For every page the plain text is extracted with ``get_text("text")``.
    Any link returned by ``get_links()`` that carries a non-empty ``"uri"``
    is listed under an ``[EXTRACTED LINKS]`` marker at the end of that
    page's text, so targets that only exist as clickable annotations are
    still present in the returned string.

    Args:
        path: Location of the PDF file to open.

    Returns:
        The per-page strings joined by a blank line.
    """
    pages = []

    # Open the document in a ``with`` block so its file handle is released
    # even if extraction fails part-way through.
    with fitz.open(path) as doc:
        for page in doc:
            # Visible text of the page.
            text = page.get_text("text")

            # Link entries without a "uri" (or with an empty one) are skipped.
            candidate_uris = (link.get("uri") for link in page.get_links())
            urls = [uri for uri in candidate_uris if uri]

            # Append the link targets so the model can see them.
            if urls:
                text += "\n\n[EXTRACTED LINKS]\n" + "\n".join(urls)

            pages.append(text)

    return "\n\n".join(pages)

In [9]:
import os
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate


# -------------------------------
# 1. Set Google API key
# -------------------------------
# The key must never be written into the notebook: anyone who can read the
# file can use it. Take it from the environment when it is already set,
# otherwise ask for it interactively without echoing the input.
# NOTE(review): the key that was previously embedded here has been exposed
# and should be revoked and replaced.
from getpass import getpass

if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Google API key: ")

# -------------------------------
# 2. Load the resume text
# -------------------------------
# The first key of `uploaded` is the file to parse.
pdf_path = list(uploaded)[0]
resume_text = read_pdf(pdf_path)

# -------------------------------
# 3. Create the Gemini chat model
# -------------------------------
# Configured with the gemini-2.5-flash model and a temperature of 0.2.
llm = ChatGoogleGenerativeAI(temperature=0.2, model="gemini-2.5-flash")

# -------------------------------
# 4. Prompt for strict JSON extraction
# -------------------------------
# Instruction text sent to the model. `{resume}` is the only placeholder;
# the doubled braces around the key list are literal braces after formatting.
extraction_template = """
You are an expert resume parser.

IMPORTANT:
- Use exact URLs found in the resume text or under [EXTRACTED LINKS]
- Do NOT infer or fabricate GitHub or LinkedIn URLs
- If no explicit URL exists, return null

Extract:
- Full name
- Contact info (email, phone)
- GitHub link
- LinkedIn link
- Highest qualification
- University
- Experience (chronological, label technical vs non-technical)
- Projects (name, description, tech stack)
- Coursework keywords
- Technical skills summary
- Extracurricular / leadership experience

Rules:
- Preserve chronological order
- Output ONLY a JSON object
- Use exactly these keys:
{{
  "name",
  "contact_info",
  "github_link",
  "linkedin",
  "qualification",
  "university",
  "experience",
  "projects",
  "coursework_keywords",
  "skills_summary",
  "extracurricular"
}}
- Set missing fields to null

Resume:
{resume}

JSON:
"""

prompt = PromptTemplate(
    template=extraction_template,
    input_variables=["resume"],
)

# -------------------------------
# 5. Run the extraction
# -------------------------------
# Fill the template with the resume text, send it to the model, and show
# the content of the reply.
filled_prompt = prompt.format(resume=resume_text)
response = llm.invoke(filled_prompt)

print(response.content)


```json
{
  "name": "Noor Fatima",
  "contact_info": {
    "email": "h.noorfatima7@gmail.com",
    "phone": "+92 323 4753925"
  },
  "github_link": "https://github.com/Noor-Fatima-Khalid",
  "linkedin": "https://www.linkedin.com/in/noor-fatima-37a345274/",
  "qualification": "Bachelors of Software Engineering",
  "university": "Punjab University College of Information Technology",
  "experience": [
    {
      "title": "Community Lead",
      "organization": "FCIT - Developers Club",
      "date_range": "Apr 2025 – present",
      "description": null,
      "type": "technical"
    },
    {
      "title": "Lead",
      "organization": "Microsoft Learn Student Ambassador (MLSA), FCIT",
      "date_range": "Sep 2024 – Apr 2025",
      "description": "Organized several sessions where I invited industry personnels and alumni. Organized sessions on technologies including Azure and GitHub.",
      "type": "technical"
    },
    {
      "title": "OOP-Teaching Assistant",
      "organization": 