In [3]:
# ==============================
# Install required packages
# ==============================
# Notebook-only shell escape (not valid in a plain .py file): asks pip to
# install the LangChain packages plus the PDF/DOCX parsing libraries.
# NOTE(review): IPython documents `%pip install` as the form that targets the
# running kernel's environment -- consider switching.
# NOTE(review): UnstructuredWordDocumentLoader (imported below) presumably
# needs the `unstructured` package, which is not listed here -- verify.
!pip install langchain langchain-openai langchain-community pypdf python-docx

# ==============================
# Imports
# ==============================
import os
import pandas as pd
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain_community.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader

# ==============================
# Set your OpenAI API key
# ==============================
# Respect a key that is already exported in the environment; only fall back to
# the placeholder, which must be replaced with a real key before calling the
# API. Unconditional assignment would clobber a valid key with the placeholder.
os.environ.setdefault("OPENAI_API_KEY", "your_openai_api_key_here")

# ==============================
# Step 1: Define Extraction Schema
# ==============================
# One ResponseSchema per field to extract, built from (name, description)
# pairs. The order here is the order in which the fields are declared to the
# output parser.
schemas = [
    ResponseSchema(name=field, description=text)
    for field, text in (
        ("name", "Full name of the candidate"),
        ("age", "Age in numbers"),
        ("gender", "Gender of the candidate"),
        ("location", "City or location"),
        ("email", "Email ID"),
        ("phone", "Phone number"),
        ("qualification", "Highest qualification"),
        ("experience_years", "Years of work experience"),
        ("skills", "List of skills"),
        ("summary", "Short professional summary"),
    )
]

# The parser both generates the formatting instructions embedded in the prompt
# and parses the model's reply back into a structured result.
parser = StructuredOutputParser.from_response_schemas(schemas)
format_instructions = parser.get_format_instructions()

# ==============================
# Step 2: Prompt Template
# ==============================
# Extraction prompt: `resume_text` is supplied per call, while
# `format_instructions` is bound once here from the output parser.
prompt = PromptTemplate(
    input_variables=["resume_text"],
    partial_variables={"format_instructions": format_instructions},
    template="""
    Extract candidate details from this resume text:

    {resume_text}

    {format_instructions}
    """,
)

# ==============================
# Step 3: Resume Loader
# ==============================
def load_resume(file_path):
    """Return the full text of a resume file.

    Args:
        file_path: Path (str) to a ``.pdf`` or ``.docx`` file. The extension
            is matched case-insensitively.

    Returns:
        The ``page_content`` of every Document the loader yields, joined with
        newlines. An empty string if the loader yields no documents.

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.docx``.
    """
    lowered = file_path.lower()
    if lowered.endswith(".pdf"):
        loader = PyPDFLoader(file_path)
    elif lowered.endswith(".docx"):
        loader = UnstructuredWordDocumentLoader(file_path)
    else:
        raise ValueError("Unsupported file format")
    # Join every loaded Document rather than taking only the first one:
    # indexing [0] dropped everything after the first page/section and raised
    # IndexError when the loader returned nothing.
    return "\n".join(doc.page_content for doc in loader.load())

# ==============================
# Step 4: LLM Setup (OpenAI)
# ==============================
# Shared chat-model client used for every extraction call; temperature is
# pinned to 0.
llm = ChatOpenAI(
    temperature=0,
    model="gpt-3.5-turbo",
)

def extract_resume_data(resume_text):
    """Extract structured candidate fields from raw resume text.

    Args:
        resume_text: Plain text of a single resume.

    Returns:
        Whatever ``parser.parse`` produces for the model's reply (the fields
        declared in ``schemas``).
    """
    prompt_text = prompt.format(resume_text=resume_text)
    # `predict` is a deprecated LangChain API; `invoke` is its replacement and
    # returns a message object whose `.content` holds the reply text.
    response = llm.invoke(prompt_text)
    return parser.parse(response.content)

# ==============================
# Step 5: Process Multiple Resumes
# ==============================
def process_resumes(resume_folder, output_csv="extracted_candidates.csv"):
    """Extract candidate data from every resume in a folder and save it as CSV.

    Args:
        resume_folder: Directory to scan. Only entries whose names end in
            ``.pdf`` or ``.docx`` are processed; everything else is skipped.
        output_csv: Path of the CSV file to write. Defaults to
            ``extracted_candidates.csv`` in the current working directory.

    Returns:
        A pandas DataFrame with one row per processed resume (empty if the
        folder contains no matching files).
    """
    all_candidates = []
    # Sort the listing so rows come out in a deterministic order;
    # os.listdir() returns entries in arbitrary order.
    for filename in sorted(os.listdir(resume_folder)):
        if not filename.endswith((".pdf", ".docx")):
            continue
        path = os.path.join(resume_folder, filename)
        print(f"Processing {filename} ...")
        text = load_resume(path)
        all_candidates.append(extract_resume_data(text))

    df = pd.DataFrame(all_candidates)
    df.to_csv(output_csv, index=False)
    print(f"✅ Data saved to {output_csv}")
    # Previously `return d`, an undefined name that raised NameError after
    # the CSV had already been written.
    return df


Collecting pypdf
  Downloading pypdf-6.1.0-py3-none-any.whl.metadata (7.1 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading pypdf-6.1.0-py3-none-any.whl (322 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 322.5/322.5 kB 10.0 MB/s eta 0:00:00
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 253.0/253.0 kB 18.7 MB/s eta 0:00:00
Installing collected packages: python-docx, pypdf
Successfully installed pypdf-6.1.0 python-docx-1.2.0
