In [27]:
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from dotenv import load_dotenv
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel,Field
from langchain_core.output_parsers import StrOutputParser
from langchain.schema.runnable import RunnableLambda,RunnablePassthrough
from typing import TypedDict,Annotated,Optional,Literal

In [2]:
load_dotenv()

True

In [None]:
loader = UnstructuredFileLoader('./resumes/ShadabResume.pdf')
docs = loader.load()

print(f"Length of documents: {len(docs)}")
print(docs[0])

In [4]:
def clean_text(text):
  return " ".join(text.split())

docs = [Document(page_content=clean_text(d.page_content),metadata=d.metadata) for d in docs]

In [5]:
text_splitter = RecursiveCharacterTextSplitter(
  chunk_size=500,
  chunk_overlap=100,
  separators=["\n\n","\n"," ",""]
)

In [6]:
chunks = text_splitter.split_documents(docs)
print(f"Total no of chunks : {len(chunks)}")
print(f"Length of the first chunks : {len(chunks[0].page_content)}")
print(chunks[0])

Total no of chunks : 8
Length of the first chunks : 494
page_content='ABDUL SHADAB KHAN +91-6309250249 ⋄ Warangal, Telangana, India abdulshadabkhan24@gmail.com ⋄ linkedin.com/in/abdul-shadab-khan-a1a751257 ⋄ github.com/Shadabkhan2004 PROFESSIONAL SUMMARY Computer Science undergraduate with practical experience in full-stack development using the MERN stack. Skilled in building responsive web apps with React.js, Node.js, and MongoDB. Proficient in Git, Tailwind CSS, and API testing with Postman. Eager to contribute to real-world projects and grow in a dynamic' metadata={'source': './resumes/ShadabResume.pdf'}


In [7]:
sum(len(c.page_content) for c in chunks) / len(chunks)

473.875

In [8]:
embedding = OpenAIEmbeddings(model="text-embedding-3-small")

vector_store = Chroma.from_documents(
  documents=chunks,
  embedding=embedding,
  persist_directory='./vector_store'
)

vector_store.persist()

  vector_store.persist()


In [11]:
query = "What are the skills of the candidate ?"
results = vector_store.similarity_search(query,k=2)

for r in results:
  print(r.page_content)

CSS, and API testing with Postman. Eager to contribute to real-world projects and grow in a dynamic development environment. SKILLS Technical Skills Soft Skills MongoDB, Express.js, React.js, Node.js, HTML, CSS, JavaScript, Tailwind CSS, Java, Python, Git, GitHub, VS Code, Postman, Netlify, Render Problem Solving, Team Collaboration, Communication, Adaptability, Time Management PROJECTS & TECHNICAL EXPERIENCE SaveMyFood.com A full-stack pantry management platform designed to help users reduce
first year. LEADERSHIP Led a two-member team for the “Gadget Store” mini-project, taking charge of front-end development, Git/GitHub, and technical documentation. Coordinated with teammates for report writing, slide preparation, and successfully presented the project to faculty evaluators. EDUCATION Bachelor of Technology in Computer Science and Engineering, KITSW CGPA: 8.15 Relevant Coursework: Data Structures and Algorithms, Web Technologies, Object-Oriented Programming, Database Management


In [12]:
query = "What frameworks does he know ?"
results = vector_store.similarity_search(query,k=2)

for r in results:
  print(r.page_content)

CSS, and API testing with Postman. Eager to contribute to real-world projects and grow in a dynamic development environment. SKILLS Technical Skills Soft Skills MongoDB, Express.js, React.js, Node.js, HTML, CSS, JavaScript, Tailwind CSS, Java, Python, Git, GitHub, VS Code, Postman, Netlify, Render Problem Solving, Team Collaboration, Communication, Adaptability, Time Management PROJECTS & TECHNICAL EXPERIENCE SaveMyFood.com A full-stack pantry management platform designed to help users reduce
ABDUL SHADAB KHAN +91-6309250249 ⋄ Warangal, Telangana, India abdulshadabkhan24@gmail.com ⋄ linkedin.com/in/abdul-shadab-khan-a1a751257 ⋄ github.com/Shadabkhan2004 PROFESSIONAL SUMMARY Computer Science undergraduate with practical experience in full-stack development using the MERN stack. Skilled in building responsive web apps with React.js, Node.js, and MongoDB. Proficient in Git, Tailwind CSS, and API testing with Postman. Eager to contribute to real-world projects and grow in a dynamic


In [18]:
retriever = vector_store.as_retriever(kwargs={"k":3})

In [28]:
class Skills(BaseModel):
  name: str = Field(default="",description="Full name of the candidate")
  contact : Optional[str] = Field(default="",description="Contact information of the such as email or phone")
  programming_languages: list[str] = Field(default_factory=list)
  frameworks: list[str] = Field(default_factory=list)
  tools: list[str] = Field(default_factory=list)
  soft_skills: list[str] = Field(default_factory=list)
  other_skills: list[str] = Field(default_factory=list)


llm = ChatOpenAI(model="gpt-4")

parser = JsonOutputParser(pydantic_object=Skills)

prompt = PromptTemplate(
  template = """
    You are a precise resume analyzer.
    Use ONLY the provided context below to extract:
    - Candidate's full name
    - Contact information (email or phone)
    - All mentioned skills grouped properly

    If any information is missing, return an empty string or empty list.

    Context:
    {context}

    Return a JSON strictly following this schema:
    {format_instructions}
  """,
  input_variables=["context"],
  partial_variables={"format_instructions":parser.get_format_instructions()}
)

In [29]:
def combine_docs(docs):
  return "\n\n".join(d.page_content for d in docs)

chain = (
  {"context": retriever | RunnableLambda(combine_docs)} | prompt | llm | parser
)

In [30]:
chain.invoke("What is the name of the candidate and Extract all skills from the candidate's resume")

{'name': 'ABDUL SHADAB KHAN',
 'contact': '+91-6309250249, abdulshadabkhan24@gmail.com',
 'programming_languages': ['Java', 'Python', 'JavaScript', 'HTML', 'CSS'],
 'frameworks': ['React.js', 'Node.js', 'Express.js', 'MongoDB'],
 'tools': ['Git',
  'GitHub',
  'VS Code',
  'Postman',
  'Netlify',
  'Render',
  'Tailwind CSS'],
 'soft_skills': ['Problem Solving',
  'Team Collaboration',
  'Communication',
  'Adaptability',
  'Time Management'],
 'other_skills': []}