# Import necessary libraries

In [1]:
from pathlib import Path
from langchain_community .document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Folder that holds the resume PDFs read by the loading cell below.
# NOTE(review): this is an absolute, machine-specific path — point it at your
# own copy of the data (ideally a path relative to the notebook) before running.
Data_dir = Path(r"C:\Users\user\Desktop\HRbotassistance\resumedata")
# Fail fast with a readable message if the folder is missing. `Data_dir` is a
# Path object, not a callable, so it is interpolated directly; the previous
# `{Data_dir()}` raised TypeError instead of showing this message.
assert Data_dir.exists(), f"Folder not found {Data_dir}"

# Load PDF Files

In [3]:
# Collect the documents produced by loading every "*.pdf" file found directly
# inside Data_dir. Each loader.load() call returns a list that is appended to
# `docs`, so `docs` ends up as one flat list across all files.
docs = []
# sorted() pins the load order: glob() yields entries in filesystem order,
# which would otherwise make docs[0] (and everything derived from the order of
# `docs`) vary between machines.
for pdf in sorted(Data_dir.glob("*.pdf")):
    loader = PyMuPDFLoader(str(pdf))
    docs.extend(loader.load())

print(f"Loaded {len(docs)} documents")
# Guard the sample below: with no PDFs in the folder, docs[0] would raise a
# bare IndexError that says nothing about the real cause.
assert docs, f"No PDF documents were loaded from {Data_dir}"
print("Sample text:" , docs[0].page_content[:300])

Loaded 41 documents
Sample text: Khalid Nassar
Deep Learning Engineer — Riyadh, Saudi Arabia
Email: khalid.nassar1@example.com | Phone: +966592881653
Professional Summary
Experienced Deep Learning Engineer with a strong track record of building production-grade machine
learning systems, from data engineering and model training to d


# Split into Chunks

In [4]:
# Cut the loaded documents into smaller, overlapping chunks.
#   chunk_size=500   -> maximum size of each chunk
#   chunk_overlap=50 -> overlap between chunks so context is maintained
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.split_documents(docs)

# Report how many chunks were produced and preview the first 300 characters
# of the first one.
print(f"Split into {len(chunks)} chunks")
first_chunk_preview = chunks[0].page_content[:300]
print("Sample chunk:" , first_chunk_preview)

Split into 181 chunks
Sample chunk: Khalid Nassar
Deep Learning Engineer — Riyadh, Saudi Arabia
Email: khalid.nassar1@example.com | Phone: +966592881653
Professional Summary
Experienced Deep Learning Engineer with a strong track record of building production-grade machine
learning systems, from data engineering and model training to d


# Embed and Store

In [5]:
# Embedding model used to turn each chunk into a vector.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Build the Chroma vector store: embed every chunk and store it in the
# collection named "resumes".
db = Chroma.from_documents(documents=chunks, embedding=embeddings, collection_name="resumes")

  embeddings= HuggingFaceEmbeddings(model_name = "all-MiniLM-L6-v2")


# Query

In [6]:
# Retrieve the two stored chunks most similar to a free-text question.
query = "Find candidates with Tensorflow and AWS experience"
results = db.similarity_search(query, k=2)

# Show each hit as "[rank] metadata" followed by the first 300 characters of
# its text, with a blank line in front of every hit.
rank = 0
for hit in results:
    rank += 1
    print(f"\n[{rank}] {hit.metadata}", hit.page_content[:300], sep="\n")


[1] {'creationDate': "D:20250930175541+00'00'", 'modDate': "D:20250930175541+00'00'", 'creator': '(unspecified)', 'title': '(anonymous)', 'creationdate': '2025-09-30T17:55:41+00:00', 'producer': 'ReportLab PDF Library - www.reportlab.com', 'keywords': '', 'author': '(anonymous)', 'source': 'C:\\Users\\user\\Desktop\\HRbotassistance\\resumedata\\ai_resume_realistic_01_Khalid_Nassar.pdf', 'format': 'PDF 1.4', 'trapped': '', 'moddate': '2025-09-30T17:55:41+00:00', 'subject': '(unspecified)', 'file_path': 'C:\\Users\\user\\Desktop\\HRbotassistance\\resumedata\\ai_resume_realistic_01_Khalid_Nassar.pdf', 'total_pages': 1, 'page': 0}
post-processing to extract structured candidate data.
• Optimized model inference using TensorRT and mixed precision; achieved 2.5x throughput improvement
on GPU.
Education
MSc in Computer Science, Machine Learning Track — University of Technology, 2019
Certifications
• AWS Certified Machine Learnin

[2] {'creationdate': '2025-09-30T17:55:41+00:00', 'title': '(a

# Generate Response

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

### Load LLM from Hugging Face

In [None]:
# Hugging Face Hub id of the instruction-tuned model to load. The Mistral
# instruct checkpoints are published with a version suffix (e.g. -v0.2);
# "mistralai/Mistral-7B-Instruct" on its own is not a repository id.
# NOTE(review): the repo may be gated — confirm that the environment is logged
# in to the Hub and that the license has been accepted before running.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Load the tokenizer and the weights; device_map="auto" leaves device placement
# to the library.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

# Text-generation pipeline shared by the cells below (rag_answer calls it).
generator = pipeline(
    "text-generation",  # task ids are hyphenated; "text_generation" is rejected
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=300,
    do_sample=True,  # temperature only takes effect when sampling is enabled
    temperature=0.2,
)


### Generate Response

In [None]:
def rag_answer(query, db, k=3):
    """Answer an HR question using resume chunks retrieved from the vector store.

    Runs a similarity search for ``query`` on ``db``, joins the text of the
    hits into a context section, wraps context and question in an instruction
    prompt, and passes the prompt to the module-level ``generator`` pipeline.

    Args:
        query: The question to answer; also used as the search text.
        db: Vector store exposing ``similarity_search(query, k=...)`` whose
            results have a ``page_content`` attribute.
        k: Number of chunks to retrieve (default 3).

    Returns:
        The text generated by the model for the prompt, without the prompt
        itself.
    """
    # retrieve relevant documents
    results = db.similarity_search(query, k=k)
    context = "\n\n".join(r.page_content for r in results)

    # create prompt (the stray "PermissionError" token that used to trail the
    # last instruction line has been removed — it was being sent to the model)
    prompt = f"""
    You are an AI HR assistant
    Here are some candidate resumes:
    {context}

    Question: {query}
    Answer clearly and concisely, citing relevant experience from resumes.
    """

    # generate response; return_full_text=False keeps only the newly generated
    # text, otherwise "generated_text" starts with the entire prompt (including
    # every retrieved resume chunk)
    output = generator(prompt, return_full_text=False)[0]["generated_text"]
    return output


# Example: run the retrieve-then-generate helper on a sample question and
# print what it returns.
sample_question = "Find candidates with TensorFlow and AWS experience"
answer = rag_answer(sample_question, db)
print(answer)