In [1]:
pip install -q youtube-transcript-api langchain langchain-core langchain-community langchain-openai langchain-anthropic langchain-google-genai google-generativeai langchain-huggingface langchain-groq openai huggingface-hub transformers faiss-cpu tiktoken python-dotenv numpy scikit-learn grandalf sentence_transformers InstructorEmbedding pymupdf

In [2]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser
from langchain.embeddings import HuggingFaceEmbeddings
import getpass
import os
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader
from langchain.schema import Document

In [3]:
# STEP 1: Collect all .pdf files using DirectoryLoader (doesn't load yet)
directory_path = "/content/drive/MyDrive/CV"

loader = DirectoryLoader(
    path=directory_path,
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader,
    show_progress=True,
    use_multithreading=True,

)
docs = loader.load()

100%|██████████| 2/2 [00:00<00:00, 10.64it/s]


In [4]:
len(docs)

4

In [5]:
doc_sources = [doc.metadata["source"] for doc in docs]
doc_sources

['/content/drive/MyDrive/CV/MuhammadTalhaCV1.pdf',
 '/content/drive/MyDrive/CV/Muhammad Talha - ML Engineer.pdf',
 '/content/drive/MyDrive/CV/Muhammad Talha - ML Engineer.pdf',
 '/content/drive/MyDrive/CV/Muhammad Talha - ML Engineer.pdf']

In [6]:
print(docs[1].page_content)

N/A 
 
N/A 
Muhammad Talha 
Gujranwala, Punjab, Pakistan       
 muhammadtalha1818@gmail.com       
+923444300394 
 
 https://www.linkedin.com/in/muhammad-talha-b643641b2/ 
 https://github.com/Talha1818 
 
 
  
SUMMARY 
Highly experienced in the domains of machine learning, deep learning, and natural language processing (NLP), I am a 
passionate engineer and thriving analyst. With a strong aptitude for applying ML techniques and developing advanced 
algorithms, I excel in solving complex real-world problems. My profound interest lies in research, Artificial Intelligence, and 
robotics, as I continuously strive to explore innovative frontiers and contribute to advancements in these fields. 
 
EXPERIENCE 
Freelance | Upwork | San Francisco, CA | Feb 2024 - Present 
 
A freelance specialist adept at delivering impactful solutions in machine learning and AI realms, specializing in NLP, 
LLMs, and generative AI technologies. From conceptualization to deployment, I craft tailored solutions 

In [7]:
from collections import defaultdict
from langchain.schema import Document

# Step 1: Group all text chunks by source
combined_texts = defaultdict(str)

for doc in docs:
    source = doc.metadata.get("source", "unknown")
    combined_texts[source] += doc.page_content.strip() + "\n"

# Step 2: Recreate Document objects, one per source
merged_docs = [
    Document(page_content=text, metadata={"source": source})
    for source, text in combined_texts.items()
]

print(f"✅ Merged documents count: {len(merged_docs)}")


✅ Merged documents count: 2


In [8]:
print(merged_docs[1].page_content)

N/A 
 
N/A 
Muhammad Talha 
Gujranwala, Punjab, Pakistan       
 muhammadtalha1818@gmail.com       
+923444300394 
 
 https://www.linkedin.com/in/muhammad-talha-b643641b2/ 
 https://github.com/Talha1818 
 
 
  
SUMMARY 
Highly experienced in the domains of machine learning, deep learning, and natural language processing (NLP), I am a 
passionate engineer and thriving analyst. With a strong aptitude for applying ML techniques and developing advanced 
algorithms, I excel in solving complex real-world problems. My profound interest lies in research, Artificial Intelligence, and 
robotics, as I continuously strive to explore innovative frontiers and contribute to advancements in these fields. 
 
EXPERIENCE 
Freelance | Upwork | San Francisco, CA | Feb 2024 - Present 
 
A freelance specialist adept at delivering impactful solutions in machine learning and AI realms, specializing in NLP, 
LLMs, and generative AI technologies. From conceptualization to deployment, I craft tailored solutions 

In [9]:
embeddings  = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = FAISS.from_documents(merged_docs, embeddings)

  embeddings  = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [10]:
# Step 2: Run a sample query
query = "Python developer with experience in machine learning and NLP"
docs_with_scores = vector_store.similarity_search_with_score(query)


# Step 2: Convert distance to similarity score
def distance_to_score(distance: float) -> float:
    """Convert L2 distance to a normalized similarity score (0–100%)"""
    similarity = 1 / (1 + distance)         # Normalized between 0 and 1
    return round(similarity * 100, 2)       # As percentage

def get_score(docs_with_scores, query):
  # Step 3: Display results with scores
  for i, (doc, distance) in enumerate(docs_with_scores):
      score_percent = distance_to_score(distance)
      print(f"\n🔹 Match #{i+1} — Score %: {score_percent}% — Score: {distance}")
      print(f"📁 Source: {doc.metadata.get('source', 'unknown')}")
      # print(f"📄 Snippet:\n{doc.page_content[:300]}...\n")

get_score(docs_with_scores, query)


🔹 Match #1 — Score %: 52.220001220703125% — Score: 0.9150180816650391
📁 Source: /content/drive/MyDrive/CV/MuhammadTalhaCV1.pdf

🔹 Match #2 — Score %: 52.20000076293945% — Score: 0.9156745672225952
📁 Source: /content/drive/MyDrive/CV/Muhammad Talha - ML Engineer.pdf


In [13]:
for i in vector_store.similarity_search_with_relevance_scores("Python developer with experience in machine learning and NLP"):
  # print(i[0].metadata['score_percent'])
  print(i)


(Document(id='d70038b0-7891-4eeb-b625-49802834d348', metadata={'source': '/content/drive/MyDrive/CV/MuhammadTalhaCV1.pdf'}, page_content='Muhammad Talha \nSOFTWARE ENGINEER \nCAREER \nI’m a software engineer and 1 year \nexperience in python language from \nfreelancing to handle any task in \npython and Machine Learning based \nprojects. \nInterested in research ,  \nArtificial Intelligence  \nand Robotics . \n \nCONTACT \n+923444300394 \nmuhammadtalha1818@gmail.com \nNational Textile University Faisalbd \nINTEREST \nEDUCATION \nSKILLS AND ABILITIES \nPERSONAL SKILLS \nBachelor of Computer Science \n Oct 2017 present \n National Textile University Faisalabaad \nHigh School \n March 2012 - March 2016 \nArmy Public School & College GWA CANTT \nThe following are my best skills which I believe are \napplicable to the position I have applied for: \n1. Programming \n2. Logical and Structured Thinking \n3. Coding in Multiple Languages (Python, Java, C++ etc) \n1. Ability to work by myself wit

In [14]:
# Step 1: Run your query
query = "need a developer that can work in JAVA, react native"
docs_with_scores = vector_store.similarity_search_with_score(query)

get_score(docs_with_scores, query)


🔹 Match #1 — Score %: 40.130001068115234% — Score: 1.491804599761963
📁 Source: /content/drive/MyDrive/CV/MuhammadTalhaCV1.pdf

🔹 Match #2 — Score %: 38.79999923706055% — Score: 1.5776329040527344
📁 Source: /content/drive/MyDrive/CV/Muhammad Talha - ML Engineer.pdf


In [15]:
# Step 1: Run your query
query = "We are seeking a skilled Python Developer with experience in machine learning, natural language processing (NLP), deep learning, and cloud platforms like AWS. The ideal candidate will have a strong background in building and deploying ML models, working with NLP frameworks (e.g., HuggingFace, spaCy), and developing scalable solutions using Python. Experience with AWS services such as SageMaker or Lambda is a plus. You’ll work closely with a cross-functional team to design, develop, and deploy intelligent systems that process and analyze unstructured data at scale."
docs_with_scores = vector_store.similarity_search_with_score(query)

get_score(docs_with_scores, query)


🔹 Match #1 — Score %: 53.20000076293945% — Score: 0.8797935843467712
📁 Source: /content/drive/MyDrive/CV/Muhammad Talha - ML Engineer.pdf

🔹 Match #2 — Score %: 50.869998931884766% — Score: 0.9657047986984253
📁 Source: /content/drive/MyDrive/CV/MuhammadTalhaCV1.pdf


In [16]:
from typing import List
from langchain_core.runnables import chain

@chain
def __retriever(query: str) -> List[Document]:
    docs, scores = zip(*vector_store.similarity_search_with_score(query))
    for doc, score in zip(docs, scores):
        doc.metadata["score"] = score

    return docs

In [17]:
retriever = __retriever.invoke("need a developer that can work in JAVA, react native")
for doc in retriever:
    print(f"Score: {doc.metadata['score']}")

Score: 1.491804599761963
Score: 1.5776329040527344


In [18]:
from typing import List
from langchain_core.runnables import RunnableLambda

retriever_chain = RunnableLambda(
    lambda query: [
        _add_score_to_doc(doc, score)
        for doc, score in vector_store.similarity_search_with_score(query)
    ]
)

def _add_score_to_doc(doc: Document, score: float) -> Document:
    score_percent = distance_to_score(score)
    doc.metadata["score_percent"] = score_percent
    doc.metadata["distance"] = score
    return doc

retriever = retriever_chain.invoke("need a developer that can work in JAVA, react native")
for doc in retriever:
    print(f"Score: {doc.metadata['distance'], doc.metadata['score_percent']}")

Score: (np.float32(1.4918046), np.float32(40.13))
Score: (np.float32(1.5776329), np.float32(38.8))


In [19]:
def label_score_by_distance(distance: float) -> str:
    if distance <= 0.8:
        return "Strong match"
    elif distance <= 1.2:
        return "Moderate match"
    else:
        return "Weak match"

In [20]:
os.environ["GROQ_API_KEY"] = getpass.getpass()

··········


In [21]:
llm = ChatGroq(temperature = 0, model="llama3-8b-8192")

In [22]:
from langchain.prompts import PromptTemplate

prompt = PromptTemplate(
    template="""
You are a smart and precise Resume Analyzer.

You are given a **Job Description** and a candidate's **Resume** (in plain text format).
Your task is to analyze how well the resume matches the job requirements and return a detailed, structured report.

🔹 ONLY use the resume content for your answers.
🔹 If any information is missing from the resume, write "Not Found".

---

📄 **Job Description**:
{job_description}

📃 **Resume**:
{resume}

---

✍️ Now generate the following structured report:

1. ✅ **Match Score (0–100%)** — Based on skills, experience, education relevance.
2. 🎓 **Education** — Summarize degree(s), institution(s), and relevant fields.
3. 💼 **Experience** — Summarize past job roles, companies, and durations.
4. 🛠️ **Skills / Strengths** — Extract technical and soft skills.
5. ⏸️ **Employment Gaps** — If any, mention time periods and durations.
6. 📞 **Contact Info** — Extract email, phone number, LinkedIn (if available).
7. 📝 **Summary** — Short paragraph on how well this candidate fits the job.

Only output the report — do not provide explanations or repeat the resume/job description.
""",
    input_variables=["job_description", "resume"]
)


In [23]:
job_description = '''
🧑‍💼 Job Title: Data Scientist
We are seeking a Data Scientist with a strong foundation in data analysis, machine learning, and statistical modeling. The ideal candidate will be responsible for extracting insights from complex datasets, building predictive models, and communicating findings that drive business decisions.

Key Responsibilities
Analyze large datasets to discover trends, patterns, and actionable insights.

Build and deploy machine learning models for classification, regression, and clustering tasks.

Collaborate with engineering teams to integrate models into production systems.

Use tools like Python, SQL, Pandas, and Scikit-learn for data processing and modeling.

Communicate results clearly to both technical and non-technical stakeholders.

Qualifications
Proficiency in Python, SQL, and ML frameworks (e.g., Scikit-learn, TensorFlow).

Strong background in statistics, data wrangling, and visualization.

Hands-on experience with data science projects, including model evaluation and tuning.

Experience with data tools (Jupyter, Airflow, Tableau) and cloud platforms (AWS/GCP).
'''

In [24]:
retriever = retriever_chain.invoke(job_description)
for doc in retriever:
    print(f"Score: {doc.metadata['distance'], doc.metadata['score_percent']}")

Score: (np.float32(0.7844763), np.float32(56.04))
Score: (np.float32(0.83004117), np.float32(54.64))


In [25]:
retriever

[Document(id='d70038b0-7891-4eeb-b625-49802834d348', metadata={'source': '/content/drive/MyDrive/CV/MuhammadTalhaCV1.pdf', 'score': np.float32(1.4918046), 'score_percent': np.float32(56.04), 'distance': np.float32(0.7844763)}, page_content='Muhammad Talha \nSOFTWARE ENGINEER \nCAREER \nI’m a software engineer and 1 year \nexperience in python language from \nfreelancing to handle any task in \npython and Machine Learning based \nprojects. \nInterested in research ,  \nArtificial Intelligence  \nand Robotics . \n \nCONTACT \n+923444300394 \nmuhammadtalha1818@gmail.com \nNational Textile University Faisalbd \nINTEREST \nEDUCATION \nSKILLS AND ABILITIES \nPERSONAL SKILLS \nBachelor of Computer Science \n Oct 2017 present \n National Textile University Faisalabaad \nHigh School \n March 2012 - March 2016 \nArmy Public School & College GWA CANTT \nThe following are my best skills which I believe are \napplicable to the position I have applied for: \n1. Programming \n2. Logical and Structure

In [26]:
parser = StrOutputParser()

In [27]:
final_prompt = prompt.invoke({"job_description": job_description, "resume": retriever[1].page_content})

In [28]:
final_prompt

StringPromptValue(text='\nYou are a smart and precise Resume Analyzer.\n\nYou are given a **Job Description** and a candidate\'s **Resume** (in plain text format).\nYour task is to analyze how well the resume matches the job requirements and return a detailed, structured report.\n\n🔹 ONLY use the resume content for your answers.  \n🔹 If any information is missing from the resume, write "Not Found".\n\n---\n\n📄 **Job Description**:\n\n🧑\u200d💼 Job Title: Data Scientist\nWe are seeking a Data Scientist with a strong foundation in data analysis, machine learning, and statistical modeling. The ideal candidate will be responsible for extracting insights from complex datasets, building predictive models, and communicating findings that drive business decisions.\n\nKey Responsibilities\nAnalyze large datasets to discover trends, patterns, and actionable insights.\n\nBuild and deploy machine learning models for classification, regression, and clustering tasks.\n\nCollaborate with engineering t

In [29]:
main_chain =  prompt | llm | parser

In [30]:
res = main_chain.invoke({"job_description": job_description, "resume": retriever[1].page_content})
print(res)

**Match Score (0–100%):** 85%

**Education:**

* Bachelor of Software Engineering, Minor in Data Science, Web Development, HCI, National Textile University, Faisalabad, PK, 2021, 3.52
* F.Sc. Pre-Engineering, Minor in Mathematics, Army Public School & College, Gujranwala Cantt, 2016, 75%

**Experience:**

* Freelance, Upwork, San Francisco, CA, Feb 2024 - Present
* Machine Learning Engineer, TransData, Lahore, PK, October 2022 – March 2024
* Machine Learning Engineer, BLING, San Francisco, CA, September 2022 - March 2023
* Machine Learning Engineer, HAWKLOGIX, Lahore, PK, February 2022 - August 2022
* Machine Learning Engineer, Fiver, San Francisco, CA, November 2020 - January 2022

**Skills / Strengths:**

* Machine Learning: sci-kit-learn, TensorFlow, PyTorch
* Data Analysis: Pandas, NumPy, SciPy, matplotlib, seaborn, Plotly
* Deep Learning: TensorFlow, PyTorch
* Computer Vision: OpenCV
* Natural Language Processing (NLP): NLTK, spaCy, Gensim
* Web Development: Flask, Django
* Deskto

In [31]:
# answer = llm.invoke(final_prompt)
# print(answer.content)

In [32]:
from pydantic import BaseModel, Field
from typing import List, Optional


class Education(BaseModel):
    university_name: str = Field(..., description="Name of the university or institution")
    degree: str = Field(..., description="Degree earned by the candidate")
    gpa: Optional[str] = Field(None, description="GPA or grade, if available")


class Experience(BaseModel):
    company_name: str = Field(..., description="Name of the company or organization")
    n_years: Optional[str] = Field(None, description="Duration in years at this company")
    project_name: Optional[str] = Field(None, description="Name or title of a key project")
    project_description: Optional[str] = Field(None, description="Brief description of the project")
    tech_stack: List[str] = Field(..., description="Technologies and tools used in the project")


class ContactInfo(BaseModel):
    email: Optional[str] = Field(None, description="Email address")
    phone: Optional[str] = Field(None, description="Phone number")
    linkedin: Optional[str] = Field(None, description="LinkedIn profile URL")


class ResumeReport(BaseModel):
    name: str = Field(..., description="Full name of the candidate")
    # age: Optional[int] = Field(None, ge=0, description="Age of the candidate, if available")
    # native_languages: List[str] = Field(default_factory=list, description="List of native languages spoken by the candidate")
    # match_score: int = Field(..., ge=0, le=100, description="Match score between 0–100")
    # match_score: str = Field(..., description="Match score between 0–100")
    education: Optional[List[Education]] = Field(..., description="List of education details")
    experience: Optional[List[Experience]] = Field(..., description="List of professional experience")
    skills: Optional[List[str]] = Field(..., description="List of key skills and strengths")
    employment_gaps: Optional[str] = Field(None, description="Employment gaps if found")
    contact_info: ContactInfo = Field(..., description="Contact information")
    summary: str = Field(..., description="Short summary about the candidate's fit for the job")


In [33]:
from langchain_core.output_parsers import PydanticOutputParser

resume_parser = PydanticOutputParser(pydantic_object=ResumeReport)

In [34]:
from langchain.prompts import PromptTemplate

resume_prompt = PromptTemplate(
    template="""
You are a smart and precise Resume Analyzer.

You are given a **Job Description** and a candidate's **Resume** (plain text).
Your task is to analyze how well the resume matches the job requirements and return a structured report as a VALID JSON object.

ONLY use the resume content for your answers.
If any information is missing from the resume, write "Not Found".

---

Job Description:
{job_description}

Resume:
{resume}

---

Return ONLY a JSON object in the format below:
{format_instructions}
""",
    input_variables=["job_description", "resume"],
    partial_variables={"format_instructions": resume_parser.get_format_instructions()}
)


In [35]:
# from langchain.prompts import PromptTemplate

# resume_prompt = PromptTemplate(
#     template="""
# You are a smart and precise Resume Analyzer.

# You are given a **Job Description** and a candidate's **Resume** (plain text).
# Your task is to analyze how well the resume matches the job requirements and return a structured report as a VALID JSON object.

# 🔹 ONLY use the resume content for your answers.
# 🔹 If any information is missing from the resume, write "Not Found".

# You are also given two similarity metrics:
# - **Distance**: A float where lower means better match (e.g., 0.7 is good, 1.8 is weak).
# - **Similarity Score**: A float between 0 and 1, calculated as `1 / (1 + distance)`.

# 🎯 Use the following formula to compute the `match_score` (between 0–100):
# - match_score = round(((1 - min(distance, 2)) * 0.6 + similarity_score * 0.4) * 100, 2)

# Explanation:
# - Distance is capped at 2.0 to prevent outliers.
# - 60% weight is from distance, 40% from normalized similarity_score.
# - The result is always between 0 and 100.

# ---

# 📐 Similarity Metrics:
# - Distance: {distance}
# - Similarity Score: {similarity_score}

# 📄 Job Description:
# {job_description}

# 📃 Resume:
# {resume}

# ---

# Return ONLY a JSON object in the format below:
# {format_instructions}
# """,
#     input_variables=["job_description", "resume", "distance", "similarity_score"],
#     partial_variables={"format_instructions": resume_parser.get_format_instructions()}
# )


In [36]:
main_chain =  resume_prompt | llm | resume_parser

In [37]:
retriever[1].metadata

{'source': '/content/drive/MyDrive/CV/Muhammad Talha - ML Engineer.pdf',
 'score': np.float32(1.5776329),
 'score_percent': np.float32(54.64),
 'distance': np.float32(0.83004117)}

In [38]:
ret = retriever[1].page_content + f"\n\n distance: {str(retriever[1].metadata['distance'])}\n get score this formula similarity = 1 / (1 + distance)  # Normalized between 0 and 1 Score:{str(retriever[1].metadata['score'])}"
ret

'N/A \n \nN/A \nMuhammad Talha \nGujranwala, Punjab, Pakistan       \n muhammadtalha1818@gmail.com       \n+923444300394 \n \n https://www.linkedin.com/in/muhammad-talha-b643641b2/ \n https://github.com/Talha1818 \n \n \n  \nSUMMARY \nHighly experienced in the domains of machine learning, deep learning, and natural language processing (NLP), I am a \npassionate engineer and thriving analyst. With a strong aptitude for applying ML techniques and developing advanced \nalgorithms, I excel in solving complex real-world problems. My profound interest lies in research, Artificial Intelligence, and \nrobotics, as I continuously strive to explore innovative frontiers and contribute to advancements in these fields. \n \nEXPERIENCE \nFreelance | Upwork | San Francisco, CA | Feb 2024 - Present \n\uf0b7 \nA freelance specialist adept at delivering impactful solutions in machine learning and AI realms, specializing in NLP, \nLLMs, and generative AI technologies. From conceptualization to deployment

In [39]:
# res = main_chain.invoke({"job_description": job_description, "resume": retriever[0].page_content})

In [40]:
# res = main_chain.invoke({"job_description": job_description, "resume": retriever[1].page_content,
#                          "distance":retriever[1].metadata['distance'],
#                          "similarity_score":retriever[1].metadata['score_percent']/100})

In [41]:
res = main_chain.invoke({"job_description": job_description, "resume": retriever[1].page_content})

In [42]:
retriever[1].metadata['distance']

np.float32(0.83004117)

In [43]:
final_response = res.dict()
final_response

/tmp/ipython-input-43-3475066677.py:1: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  final_response = res.dict()


{'name': 'Muhammad Talha',
 'education': [{'university_name': 'National Textile University',
   'degree': 'Bachelor of Software Engineering',
   'gpa': '3.52'}],
 'experience': [{'company_name': 'TransData',
   'n_years': '1.5',
   'project_name': 'Document Classifier',
   'project_description': 'Automated classification of title closing documents using BERT model',
   'tech_stack': ['Python',
    'SQL',
    'AWS Cloud services',
    'TensorFlow',
    'BERT']},
  {'company_name': 'BLING',
   'n_years': '0.5',
   'project_name': 'Message Spam Detection and Sentiment Analysis',
   'project_description': 'Developed a GPT-powered chatbot for user experience improvement',
   'tech_stack': ['Python', 'SQL', 'AWS Cloud services', 'TensorFlow', 'GPT']},
  {'company_name': 'HAWKLOGIX',
   'n_years': '0.5',
   'project_name': 'TeleStroke Evaluation Project',
   'project_description': 'Developed an AI system to predict probability of blasts occurring within specific time periods',
   'tech_stack'

In [45]:
def get_metadata(doc):
  distance = round(float(doc.metadata['distance']),4)
  score = round(float(doc.metadata['score_percent']),4)
  source = doc.metadata['source']
  return {
      'distance': distance,
      'score': score,
      'source': source
  }

In [46]:
get_metadata(retriever[1])

{'distance': 0.83,
 'score': 54.64,
 'source': '/content/drive/MyDrive/CV/Muhammad Talha - ML Engineer.pdf'}

In [47]:
# retriever[1].metadata

In [48]:
final_response['metadata'] = get_metadata(retriever[1])
final_response

{'name': 'Muhammad Talha',
 'education': [{'university_name': 'National Textile University',
   'degree': 'Bachelor of Software Engineering',
   'gpa': '3.52'}],
 'experience': [{'company_name': 'TransData',
   'n_years': '1.5',
   'project_name': 'Document Classifier',
   'project_description': 'Automated classification of title closing documents using BERT model',
   'tech_stack': ['Python',
    'SQL',
    'AWS Cloud services',
    'TensorFlow',
    'BERT']},
  {'company_name': 'BLING',
   'n_years': '0.5',
   'project_name': 'Message Spam Detection and Sentiment Analysis',
   'project_description': 'Developed a GPT-powered chatbot for user experience improvement',
   'tech_stack': ['Python', 'SQL', 'AWS Cloud services', 'TensorFlow', 'GPT']},
  {'company_name': 'HAWKLOGIX',
   'n_years': '0.5',
   'project_name': 'TeleStroke Evaluation Project',
   'project_description': 'Developed an AI system to predict probability of blasts occurring within specific time periods',
   'tech_stack'

In [75]:
import re

def extract_float_from_text(text: str) -> float:
    """
    Extracts the first numeric value (integer or float) from a string.

    Args:
        text (str): Input string, e.g., '1 years' or '0.5 years'.

    Returns:
        float: Extracted number as float, or 0.0 if no number is found.
    """
    match = re.search(r"\d+(\.\d+)?", text)
    return float(match.group()) if match else 0.0

print(extract_float_from_text("1 years"))     # Output: 1.0
print(extract_float_from_text("0.5 years"))   # Output: 0.5
print(extract_float_from_text("10 years"))   # Output: 0.0


1.0
0.5
10.0


In [76]:
import re

def extract_years_from_text(text: str) -> float:
    """
    Extracts a numeric value (float/int) from a string and converts months to years if needed.

    Args:
        text (str): e.g., '6-12 months', '0.5 years', '1 year'

    Returns:
        float: Duration in years
    """
    match = re.search(r"\d+(\.\d+)?", text)
    if not match:
        return 0.0

    value = float(match.group())

    if 'month' in text.lower():
        return round(value / 12, 2)
    else:
        return round(value, 2)


In [77]:
print(extract_years_from_text("6-12 months"))   # Output: 0.5
print(extract_years_from_text("1 year"))        # Output: 1.0
print(extract_years_from_text("0.5 years"))     # Output: 0.5
print(extract_years_from_text("18 months"))     # Output: 1.5
print(extract_years_from_text("No duration"))   # Output: 0.0


0.5
1.0
0.5
1.5
0.0


In [59]:
total = 0
for exp in final_response['experience']:
  yrs = extract_float_from_text(exp['n_years'])
  total += yrs
print(total)

4.0


In [49]:
import pandas as pd

df = pd.DataFrame([final_response])
df

Unnamed: 0,name,education,experience,skills,employment_gaps,contact_info,summary,metadata
0,Muhammad Talha,[{'university_name': 'National Textile Univers...,"[{'company_name': 'TransData', 'n_years': '1.5...","[Machine Learning, Data Analysis, Deep Learnin...",,"{'email': 'muhammadtalha1818@gmail.com', 'phon...","Highly experienced in machine learning, deep l...","{'distance': 0.83, 'score': 54.64, 'source': '..."


In [78]:
RESULTS = []
def get_all_cv_results(retriever_docs):

  for retriever in retriever_docs:
    res = main_chain.invoke({"job_description": job_description, "resume": retriever.page_content}).model_dump()
    metadata = get_metadata(retriever)
    res['metadata'] = metadata
    res['label'] = label_score_by_distance(metadata['distance'])
    res['match_score'] = metadata['score']
    total = 0
    for exp in res['experience']:
      yrs = extract_years_from_text(exp['n_years'])
      total += yrs
    res['total_experience'] = total
    RESULTS.append(res)
  return RESULTS

In [79]:
RESPONSE = get_all_cv_results(retriever)

In [80]:
RESPONSE

[{'name': 'Muhammad Talha',
  'education': [{'university_name': 'National Textile University Faisalabaad',
    'degree': 'Bachelor of Computer Science',
    'gpa': 'Not Found'}],
  'experience': [{'company_name': 'Not Found',
    'n_years': '1 year',
    'project_name': 'JARVIS Voice Assistant AI',
    'project_description': 'Python project',
    'tech_stack': ['Python', 'Machine Learning']}],
  'skills': ['Programming',
   'Logical and Structured Thinking',
   'Coding in Multiple Languages (Python, Java, C++ etc)',
   'Ability to work by myself',
   'Ability to work harmoniously with other members',
   'Analytical and problem solving skills'],
  'employment_gaps': 'Not Found',
  'contact_info': {'email': 'muhammadtalha1818@gmail.com',
   'phone': '+923444300394',
   'linkedin': 'Not Found'},
  'summary': 'Not Found',
  'metadata': {'distance': 0.7845,
   'score': 56.04,
   'source': '/content/drive/MyDrive/CV/MuhammadTalhaCV1.pdf'},
  'label': 'Strong match',
  'match_score': 56.04,
 

In [81]:
len(RESPONSE)

2

In [82]:
df = pd.DataFrame(RESPONSE)
df

Unnamed: 0,name,education,experience,skills,employment_gaps,contact_info,summary,metadata,label,match_score,total_experience
0,Muhammad Talha,[{'university_name': 'National Textile Univers...,"[{'company_name': 'Not Found', 'n_years': '1 y...","[Programming, Logical and Structured Thinking,...",Not Found,"{'email': 'muhammadtalha1818@gmail.com', 'phon...",Not Found,"{'distance': 0.7845, 'score': 56.04, 'source':...",Strong match,56.04,1.0
1,Muhammad Talha,[{'university_name': 'National Textile Univers...,"[{'company_name': 'TransData', 'n_years': '1.5...","[Machine Learning, Data Analysis, Deep Learnin...",,"{'email': 'muhammadtalha1818@gmail.com', 'phon...","Highly experienced in machine learning, deep l...","{'distance': 0.83, 'score': 54.64, 'source': '...",Moderate match,54.64,4.0


In [83]:
distance = 0.81
similarity_score = 55.09 / 100  # normalize to 0–1 scale

score = round(((1 - min(distance, 2)) * 0.6 + similarity_score * 0.4) * 100, 2)
print(score)


33.44
