Parsing demo

Using the `backend/app/services/resume-parser.py` we can parse a resume to extract everything that would sugest a profficiency in a specific skill needed to become a data scientist.

In [1]:
import sys, os
from pathlib import Path

In [2]:
# project root = parent of "notebooks"
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.insert(0, project_root)

os.environ["GROQ_API_KEY"] = "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"

In [3]:
from backend.app.services.resume_parser import parse_resume_pdf
from backend.app.services.resume_skill_eval import evaluate_resume_skills

  from pydantic.v1.fields import FieldInfo as FieldInfoV1
  from .autonotebook import tqdm as notebook_tqdm


In [4]:
resume_dir = Path(project_root)  / "demo" / "resume_parsing"

In [5]:
resume_xml = parse_resume_pdf(resume_dir / "Resume_ASDD_CSxCU.pdf")

In [6]:
print(resume_xml)

<resume>
  <skills>
    <category name="Programming">
      <skill>Python</skill>
      <skill>Pandas</skill>
      <skill>NumPy</skill>
      <skill>SciPy</skill>
      <skill>scikit-learn</skill>
      <skill>statsmodels</skill>
      <skill>Polars</skill>
      <skill>sktime</skill>
      <skill>tslearn</skill>
      <skill>SQL</skill>
      <skill>MATLAB</skill>
      <skill>Git</skill>
    </category>
    <category name="Data/Time Series">
      <skill>wrangling &amp; feature engineering</skill>
      <skill>time-series (ARIMA, DTW)</skill>
      <skill>validation &amp; backtesting</skill>
      <skill>probability &amp; statistics</skill>
    </category>
    <category name="Analytics/MLOps">
      <skill>Excel (Power Query/Power Pivot)</skill>
      <skill>MLflow</skill>
      <skill>Azure ML</skill>
      <skill>LaTeX</skill>
      <skill>SharePoint/Power Automate/Apps</skill>
    </category>
    <category name="Coursework">
      <skill>Data Analytics</skill>
      <skill>Probab

In [7]:
evaluate_resume_skills(resume_xml=resume_xml)

{'skills': [{'name': 'Python', 'level': 3},
  {'name': 'Pandas', 'level': 3},
  {'name': 'NumPy', 'level': 3},
  {'name': 'SciPy', 'level': 2},
  {'name': 'scikit-learn', 'level': 3},
  {'name': 'statsmodels', 'level': 2},
  {'name': 'Polars', 'level': 2},
  {'name': 'sktime', 'level': 2},
  {'name': 'tslearn', 'level': 2},
  {'name': 'SQL', 'level': 3},
  {'name': 'Matlab', 'level': 1},
  {'name': 'Git', 'level': 2},
  {'name': 'Time Series Analysis', 'level': 3},
  {'name': 'ARIMA', 'level': 2},
  {'name': 'DTW', 'level': 2},
  {'name': 'Validation & Backtesting', 'level': 2},
  {'name': 'Probability & Statistics', 'level': 3},
  {'name': 'Excel (Power Query/Power Pivot)', 'level': 1},
  {'name': 'MLflow', 'level': 2},
  {'name': 'Azure ML', 'level': 2},
  {'name': 'LaTeX', 'level': 1},
  {'name': 'SharePoint/Power Automate/Apps', 'level': 1},
  {'name': 'Data Wrangling & Feature Engineering', 'level': 3},
  {'name': 'Clustering', 'level': 2},
  {'name': 'Machine Learning', 'level': 

## Job Description parsing

In [8]:
from backend.app.services.job_skill_eval import evaluate_job_skills

In [9]:
with open(Path(project_root)  / "demo" / "CVS_job_offer.txt", "r") as f:
    job_description = f.read()
    job_eval = evaluate_job_skills(job_description_text=job_description)

print(job_eval)

<jobSkills>
  <skill><name>Python</name><level>3</level></skill>
  <skill><name>FastAPI</name><level>3</level></skill>
  <skill><name>GenAI</name><level>3</level></skill>
  <skill><name>RAG</name><level>3</level></skill>
  <skill><name>LangChain</name><level>2</level></skill>
  <skill><name>LangGraph</name><level>2</level></skill>
  <skill><name>Cloud</name><level>2</level></skill>
  <skill><name>GCP</name><level>2</level></skill>
  <skill><name>Azure</name><level>2</level></skill>
  <skill><name>AWS</name><level>2</level></skill>
  <skill><name>MLOps</name><level>2</level></skill>
  <skill><name>Jenkins</name><level>1</level></skill>
  <skill><name>Docker</name><level>1</level></skill>
  <skill><name>Front End</name><level>1</level></skill>
  <skill><name>Ad-tech</name><level>1</level></skill>
  <skill><name>Backend</name><level>1</level></skill>
  <skill><name>SQL</name><level>1</level></skill>
  <skill><name>Collaboration</name><level>0</level></skill>
  <skill><name>Communication</

## Combine both:

In [10]:
from backend.app.services.job_skill_eval import job_skills_to_dict

In [11]:
job_eval_dict = job_skills_to_dict(job_eval)
job_eval_dict

{'skills': [{'name': 'Python', 'level': 3},
  {'name': 'FastAPI', 'level': 3},
  {'name': 'GenAI', 'level': 3},
  {'name': 'RAG', 'level': 3},
  {'name': 'LangChain', 'level': 2},
  {'name': 'LangGraph', 'level': 2},
  {'name': 'Cloud', 'level': 2},
  {'name': 'GCP', 'level': 2},
  {'name': 'Azure', 'level': 2},
  {'name': 'AWS', 'level': 2},
  {'name': 'MLOps', 'level': 2},
  {'name': 'Jenkins', 'level': 1},
  {'name': 'Docker', 'level': 1},
  {'name': 'Front End', 'level': 1},
  {'name': 'Ad-tech', 'level': 1},
  {'name': 'Backend', 'level': 1},
  {'name': 'SQL', 'level': 1},
  {'name': 'Collaboration', 'level': 0},
  {'name': 'Communication', 'level': 0},
  {'name': 'Coding Standards', 'level': 0},
  {'name': 'Distributed Computing', 'level': 0},
  {'name': 'Agile Team', 'level': 0},
  {'name': 'Agile Workflow', 'level': 0}]}

In [12]:
job_xml = evaluate_job_skills(job_description)
job_targets = job_skills_to_dict(job_xml)["skills"]
resume_scores = evaluate_resume_skills(resume_xml, job_skill_targets=job_targets)
resume_scores

{'skills': [{'name': 'Python', 'level': 3},
  {'name': 'FastAPI', 'level': 0},
  {'name': 'GenAI', 'level': 0},
  {'name': 'RAG', 'level': 0},
  {'name': 'LangChain', 'level': 0},
  {'name': 'LangGraph', 'level': 0},
  {'name': 'Cloud', 'level': 0},
  {'name': 'GCP', 'level': 0},
  {'name': 'Azure', 'level': 0},
  {'name': 'AWS', 'level': 1},
  {'name': 'MLOps', 'level': 0},
  {'name': 'Jenkins', 'level': 0},
  {'name': 'Docker', 'level': 0},
  {'name': 'Front End', 'level': 0},
  {'name': 'Ad-tech', 'level': 0},
  {'name': 'Backend', 'level': 0},
  {'name': 'SQL', 'level': 1},
  {'name': 'Collaboration', 'level': 0},
  {'name': 'Communication', 'level': 0},
  {'name': 'Coding Standards', 'level': 0},
  {'name': 'Distributed Computing', 'level': 0},
  {'name': 'Agile Team', 'level': 0},
  {'name': 'Agile Workflow', 'level': 0}]}

In [13]:
import pandas as pd

# Extract skills from both dictionaries
job_skills = {skill['name']: skill['level'] for skill in job_eval_dict['skills']}
resume_skills = {skill['name']: skill['level'] for skill in resume_scores['skills']}

# Find skills missing or needing improvement
skill_gaps = []
for job_skill, job_level in job_skills.items():
    resume_level = resume_skills.get(job_skill, 0)
    skill_gaps.append({
        'Skill': job_skill,
        'Required Level': job_level,
        'Current Level': resume_level,
        'Gap': job_level - resume_level
    })

# Create DataFrame and sort by gap size
gaps_df = pd.DataFrame(skill_gaps).sort_values('Gap', ascending=False)
print(gaps_df)

                    Skill  Required Level  Current Level  Gap
2                   GenAI               3              0    3
3                     RAG               3              0    3
1                 FastAPI               3              0    3
4               LangChain               2              0    2
5               LangGraph               2              0    2
6                   Cloud               2              0    2
7                     GCP               2              0    2
8                   Azure               2              0    2
10                  MLOps               2              0    2
11                Jenkins               1              0    1
13              Front End               1              0    1
15                Backend               1              0    1
14                Ad-tech               1              0    1
9                     AWS               2              1    1
12                 Docker               1              0    1
19      

In [14]:
from backend.app.services.skill_gap_eval import generate_skill_gap_xml

job_xml = evaluate_job_skills(job_description)
job_skills = job_skills_to_dict(job_xml)["skills"]

resume_result = evaluate_resume_skills(resume_xml, job_skill_targets=job_skills)
resume_skills = resume_result["skills"]

gap_xml = generate_skill_gap_xml(job_skills=job_skills, resume_skills=resume_skills)

In [15]:
print(gap_xml)

<skillGaps>
  <skill>
    <name>python</name>
    <requiredLevel>3</requiredLevel>
    <currentLevel>3</currentLevel>
    <gap>0</gap>
  </skill>
  <skill>
    <name>fastapi</name>
    <requiredLevel>3</requiredLevel>
    <currentLevel>0</currentLevel>
    <gap>3</gap>
  </skill>
  <skill>
    <name>genai</name>
    <requiredLevel>3</requiredLevel>
    <currentLevel>0</currentLevel>
    <gap>3</gap>
  </skill>
  <skill>
    <name>rag</name>
    <requiredLevel>3</requiredLevel>
    <currentLevel>0</currentLevel>
    <gap>3</gap>
  </skill>
  <skill>
    <name>langchain</name>
    <requiredLevel>2</requiredLevel>
    <currentLevel>0</currentLevel>
    <gap>2</gap>
  </skill>
  <skill>
    <name>langgraph</name>
    <requiredLevel>2</requiredLevel>
    <currentLevel>0</currentLevel>
    <gap>2</gap>
  </skill>
  <skill>
    <name>cloud</name>
    <requiredLevel>2</requiredLevel>
    <currentLevel>0</currentLevel>
    <gap>2</gap>
  </skill>
  <skill>
    <name>gcp</name>
    <requiredLeve