# Select the correct environment and import necessary modules

Select the correct kernel.

I recommend creating a new environment. Then to select it :
- Select Kernel
    - Select another kernel
        - Jupyter kernel
            - Smart Resume an Job matcher (or any kernel name)

In [1]:
import sys

In [6]:
# Show which Python interpreter this kernel runs on, to confirm the project environment is selected
print(sys.executable)
# Should see something like : ...\Smart_Resume_to_Job_Matcher\.venv\Scripts\python.exe

c:\Users\Sebastien\Desktop\LLM and GenAI\Smart_Resume_to_Job_Matcher\.venv\Scripts\python.exe


In [12]:
import importlib# To reload modified python files for agents
import sys# To find agents python files
from pathlib import Path

# Pull models and embeddings

In a first terminal window, launch the ollama app:
~~~
ollama serve
~~~

Then pull the model and embedding from another terminal window:
~~~
ollama pull llama3
ollama pull nomic-embed-text
ollama pull mxbai-embed-large
~~~

To delete models you no longer need, list them and remove them by name:
~~~
ollama list
ollama rm <model-name>
~~~

# Resume and Job description loader  **DONE**

~~~
project/
├── data/
│ ├── resumes/
| | └── resumes.csv # already partially preprocessed
│ └── jobs/
| | └── job_postings.csv # filtered with only the real jobs applications
~~~

In this part, we need to transform each resume and job description into one single text.

In [13]:
# Make the project root importable so that project packages (e.g. `ingestion`)
# can be imported from this notebook.
project_root = Path("..").resolve()  # notebooks/ is one level down
# Guard against duplicates: re-running this cell must not keep growing sys.path
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [9]:
# This code is to reload the modified python files for the agent
import ingestion.preprocess
# Reload the file to take into account the changes
importlib.reload(ingestion.preprocess)

<module 'ingestion.preprocess' from 'C:\\Users\\Sebastien\\Desktop\\LLM and GenAI\\Smart_Resume_to_Job_Matcher\\ingestion\\preprocess.py'>

In [6]:
from ingestion.preprocess import clean_text, resumes_to_raw_text, jobs_to_raw_text
# resumes_to_raw_text and jobs_to_raw_text only read the CSV file and turn each row into a JSON-like record

In [7]:
# Build the two in-memory datasets from the CSV files.
# Each result is a list of dicts (not files): later cells read an id
# ('resume_id' / 'job_id') and the formatted 'text' from every element.
resumes = resumes_to_raw_text("../data/resumes/resumes.csv")
jobs = jobs_to_raw_text("../data/jobs/job_postings.csv")

In [60]:
# Report the dataset sizes
print(f"Loaded {len(resumes)} resumes")
print(f"Loaded {len(jobs)} job descriptions")

# Show one arbitrary record of each kind as a sanity check on the format
print(f"Sample resume: {resumes[482]}")
print(f"Sample job: {jobs[15489]}")

Loaded 13389 resumes
Loaded 17014 job descriptions
Sample resume: {'resume_id': 483, 'text': 'Category: Aviation. Resume text: qualifications stacy sample 702 8000000 cell 0000emailcom flight ratings faa atpairline transport pilot cfirotorcrafthelicopter s70 s92 typeratings medical flight time total 6867 pilot command 5673 night 1342 pilot 1194 night vision goggle 528 helicopter multiengine 4660 offshoreoverwater 574 faa class 1 cross country 3847 external load 263 instrument ifrhood 832 ems 541 actual weather 369 mountain 2683 experience sikorsky s76 s92 test pilot jul 2007present bell 430 corporate ifr captain jan 2006jun 2007 sikorsky s76 ems captain oct 2003dec 2005 sikorsky aircraft corporation coatesville pa s76 592 production test pilotmaintenance test pilot engineering test flight civil foreign military flight instructor customer demonstrations international ifr vfr overwater experience elite aviationlos angeles ca single pilot bell 430 corporate ifr captainoffairport landing l

# Normalize text **DONE**

In [111]:
import agents.normalization_agent
# Reload the file to take into account the changes
importlib.reload(agents.normalization_agent)

<module 'agents.normalization_agent' from 'C:\\Users\\Sebastien\\Desktop\\LLM and GenAI\\Smart_Resume_to_Job_Matcher\\agents\\normalization_agent.py'>

In [None]:
from agents.normalization_agent import normalize_resume, normalize_job

### Example

In [24]:
# Worked example: take the raw text of the first resume and of the first job posting
resume_text = resumes[0]["text"]
job_text = jobs[0]["text"]
print(resume_text, job_text, sep="\n")

Category: Accountant. Resume text: education omba executive leadership university texas 20162018 bachelor science accounting richland college 20052008 training certifications certified management accountant cma certified financial modeling valuation analyst compliance antimoney laundering 092016 american institute banking certified public account cpa lean six sigma green belt certified trade products financial regulations 082016 american institute banking achievements speaker bringing leader within 082019 successfully presented empowering speech leadership 500 participants speaker dallas convention cpas 032019 successfully delivered seminar 3k cpas convention guests teaching experience online teacher udemy 2017 taught online accounting nonaccountant course udemy similar online teaching platforms developed effective teaching modules materials curriculum target students took feedbacks students assist improving teaching methodology materials professional memberships affiliations american 

In [25]:
# The agent initializes the llama model itself, so calling its functions is all we need
normalized_resume = normalize_resume(resume_text)
normalized_job = normalize_job(job_text)

# Display both normalized records, resume first
for normalized_example in (normalized_resume, normalized_job):
    print(normalized_example)

{'skills': ['Microsoft Office', 'QuickBooks', 'ERP', 'SAP', 'Oracle', 'Hyperion', 'SQL'], 'experience': ['Accountant – Omba Executive Leadership (2016-2018): Managed financial regulations and compliance.', 'Speaker, Dallas Convention CPAs (2019): Successfully presented empowering speech to 500 participants.'], 'education': ['Bachelor of Science in Accounting, Richland College (2005-2008)', 'Executive Leadership, University of Texas (2016-2018)'], 'certifications': ['CMA', 'CFMVA', 'CPA', 'Lean Six Sigma Green Belt'], 'industries': ['Financial Regulations']}
{'job_title': 'Marketing Intern', 'required_skills': ['Content management systems', 'Food52 editorial voice and aesthetic', 'Social media (Twitter, Facebook, Pinterest)', 'Problem-solving', 'Collaboration'], 'required_experience': 'Internship', 'required_education': 'Not specified', 'industry': 'Media'}


/!\ INFO /!\
 - RESUMES

Preprocess gives in a text the whole resume text

For normalized resumes, should give the infos :
{'skills': [], 'experience': [], 'education': [], 'certifications': [], 'industries': []}


 - JOB DESCRIPTIONS

Preprocess gives in a text : title, location, description, requirements, employment_type, required_experience, required_education, industry, function

For normalized job, should give the following infos : 
{'job_title': '...', 'required_skills': [], 'required_experience': '', 'required_education': '', 'industry': ''}

### Complete normalization

Now that we have an example with one resume and one job, we need to do it for all resumes and jobs.

In this step, I decided to put normalized resumes and jobs into two distinct lists, each containing the normalized representation of resumes and job descriptions. The objects will look like this :
```python 
normalized_resumes = [
    {'resume_id': 1, 
    'norm_text': {'skills': [...], 'experience': [...], 'education': [...], 'certifications': [...], 'industries': [...]}
    },
    ...
]

normalized_jobs = [
    {'job_id': 1, 
    'job_description': {'job_title': '...', 'required_skills': [...], 'required_experience': '...', 'required_education': '...', 'industry': '...'}
    },
]
```

However, since there are many resumes and job descriptions, the computation takes a long time. In case it crashes midway, a checkpoint file saves the progress at regular intervals (every 10 normalized resumes, every 100 normalized job descriptions).

/!\ 
**Minor issue**: the skills do not seem to be normalized. I saw "MS Office" but also "Microsoft Excel", "Microsoft Word" and "Microsoft Office", even though I explicitly asked for normalization in the prompt given to the LLM.

-> Issue solved with a "forced normalization".

#### Resumes (13389 elements) **DONE**

In [15]:
import os
import json
from tqdm import tqdm
from pathlib import Path

In [16]:
# Normalization is a very long computation, so partial results are kept on disk
# and picked up again here if a previous run left some behind.
checkpoint_dir = Path("checkpoints")
checkpoint_dir.mkdir(exist_ok=True)

# JSON file holding the resumes normalized so far
resume_checkpoint = checkpoint_dir / "normalized_resumes.json"

# Start empty, then overwrite with the checkpoint content when one is available
normalized_resumes = []
processed_ids = set()
if resume_checkpoint.exists():
    print("Checkpoint found")
    with open(resume_checkpoint, "r") as f:
        normalized_resumes = json.load(f)
    processed_ids = set(entry['resume_id'] for entry in normalized_resumes)
    print(len(processed_ids), "resumes")

Checkpoint found
12020 resumes


In [17]:
def _save_resume_checkpoint():
    """Write `normalized_resumes` to `resume_checkpoint` without clobbering it mid-write.

    The JSON is first written to a temporary sibling file which then replaces
    the checkpoint, so the previous checkpoint stays intact until the new one
    has been written completely (an interruption during a 2-hour run would
    otherwise leave a truncated, unreadable JSON file).
    """
    tmp_checkpoint = resume_checkpoint.with_name(resume_checkpoint.name + ".tmp")
    with open(tmp_checkpoint, "w") as f:
        json.dump(normalized_resumes, f, indent=2)
    tmp_checkpoint.replace(resume_checkpoint)


# Then we loop through resumes, adding a progress bar
for resume in tqdm(resumes, desc="Normalizing resumes"):
    resume_id = resume['resume_id']
    # Skip resumes already processed (restored from the checkpoint)
    if resume_id in processed_ids:
        continue
    resume_text = resume['text']
    # We normalize the text
    norm_resume_text = normalize_resume(resume_text)# --> None if normalization failed after 3 attempts <--
    # Failed resumes are stored too (with 'norm_text' set to None) so that they
    # are not retried on restart; they are filtered out in a later cell.
    normalized_resume = {
        'resume_id': resume_id,
        'norm_text': norm_resume_text
    }
    normalized_resumes.append(normalized_resume)
    processed_ids.add(resume_id)
    # We save a checkpoint (every 10 resumes)
    if len(normalized_resumes) % 10 == 0:
        _save_resume_checkpoint()

# Final save
_save_resume_checkpoint()

Normalizing resumes:  90%|████████▉ | 12030/13389 [00:45<00:10, 132.53it/s] 

⚠ Attempt 1 failed for resume. Retrying...


Normalizing resumes:  91%|█████████ | 12120/13389 [08:13<1:48:26,  5.13s/it]

⚠ Attempt 1 failed for resume. Retrying...


Normalizing resumes:  91%|█████████ | 12209/13389 [16:02<1:51:22,  5.66s/it]

⚠ Attempt 1 failed for resume. Retrying...
⚠ Attempt 2 failed for resume. Retrying...
⚠ Attempt 3 failed for resume. Retrying...
❌ Failed to normalize resume after 3 attempts.


Normalizing resumes:  91%|█████████▏| 12224/13389 [17:45<2:03:31,  6.36s/it]

⚠ Attempt 1 failed for resume. Retrying...
⚠ Attempt 2 failed for resume. Retrying...


Normalizing resumes:  91%|█████████▏| 12225/13389 [18:11<3:56:29, 12.19s/it]

⚠ Attempt 3 failed for resume. Retrying...
❌ Failed to normalize resume after 3 attempts.


Normalizing resumes:  92%|█████████▏| 12332/13389 [28:03<1:39:43,  5.66s/it]

⚠ Attempt 1 failed for resume. Retrying...


Normalizing resumes:  93%|█████████▎| 12439/13389 [37:07<1:06:59,  4.23s/it]

⚠ Attempt 1 failed for resume. Retrying...


Normalizing resumes:  94%|█████████▍| 12629/13389 [51:30<1:07:53,  5.36s/it]

⚠ Attempt 1 failed for resume. Retrying...


Normalizing resumes:  95%|█████████▍| 12669/13389 [55:35<1:10:07,  5.84s/it]

⚠ Attempt 1 failed for resume. Retrying...


Normalizing resumes:  95%|█████████▌| 12739/13389 [1:02:12<44:32,  4.11s/it]  

⚠ Attempt 1 failed for resume. Retrying...
⚠ Attempt 2 failed for resume. Retrying...
⚠ Attempt 3 failed for resume. Retrying...
❌ Failed to normalize resume after 3 attempts.


Normalizing resumes:  95%|█████████▌| 12752/13389 [1:03:46<56:10,  5.29s/it]  

⚠ Attempt 1 failed for resume. Retrying...


Normalizing resumes:  95%|█████████▌| 12777/13389 [1:06:15<1:03:54,  6.27s/it]

⚠ Attempt 1 failed for resume. Retrying...
⚠ Attempt 2 failed for resume. Retrying...


Normalizing resumes:  95%|█████████▌| 12778/13389 [1:06:40<2:02:04, 11.99s/it]

⚠ Attempt 3 failed for resume. Retrying...
❌ Failed to normalize resume after 3 attempts.


Normalizing resumes:  96%|█████████▌| 12857/13389 [1:14:01<52:00,  5.87s/it]  

⚠ Attempt 1 failed for resume. Retrying...
⚠ Attempt 2 failed for resume. Retrying...


Normalizing resumes:  96%|█████████▌| 12858/13389 [1:14:27<1:45:44, 11.95s/it]

⚠ Attempt 3 failed for resume. Retrying...
❌ Failed to normalize resume after 3 attempts.


Normalizing resumes:  96%|█████████▌| 12877/13389 [1:16:08<47:40,  5.59s/it]  

⚠ Attempt 1 failed for resume. Retrying...


Normalizing resumes:  97%|█████████▋| 12975/13389 [1:24:54<33:55,  4.92s/it]  

⚠ Attempt 1 failed for resume. Retrying...
⚠ Attempt 2 failed for resume. Retrying...


Normalizing resumes:  97%|█████████▋| 12976/13389 [1:25:20<1:16:13, 11.08s/it]

⚠ Attempt 3 failed for resume. Retrying...
❌ Failed to normalize resume after 3 attempts.


Normalizing resumes:  98%|█████████▊| 13077/13389 [1:34:08<24:01,  4.62s/it]  

⚠ Attempt 1 failed for resume. Retrying...
⚠ Attempt 2 failed for resume. Retrying...


Normalizing resumes:  98%|█████████▊| 13078/13389 [1:34:32<54:52, 10.59s/it]

⚠ Attempt 3 failed for resume. Retrying...
❌ Failed to normalize resume after 3 attempts.


Normalizing resumes:  98%|█████████▊| 13116/13389 [1:37:23<19:21,  4.25s/it]

⚠ Attempt 1 failed for resume. Retrying...
⚠ Attempt 2 failed for resume. Retrying...


Normalizing resumes:  98%|█████████▊| 13117/13389 [1:37:37<31:42,  7.00s/it]

⚠ Attempt 3 failed for resume. Retrying...
❌ Failed to normalize resume after 3 attempts.
⚠ Attempt 1 failed for resume. Retrying...
⚠ Attempt 2 failed for resume. Retrying...


Normalizing resumes:  98%|█████████▊| 13118/13389 [1:38:02<56:53, 12.60s/it]

⚠ Attempt 3 failed for resume. Retrying...
❌ Failed to normalize resume after 3 attempts.


Normalizing resumes:  98%|█████████▊| 13136/13389 [1:39:19<18:13,  4.32s/it]

⚠ Attempt 1 failed for resume. Retrying...


Normalizing resumes:  98%|█████████▊| 13147/13389 [1:40:11<14:10,  3.51s/it]

⚠ Attempt 1 failed for resume. Retrying...
⚠ Attempt 2 failed for resume. Retrying...


Normalizing resumes:  98%|█████████▊| 13148/13389 [1:40:35<39:11,  9.76s/it]

⚠ Attempt 3 failed for resume. Retrying...
❌ Failed to normalize resume after 3 attempts.


Normalizing resumes:  98%|█████████▊| 13172/13389 [1:42:23<15:06,  4.18s/it]

⚠ Attempt 1 failed for resume. Retrying...


Normalizing resumes:  99%|█████████▉| 13280/13389 [1:51:06<07:46,  4.28s/it]

⚠ Attempt 1 failed for resume. Retrying...
⚠ Attempt 2 failed for resume. Retrying...


Normalizing resumes:  99%|█████████▉| 13281/13389 [1:51:30<18:25, 10.23s/it]

⚠ Attempt 3 failed for resume. Retrying...
❌ Failed to normalize resume after 3 attempts.


Normalizing resumes:  99%|█████████▉| 13313/13389 [1:54:10<06:10,  4.87s/it]

⚠ Attempt 1 failed for resume. Retrying...
⚠ Attempt 2 failed for resume. Retrying...


Normalizing resumes:  99%|█████████▉| 13314/13389 [1:54:36<13:41, 10.95s/it]

⚠ Attempt 3 failed for resume. Retrying...
❌ Failed to normalize resume after 3 attempts.


Normalizing resumes: 100%|█████████▉| 13354/13389 [1:57:39<02:16,  3.91s/it]

⚠ Attempt 1 failed for resume. Retrying...


Normalizing resumes: 100%|█████████▉| 13360/13389 [1:58:21<02:54,  6.01s/it]

⚠ Attempt 1 failed for resume. Retrying...


Normalizing resumes: 100%|██████████| 13389/13389 [2:00:52<00:00,  1.85it/s]


The resumes that failed normalization are still written to the JSON file (with a `None` value), so they have to be filtered out.

In [18]:
# Re-read the checkpoint produced by the normalization loop
with open(resume_checkpoint, "r") as f:
    normalized_resumes = json.load(f)

# Keep only the entries whose 'norm_text' is not None (None marks a failed normalization)
correctly_normalized_resumes = []
for entry in normalized_resumes:
    if entry['norm_text'] is not None:
        correctly_normalized_resumes.append(entry)

# Report how many resumes were normalized, as a count and as a percentage
n_normalized = len(correctly_normalized_resumes)
n_total = len(normalized_resumes)
print("Number of resumes normalized :", n_normalized, "/", n_total)
print("Equivalent in %:", 100*n_normalized/n_total, "%")

Number of resumes normalized : 13314 / 13389
Equivalent in %: 99.43983867353798 %


 - Number of resumes normalized : 13314 / 13389
 - Equivalent in %: 99.43983867353798 %

In [19]:
# Overwrite the checkpoint so that it only keeps the successfully normalized resumes
with open(resume_checkpoint, "w") as checkpoint_file:
    checkpoint_file.write(json.dumps(correctly_normalized_resumes, indent=2))

#### Job descriptions (17014 elements) **DONE**

In [37]:
# Same checkpoint mechanism as for the resumes (very long computation)
# JSON file holding the job descriptions normalized so far
job_checkpoint = checkpoint_dir / "normalized_jobs.json"

# Start empty, then overwrite with the checkpoint content when one is available
normalized_jobs = []
processed_ids = set()
if job_checkpoint.exists():
    print("Checkpoint found")
    with open(job_checkpoint, "r") as f:
        normalized_jobs = json.load(f)
    processed_ids = set(entry['job_id'] for entry in normalized_jobs)
    print(len(processed_ids), "job postings")

After some tests, the whole extraction is expected to last about : 6h33min, mostly due to normalization through LLM calls.

In [58]:
def _save_job_checkpoint():
    """Write `normalized_jobs` to `job_checkpoint` without clobbering it mid-write.

    The JSON is first written to a temporary sibling file which then replaces
    the checkpoint, so the previous checkpoint stays intact until the new one
    has been written completely (an interruption during this multi-hour run
    would otherwise leave a truncated, unreadable JSON file).
    """
    tmp_checkpoint = job_checkpoint.with_name(job_checkpoint.name + ".tmp")
    with open(tmp_checkpoint, "w") as f:
        json.dump(normalized_jobs, f, indent=2)
    tmp_checkpoint.replace(job_checkpoint)


for job in tqdm(jobs, desc="Normalizing job descriptions"):
    job_id = job['job_id']
    # Skip job descriptions already processed
    if job_id in processed_ids:
        continue
    job_text = job['text']
    # We normalize the text
    norm_job_des = normalize_job(job_text)# --> None if normalization failed after 3 attempts <--
    # Whatever the outcome, this job must not be attempted again during this run
    processed_ids.add(job_id)
    # Unlike resumes, failed job descriptions are not stored at all
    if norm_job_des is None:
        continue
    # We join everything into a dictionary for clarity
    normalized_job = {
        'job_id': job_id,
        'job_description': norm_job_des
    }
    normalized_jobs.append(normalized_job)
    # We save a checkpoint (every 100 job descriptions)
    if len(normalized_jobs) % 100 == 0:
        _save_job_checkpoint()

# Final save
_save_job_checkpoint()

Normalizing job descriptions: 100%|██████████| 17014/17014 [3:41:08<00:00,  1.28it/s]  


In [59]:
# Load normalized jobs
with open(job_checkpoint, "r") as f:
    normalized_jobs = json.load(f)
# Failed job descriptions were never stored, so the success rate is measured
# against the number of raw job postings
print("Number of job descriptions normalized :", len(normalized_jobs), "/", len(jobs))
print("Equivalent in %:", 100*len(normalized_jobs)/len(jobs), "%")

Number of resumes normalized : 17013 / 17014
Equivalent in %: 99.99412248736334 %


 - Number of job descriptions normalized : 17013 / 17014
 - Equivalent in %: 99.99 %

#### Fixing resume normalization **DONE**

Some of the answers from the LLM are not formatted correctly.

The experience field is sometimes a list of dictionaries (which breaks the later steps).
- Ex : [{'Title': '...', 'Company': '...', 'Years': '...', ...}, {'Title': '...', 'Company': '...',...}]

It has to be a list of strings.
- Ex : ['Accountant – Omba Executive Leadership (2016-2018): ...', 'Accountant – Google (2015): ...']

And we have the same problem with the education field.

In [394]:
# Read the normalized resumes back from the checkpoint file
normalized_resumes = json.loads(resume_checkpoint.read_text())

First, we need to verify all the resumes that were not normalized correctly, and why.

In [None]:
def validate_resume_record(norm_text:dict):
    """
    Check that a normalized resume has the expected field types.

    Args:
        norm_text: normalized resume, i.e. a dictionary that may contain the keys
            skills, experience, education, certifications and industries.

    Returns:
        True when every one of those keys that is present holds a list whose
        elements are all strings (absent keys are not checked).
        False when a present key holds anything else, or when norm_text is not
        a dictionary at all (e.g. None, which is what a failed normalization
        produces).
    """
    # A failed normalization yields None: report it as invalid instead of crashing
    if not isinstance(norm_text, dict):
        return False

    # Fields that must be lists of strings
    list_fields = (
        "skills",
        "experience",
        "education",
        "certifications",
        "industries",
    )

    for field in list_fields:
        if field not in norm_text:
            continue
        value = norm_text[field]
        if not isinstance(value, list):
            return False
        if not all(isinstance(ele, str) for ele in value):
            return False
    return True

# Count the resumes whose normalized record does not pass validate_resume_record
count = sum(
    1
    for entry in normalized_resumes
    if not validate_resume_record(entry['norm_text'])
)
print(count, "ERRORS")

5228 ERRORS


Now, we try detailing the errors for each field.

In [405]:
def detailed_error(norm_text:dict, field:str):
    """
    Classify how one field of a normalized resume deviates from the expected
    "list of strings" format, so that occurrences of each problem can be counted.

    Args:
        norm_text: normalized resume (dictionary).
        field: name of the field to inspect, e.g. "experience".

    Returns:
        One of the following labels:
        - "no field": the field is missing or empty (None, [], "", ...)
        - "no error": the field is a list containing only strings
        - "dictionnaries in list": the list contains at least one dictionary
        - "lists in list": the list contains at least one nested list
        - "other": anything else, including a value that is not a list
    """
    record = norm_text.get(field)

    if not record:
        return "no field"
    # A bare string or dict would otherwise be iterated item by item (characters
    # or keys, i.e. strings) and wrongly reported as "no error"
    if not isinstance(record, list):
        return "other"
    if all(isinstance(item, str) for item in record):
        return "no error"
    if any(isinstance(item, dict) for item in record):
        return "dictionnaries in list"
    if any(isinstance(item, list) for item in record):
        return "lists in list"
    return "other"

Let's see the errors in each field.

**experience**

In [412]:
# One counter per label returned by detailed_error
experience_errors = dict.fromkeys(
    ("no field", "no error", "dictionnaries in list", "lists in list", "other"),
    0,
)

# Tally the label of the experience field of every resume
for entry in normalized_resumes:
    label = detailed_error(entry.get('norm_text', {}), 'experience')
    experience_errors[label] += 1

# Pretty print: one headline per label, followed by "<count> / <total>"
total_resumes = len(normalized_resumes)
print(" --- SUMMARY --- ")
for headline, label in (
    ("Well normalized experience field (list with strings):", "no error"),
    ("No experience field :", "no field"),
    ("Badly normalized experience field (list with dictionnaries):", "dictionnaries in list"),
    ("Badly normalized experience field (list of lists):", "lists in list"),
    ("Badly normalized experience field (other):", "other"),
):
    print(headline)
    print(experience_errors[label], "/", total_resumes)

 --- SUMMARY --- 
Well normalized experience field (list with strings):
8240 / 13314
No experience field :
15 / 13314
Badly normalized experience field (list with dictionnaries):
5048 / 13314
Badly normalized experience field (list of lists):
11 / 13314
Badly normalized experience field (other):
0 / 13314


Lists of errors that make the experience field badly normalized : 
 - Well normalized experience field (list of strings) : 8240 / 13314
 - No experience field : 15 / 13314
 - Badly normalized experience field (list with dictionnaries) : 5048 / 13314
 - Badly normalized experience field (list of lists): 11 / 13314
 - Badly normalized experience field (other): 0 / 13314

**education**

In [414]:
# One counter per label returned by detailed_error
education_errors = dict.fromkeys(
    ("no error", "no field", "dictionnaries in list", "lists in list", "other"),
    0,
)

# Tally the label of the education field of every resume
for entry in normalized_resumes:
    label = detailed_error(entry.get('norm_text', {}), 'education')
    education_errors[label] += 1

# Pretty print: one headline per label, followed by "<count> / <total>"
total_resumes = len(normalized_resumes)
print(" --- SUMMARY --- ")
for headline, label in (
    ("Well normalized education field (list with strings):", "no error"),
    ("No education field :", "no field"),
    ("Badly normalized education field (list with dictionnaries):", "dictionnaries in list"),
    ("Badly normalized education field (list of lists):", "lists in list"),
    ("Badly normalized education field (other):", "other"),
):
    print(headline)
    print(education_errors[label], "/", total_resumes)

 --- SUMMARY --- 
Well normalized education field (list with strings):
7714 / 13314
No education field :
377 / 13314
Badly normalized education field (list with dictionnaries):
5212 / 13314
Badly normalized education field (list of lists):
11 / 13314
Badly normalized education field (other):
0 / 13314


Lists of errors that make the education field badly normalized : 
 - Well normalized education field (list of strings) : 7714 / 13314
 - No education field : 377 / 13314
 - Badly normalized education field (list with dictionnaries) : 5212 / 13314
 - Badly normalized education field (list of lists): 11 / 13314
 - Badly normalized education field (other): 0 / 13314

Then, we need to fix the errors in those fields.

In [371]:
import agents.normalization_agent
# Reload the file to take into account the changes
importlib.reload(agents.normalization_agent)

<module 'agents.normalization_agent' from 'C:\\Users\\Sebastien\\Desktop\\LLM and GenAI\\Smart_Resume_to_Job_Matcher\\agents\\normalization_agent.py'>

In [None]:
from agents.normalization_agent import coerce_to_strings_experience, coerce_to_strings_education

In [416]:
# NOTE(review): a function with this name is also defined in an earlier cell of
# this notebook; consider keeping a single definition.
def detailed_error(norm_text:dict, field:str):
    """
    Classify how one field of a normalized resume deviates from the expected
    "list of strings" format, so that the right fix can be chosen.

    Args:
        norm_text: normalized resume (dictionary).
        field: name of the field to inspect, e.g. "experience".

    Returns:
        One of the following labels:
        - "no field": the field is missing or empty (None, [], "", ...)
        - "no error": the field is a list containing only strings
        - "dictionnaries in list": the list contains at least one dictionary
        - "lists in list": the list contains at least one nested list
        - "other": anything else, including a value that is not a list
    """
    record = norm_text.get(field)

    if not record:
        return "no field"
    # A bare string or dict would otherwise be iterated item by item (characters
    # or keys, i.e. strings) and wrongly reported as "no error"
    if not isinstance(record, list):
        return "other"
    if all(isinstance(item, str) for item in record):
        return "no error"
    if any(isinstance(item, dict) for item in record):
        return "dictionnaries in list"
    if any(isinstance(item, list) for item in record):
        return "lists in list"
    return "other"

**experience**

In [417]:
def fix_error_experience(norm_text:dict):
    """
    Repair the "experience" field of a normalized resume.

    The problem is first identified with detailed_error, then:
    - a missing/empty field is replaced by an empty list;
    - a list containing dictionaries or nested lists is passed to
      coerce_to_strings_experience (from the normalization agent);
    - any other case is left untouched.

    The dictionary is modified in place and also returned.
    """
    diagnosis = detailed_error(norm_text, "experience")
    if diagnosis == "no field":
        # Nothing usable: make the field an explicit empty list
        norm_text["experience"] = []
    elif diagnosis in ("dictionnaries in list", "lists in list"):
        norm_text["experience"] = coerce_to_strings_experience(norm_text.get("experience"))
    return norm_text

In [418]:
# Apply the experience fix to every resume (an entry without 'norm_text' starts from an empty dict)
for entry in normalized_resumes:
    entry['norm_text'] = fix_error_experience(entry.get('norm_text', {}))

**education**

In [423]:
def fix_error_education(norm_text:dict):
    """
    Repair the "education" field of a normalized resume.

    The problem is first identified with detailed_error, then:
    - a missing/empty field is replaced by an empty list;
    - a list containing dictionaries or nested lists is passed to
      coerce_to_strings_education (from the normalization agent);
    - any other case is left untouched.

    The dictionary is modified in place and also returned.
    """
    diagnosis = detailed_error(norm_text, "education")
    if diagnosis == "no field":
        # Nothing usable: make the field an explicit empty list
        norm_text["education"] = []
    elif diagnosis in ("dictionnaries in list", "lists in list"):
        norm_text["education"] = coerce_to_strings_education(norm_text.get("education"))
    return norm_text

In [424]:
# Apply the education fix to every resume (an entry without 'norm_text' starts from an empty dict)
for entry in normalized_resumes:
    entry['norm_text'] = fix_error_education(entry.get('norm_text', {}))

Finally, we save the fixed normalized resumes.

In [428]:
# Saving
with open(resume_checkpoint, "w") as f:
    json.dump(normalized_resumes, f, indent=2)

#### Fixing job normalization **DONE**

We check whether any field has an incorrect type.

In [380]:
# Load normalized job descriptions
normalized_jobs = json.loads(job_checkpoint.read_text())

In [392]:
def validate_job_record(job_description: dict) -> bool:
    """
    Check the field types of a normalized job description.

    A record is valid when, among the fields that are present:
    - job_title, required_experience, required_education and industry are strings
    - required_skills is a list containing only strings

    Fields that are absent are not checked.
    """
    string_fields = ("job_title", "required_experience", "required_education", "industry")

    # Any present scalar field that is not a string invalidates the record
    if any(
        name in job_description and not isinstance(job_description[name], str)
        for name in string_fields
    ):
        return False

    # No skills field at all: nothing left to check
    if "required_skills" not in job_description:
        return True

    skills = job_description["required_skills"]
    return isinstance(skills, list) and all(isinstance(skill, str) for skill in skills)

In [393]:
# None don't work, job records are clean
for entry in normalized_jobs:
    if validate_job_record(entry['job_description']):
        continue
    # Reached only for a record that failed validation
    print('ERROR')

This code prints no 'ERROR' message, meaning that every record was handled correctly.

In [None]:
# Saving (not really necessary as nothing changed)
with open(job_checkpoint, "w") as checkpoint_file:
    checkpoint_file.write(json.dumps(normalized_jobs, indent=2))

# Generate embeddings **DONE**

### Complete embedding generation

In this step, we will generate the embedding of both resumes and jobs thanks to a prompt. The objects will look like this :
```python 
embedded_resumes = [
    {
    'resume_id': 1, 
    'resume_vector': List[float]
    },
    ...
]

embedded_jobs = [
    {
    'job_id': 1, 
    'job_vector': List[float]
    },
    ...
]
```

In [None]:
import embeddings.embedding_engine
# Reload the file to take into account the changes
importlib.reload(embeddings.embedding_engine)

In [56]:
from embeddings.embedding_engine import embed_resume_nomic, embed_job_nomic, embed_resume_BGE, embed_job_BGE

#### Resumes (13314 elements) **DONE**

In [57]:
from pathlib import Path
import json
from tqdm import tqdm

In [76]:
def checkpoint_verification_resumes(normalized_resumes_file:Path, embedded_path:Path):
    """
    Load the normalized resumes and any resume embeddings already saved.

    Args:
        normalized_resumes_file: JSON file holding the normalized resumes.
        embedded_path: JSON checkpoint of the embeddings; it may not exist yet.

    Returns:
        A tuple (normalized_resumes, embedded, processed_ids). `embedded` is the
        list read from the checkpoint (empty when there is no checkpoint) and
        `processed_ids` is the set of 'resume_id' values found in it.
    """
    with open(normalized_resumes_file, "r") as source:
        normalized_resumes = json.load(source)
    print("Normalized files imported")

    # No checkpoint yet: start from an empty state
    if not embedded_path.exists():
        print("No checkpoint found")
        return normalized_resumes, [], set()

    # Restart-safe path: pick up the embeddings computed by a previous run
    print("Checkpoint found")
    with open(embedded_path, "r", encoding="utf-8") as checkpoint:
        embedded = json.load(checkpoint)
    processed_ids = set()
    for entry in embedded:
        processed_ids.add(entry['resume_id'])
    print(len(processed_ids), "resume embeddings found")

    return normalized_resumes, embedded, processed_ids

**NOMIC embedding model** (~20 minutes)

In [77]:
## Initialize the storage (embeddings saved by a previous run are reloaded)
# Checkpoint locations
normalized_resumes_file = Path("checkpoints") / "normalized_resumes.json"
resume_emb_file = Path("checkpoints") / "resume_embeddings_nomic.json"
normalized_resumes, embedded_resumes, processed_ids = checkpoint_verification_resumes(
    normalized_resumes_file=normalized_resumes_file,
    embedded_path=resume_emb_file,
)

Normalized files imported
Checkpoint found
13296 resume embeddings found


In [None]:
## Then generate embeddings for the remaining resumes (Nomic model)
for r in tqdm(normalized_resumes, desc="Generating resume embeddings..."):
    resume_id = r["resume_id"]
    # Skip resumes already present in the checkpoint
    if resume_id in processed_ids:
        continue
    try:
        resume_vector = embed_resume_nomic(r["norm_text"])
        embedded_resume = {
            'resume_id': resume_id,
            'resume_vector': resume_vector
        }
        embedded_resumes.append(embedded_resume)
        processed_ids.add(resume_id)
    except Exception as e:
        # Best effort: report the failure and carry on with the next resume.
        print(f"⚠ Failed embedding for resume {resume_id}: {e}")
        # Nothing was added, so there is nothing new to checkpoint. Without this
        # `continue`, the whole list was dumped again whenever a failure happened
        # while its length was a multiple of 100 (including 0).
        continue

    # We save a checkpoint (every 100 embedded resumes)
    if len(embedded_resumes) % 100 == 0:
        with open(resume_emb_file, "w", encoding="utf-8") as f:
            json.dump(embedded_resumes, f)

# Save embeddings to disk for later use
with open(resume_emb_file, "w", encoding="utf-8") as f:
    json.dump(embedded_resumes, f)

Generating resume embeddings...:  23%|██▎       | 3113/13314 [01:45<15:36, 10.89it/s]

⚠ Failed embedding for resume 3114: unhashable type: 'dict'


Generating resume embeddings...:  23%|██▎       | 3128/13314 [01:45<07:46, 21.84it/s]

⚠ Failed embedding for resume 3125: unhashable type: 'dict'


Generating resume embeddings...:  27%|██▋       | 3555/13314 [02:05<03:51, 42.21it/s]

⚠ Failed embedding for resume 3551: unhashable type: 'dict'


Generating resume embeddings...:  29%|██▉       | 3828/13314 [02:19<07:18, 21.65it/s]

⚠ Failed embedding for resume 3826: unhashable type: 'dict'


Generating resume embeddings...:  30%|██▉       | 3986/13314 [02:25<03:10, 49.08it/s]

⚠ Failed embedding for resume 3985: unhashable type: 'dict'


Generating resume embeddings...:  33%|███▎      | 4344/13314 [02:46<04:48, 31.10it/s]

⚠ Failed embedding for resume 4341: unhashable type: 'dict'


Generating resume embeddings...:  48%|████▊     | 6431/13314 [04:55<08:32, 13.44it/s]

⚠ Failed embedding for resume 6438: unhashable type: 'dict'


Generating resume embeddings...:  56%|█████▌    | 7407/13314 [06:02<01:54, 51.61it/s]

⚠ Failed embedding for resume 7417: unhashable type: 'dict'


Generating resume embeddings...:  62%|██████▏   | 8216/13314 [07:11<19:15,  4.41it/s]

⚠ Failed embedding for resume 8235: unhashable type: 'dict'


Generating resume embeddings...:  68%|██████▊   | 9097/13314 [08:21<01:25, 49.49it/s]

⚠ Failed embedding for resume 9127: unhashable type: 'dict'


Generating resume embeddings...:  69%|██████▉   | 9200/13314 [08:30<01:21, 50.74it/s]

⚠ Failed embedding for resume 9225: unhashable type: 'dict'


Generating resume embeddings...:  70%|██████▉   | 9276/13314 [08:38<01:32, 43.89it/s]

⚠ Failed embedding for resume 9304: unhashable type: 'dict'
⚠ Failed embedding for resume 9309: unhashable type: 'dict'


Generating resume embeddings...:  76%|███████▌  | 10142/13314 [10:00<03:47, 13.93it/s]

⚠ Failed embedding for resume 10177: unhashable type: 'dict'
⚠ Failed embedding for resume 10178: unhashable type: 'dict'


Generating resume embeddings...:  81%|████████▏ | 10822/13314 [11:08<12:14,  3.39it/s]

⚠ Failed embedding for resume 10874: unhashable type: 'dict'


Generating resume embeddings...:  95%|█████████▍| 12596/13314 [14:12<00:15, 47.76it/s]

⚠ Failed embedding for resume 12653: unhashable type: 'dict'


Generating resume embeddings...:  95%|█████████▍| 12615/13314 [14:12<00:13, 53.44it/s]

⚠ Failed embedding for resume 12670: unhashable type: 'dict'


Generating resume embeddings...: 100%|██████████| 13314/13314 [15:32<00:00, 14.28it/s]


Despite thinking we had taken care of the typing issue in the previous phase, normalization was apparently not strict enough: some resumes are still rejected from embedding because of typing.

In [446]:
# Report how many resumes were embedded and how many were left out
total_resumes = len(normalized_resumes)
missing_resumes = total_resumes - len(embedded_resumes)

print(f"Generated embeddings: {len(embedded_resumes)} resumes")

print(f"Number of resumes not embedded : {missing_resumes}")
print(f"In % : {100*missing_resumes/total_resumes}")

Generated embeddings: 13296 resumes
Number of resumes not embedded : 18
In % : 0.135196034249662


However, only 18 files were not correctly embedded, which represents less than 0.2% of all resumes.

**BGE embedding model** (~30 minutes)

In [78]:
## Initialize the storage for the BGE model (embeddings saved by a previous run are reloaded)
# Checkpoint locations
normalized_resumes_file = Path("checkpoints") / "normalized_resumes.json"
resume_emb_file = Path("checkpoints") / "resume_embeddings_bge.json"
normalized_resumes, embedded_resumes, processed_ids = checkpoint_verification_resumes(
    normalized_resumes_file=normalized_resumes_file,
    embedded_path=resume_emb_file,
)

Normalized files imported
Checkpoint found
13296 resume embeddings found


In [60]:
## Then generate embeddings for the remaining resumes (BGE model)
for r in tqdm(normalized_resumes, desc="Generating resume embeddings..."):
    resume_id = r["resume_id"]
    # Skip resumes already present in the checkpoint
    if resume_id in processed_ids:
        continue
    try:
        resume_vector = embed_resume_BGE(r["norm_text"])
        embedded_resume = {
            'resume_id': resume_id,
            'resume_vector': resume_vector
        }
        embedded_resumes.append(embedded_resume)
        processed_ids.add(resume_id)
    except Exception as e:
        # Best effort: report the failure and carry on with the next resume.
        print(f"⚠ Failed embedding for resume {resume_id}: {e}")
        # Nothing was added, so there is nothing new to checkpoint. Without this
        # `continue`, the whole list was dumped again whenever a failure happened
        # while its length was a multiple of 100 (including 0).
        continue

    # We save a checkpoint (every 100 embedded resumes)
    if len(embedded_resumes) % 100 == 0:
        with open(resume_emb_file, "w", encoding="utf-8") as f:
            json.dump(embedded_resumes, f)

# Save embeddings to disk for later use
with open(resume_emb_file, "w", encoding="utf-8") as f:
    json.dump(embedded_resumes, f)

Generating resume embeddings...:  23%|██▎       | 3112/13314 [06:31<17:53,  9.51it/s]  

⚠ Failed embedding for resume 3114: unhashable type: 'dict'


Generating resume embeddings...:  23%|██▎       | 3123/13314 [06:32<12:27, 13.63it/s]

⚠ Failed embedding for resume 3125: unhashable type: 'dict'


Generating resume embeddings...:  27%|██▋       | 3548/13314 [07:14<10:51, 14.98it/s]  

⚠ Failed embedding for resume 3551: unhashable type: 'dict'


Generating resume embeddings...:  29%|██▊       | 3823/13314 [07:45<14:43, 10.75it/s]  

⚠ Failed embedding for resume 3826: unhashable type: 'dict'


Generating resume embeddings...:  30%|██▉       | 3982/13314 [08:00<12:04, 12.87it/s]  

⚠ Failed embedding for resume 3985: unhashable type: 'dict'


Generating resume embeddings...:  33%|███▎      | 4338/13314 [08:42<10:30, 14.24it/s]  

⚠ Failed embedding for resume 4341: unhashable type: 'dict'


Generating resume embeddings...:  48%|████▊     | 6426/13314 [13:00<12:42,  9.03it/s]  

⚠ Failed embedding for resume 6438: unhashable type: 'dict'


Generating resume embeddings...:  56%|█████▌    | 7406/13314 [15:12<06:35, 14.93it/s]  

⚠ Failed embedding for resume 7417: unhashable type: 'dict'


Generating resume embeddings...:  62%|██████▏   | 8215/13314 [17:20<32:14,  2.64it/s]  

⚠ Failed embedding for resume 8235: unhashable type: 'dict'


Generating resume embeddings...:  68%|██████▊   | 9093/13314 [19:33<05:48, 12.11it/s]  

⚠ Failed embedding for resume 9127: unhashable type: 'dict'


Generating resume embeddings...:  69%|██████▉   | 9192/13314 [19:48<05:22, 12.78it/s]  

⚠ Failed embedding for resume 9225: unhashable type: 'dict'


Generating resume embeddings...:  70%|██████▉   | 9270/13314 [20:02<05:07, 13.16it/s]  

⚠ Failed embedding for resume 9304: unhashable type: 'dict'


Generating resume embeddings...:  70%|██████▉   | 9275/13314 [20:02<04:48, 14.02it/s]

⚠ Failed embedding for resume 9309: unhashable type: 'dict'


Generating resume embeddings...:  76%|███████▌  | 10135/13314 [22:22<05:28,  9.66it/s]  

⚠ Failed embedding for resume 10177: unhashable type: 'dict'
⚠ Failed embedding for resume 10178: unhashable type: 'dict'


Generating resume embeddings...:  81%|████████▏ | 10822/13314 [24:14<17:28,  2.38it/s]  

⚠ Failed embedding for resume 10874: unhashable type: 'dict'


Generating resume embeddings...:  95%|█████████▍| 12591/13314 [29:11<00:54, 13.36it/s]

⚠ Failed embedding for resume 12653: unhashable type: 'dict'


Generating resume embeddings...:  95%|█████████▍| 12606/13314 [29:12<00:51, 13.80it/s]

⚠ Failed embedding for resume 12670: unhashable type: 'dict'


Generating resume embeddings...: 100%|██████████| 13314/13314 [31:18<00:00,  7.09it/s]


In [79]:
# Report how many resumes were embedded and how many were left out
total_resumes = len(normalized_resumes)
missing_resumes = total_resumes - len(embedded_resumes)

print(f"Generated embeddings: {len(embedded_resumes)} resumes")

print(f"Number of resumes not embedded : {missing_resumes}")
print(f"In % : {100*missing_resumes/total_resumes}")

Generated embeddings: 13296 resumes
Number of resumes not embedded : 18
In % : 0.135196034249662


Here too, only 18 files were not correctly embedded, which represents less than 0.2% of all resumes.

#### Job descriptions (17013 elements) **DONE**

In [62]:
def checkpoint_verification_jobs(normalized_jobs_file:Path, embedded_path:Path):
    """
    Load the normalized jobs and any job embeddings already saved.

    Args:
        normalized_jobs_file: JSON file holding the normalized jobs.
        embedded_path: JSON checkpoint of the embeddings; it may not exist yet.

    Returns:
        A tuple (normalized_jobs, embedded, processed_ids). `embedded` is the
        list read from the checkpoint (empty when there is no checkpoint) and
        `processed_ids` is the set of 'job_id' values found in it.
    """
    # Load normalized jobs
    with open(normalized_jobs_file, "r") as source:
        normalized_jobs = json.load(source)
    print("Normalized files imported")

    # No checkpoint yet: start from an empty state
    if not embedded_path.exists():
        print("No checkpoint found")
        return normalized_jobs, [], set()

    # Restart-safe path: pick up the embeddings computed by a previous run
    print("Checkpoint found")
    with open(embedded_path, "r", encoding="utf-8") as checkpoint:
        embedded = json.load(checkpoint)
    processed_ids = set()
    for entry in embedded:
        processed_ids.add(entry['job_id'])
    print(len(processed_ids), "job embeddings found")

    return normalized_jobs, embedded, processed_ids

In [65]:
def embed_job(normalized_jobs, embedded_jobs, processed_ids, job_emb_file, embedding_function):
    """
    Embed every job that is not in the checkpoint yet, then save the result.

    Args:
        normalized_jobs: list of dicts carrying a "job_id" and a "job_description".
        embedded_jobs: list of {'job_id', 'job_vector'} dicts already computed;
            new entries are appended to it in place.
        processed_ids: set of job ids already embedded; updated in place.
        job_emb_file: path of the JSON checkpoint that is (over)written.
        embedding_function: callable that receives a job description and
            returns its vector.

    Returns:
        The same `embedded_jobs` list, extended with the new embeddings.
        A job whose embedding raises is reported on stdout and skipped.
    """
    for j in tqdm(normalized_jobs, desc="Generating job embeddings..."):
        job_id = j["job_id"]
        # Skip jobs already present in the checkpoint
        if job_id in processed_ids:
            continue
        try:
            job_vector = embedding_function(j["job_description"])
            embedded_job = {
                'job_id': job_id,
                'job_vector': job_vector
            }
            embedded_jobs.append(embedded_job)
            processed_ids.add(job_id)
        except Exception as e:
            # Best effort: report the failure and carry on with the next job.
            print(f"⚠ Failed embedding for job {job_id}: {e}")
            # Nothing was added, so there is nothing new to checkpoint. Without
            # this `continue`, the whole list was dumped again whenever a failure
            # happened while its length was a multiple of 100 (including 0).
            continue

        # We save a checkpoint (every 100 embedded jobs)
        if len(embedded_jobs) % 100 == 0:
            with open(job_emb_file, "w", encoding="utf-8") as f:
                json.dump(embedded_jobs, f)

    # Save embeddings to disk for later use
    with open(job_emb_file, "w", encoding="utf-8") as f:
        json.dump(embedded_jobs, f)

    return embedded_jobs

**NOMIC embedding model** (~23 minutes)

In [71]:
## Initialize the storage (embeddings saved by a previous run are reloaded)
# Locations of the checkpointed normalized data and of the Nomic embeddings
normalized_jobs_file = Path("checkpoints") / "normalized_jobs.json"
job_emb_file = Path("checkpoints") / "jobs_embeddings_nomic.json"
# Checkpoint verification
normalized_jobs, embedded_jobs, processed_ids = checkpoint_verification_jobs(
    normalized_jobs_file=normalized_jobs_file,
    embedded_path=job_emb_file,
)
# Embedding generation (jobs already in the checkpoint are skipped)
embedded_jobs = embed_job(
    normalized_jobs,
    embedded_jobs,
    processed_ids,
    job_emb_file,
    embedding_function=embed_job_nomic,
)

Normalized files imported
Checkpoint found
17013 job embeddings found


Generating job embeddings...: 100%|██████████| 17013/17013 [00:00<00:00, 1652869.78it/s]


In [72]:
# Report how many job descriptions were embedded and how many were left out
total_jobs = len(normalized_jobs)
missing_jobs = total_jobs - len(embedded_jobs)

print(f"Generated embeddings: {len(embedded_jobs)} job descriptions")

# The label used to say "resumes" although this cell counts job descriptions
print(f"Number of job descriptions not embedded : {missing_jobs}")
# Guard the ratio so that an empty job list does not raise ZeroDivisionError
print(f"In % : {100*missing_jobs/total_jobs if total_jobs else 0.0}")

Generated embeddings: 17013 job descriptions
Number of resumes not embedded : 0
In % : 0.0


**BGE embedding model** (~30 minutes)

In [73]:
## Initialize the storage (embeddings saved by a previous run are reloaded)
# Locations of the checkpointed normalized data and of the BGE embeddings
normalized_jobs_file = Path("checkpoints") / "normalized_jobs.json"
job_emb_file = Path("checkpoints") / "jobs_embeddings_bge.json"
# Checkpoint verification
normalized_jobs, embedded_jobs, processed_ids = checkpoint_verification_jobs(
    normalized_jobs_file=normalized_jobs_file,
    embedded_path=job_emb_file,
)
# Embedding generation (jobs already in the checkpoint are skipped)
embedded_jobs = embed_job(
    normalized_jobs,
    embedded_jobs,
    processed_ids,
    job_emb_file,
    embedding_function=embed_job_BGE,
)

Normalized files imported
Checkpoint found
17013 job embeddings found


Generating job embeddings...: 100%|██████████| 17013/17013 [00:00<00:00, 1231494.10it/s]


In [74]:
# Report how many job descriptions were embedded and how many were left out
total_jobs = len(normalized_jobs)
missing_jobs = total_jobs - len(embedded_jobs)

print(f"Generated embeddings: {len(embedded_jobs)} job descriptions")

# The label used to say "resumes" although this cell counts job descriptions
print(f"Number of job descriptions not embedded : {missing_jobs}")
# Guard the ratio so that an empty job list does not raise ZeroDivisionError
print(f"In % : {100*missing_jobs/total_jobs if total_jobs else 0.0}")

Generated embeddings: 17013 job descriptions
Number of resumes not embedded : 0
In % : 0.0


However, when it comes to jobs, just like during normalization, no job description impeded the process. This is probably because the typing was simpler: there are no dictionaries nested inside dictionaries.

# Cosine similarity and ranking

In [16]:
# Make the project root importable (this notebook lives in notebooks/, one level below it)
project_root = Path("..").resolve()
# Only append once: re-running this cell used to pile up duplicate sys.path entries
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

## Format conversion

For now, I have embeddings looking like this : 
```python 
embedded_resumes = [
    {
    'resume_id': 1, 
    'resume_vector': List[float]
    },
    ...
]

embedded_jobs = [
    {
    'job_id': 1, 
    'job_vector': List[float]
    },
    ...
]
```
However, this format takes more space and is harder to manipulate. So, we will first convert our lists into dictionaries, whose keys will be the embeddings' ids.

In [22]:
import embeddings.embedding_format_conversion
# Reload the file to take into account the changes
importlib.reload(embeddings.embedding_format_conversion)

<module 'embeddings.embedding_format_conversion' from 'C:\\Users\\Sebastien\\Desktop\\LLM and GenAI\\Smart_Resume_to_Job_Matcher\\embeddings\\embedding_format_conversion.py'>

In [23]:
from embeddings.embedding_format_conversion import lists_to_id_vector_dicts
import json

In [25]:
## First we import the embeddings
# Paths
resume_emb_file_nomic = Path("checkpoints/resume_embeddings_nomic.json")
resume_emb_file_bge = Path("checkpoints/resume_embeddings_bge.json")
job_emb_file_nomic = Path("checkpoints/jobs_embeddings_nomic.json")
job_emb_file_bge = Path("checkpoints/jobs_embeddings_bge.json")


def _load_embeddings(path, label):
    """Read one embeddings JSON checkpoint and print '<label> imported'."""
    # The embedding checkpoints are written with encoding="utf-8",
    # so they are read back with the same explicit encoding.
    with open(path, "r", encoding="utf-8") as f:
        loaded = json.load(f)
    print(f"{label} imported")
    return loaded


# Load embedded resumes
embedded_resumes_nomic = _load_embeddings(resume_emb_file_nomic, "Embedded resumes (Nomic)")
embedded_resumes_bge = _load_embeddings(resume_emb_file_bge, "Embedded resumes (BGE)")

# Load embedded job descriptions
embedded_jobs_nomic = _load_embeddings(job_emb_file_nomic, "Embedded job descriptions (Nomic)")
embedded_jobs_bge = _load_embeddings(job_emb_file_bge, "Embedded job descriptions (BGE)")

Embedded resumes (Nomic) imported
Embedded resumes (BGE) imported
Embedded job descriptions (Nomic) imported
Embedded job descriptions (BGE) imported


In [26]:
# Re-key the embeddings by id (smaller dictionaries for better storage)
embedded_resumes_nomic_dict, embedded_jobs_nomic_dict = lists_to_id_vector_dicts(
    embedded_resumes_nomic, embedded_jobs_nomic
)
embedded_resumes_bge_dict, embedded_jobs_bge_dict = lists_to_id_vector_dicts(
    embedded_resumes_bge, embedded_jobs_bge
)

# Preview the first five ids of each dictionary
for label, id_to_vector in (
    ("Resume IDs (Nomic):", embedded_resumes_nomic_dict),
    ("Job IDs (Nomic):", embedded_jobs_nomic_dict),
    ("Resume IDs (BGE):", embedded_resumes_bge_dict),
    ("Job IDs (BGE):", embedded_jobs_bge_dict),
):
    print(label, list(id_to_vector.keys())[:5])

Resume IDs (Nomic): [1, 2, 3, 4, 5]
Job IDs (Nomic): [1, 2, 3, 4, 5]
Resume IDs (BGE): [1, 2, 3, 4, 5]
Job IDs (BGE): [1, 2, 3, 4, 5]


## Scoring and matching (TODO: use all the embeddings)

Now that the embeddings are converted back to dictionaries, we can use them for scoring and ranking.

In [27]:
import match_engine_and_explanation.match_engine
# Reload the file to take into account the changes
importlib.reload(match_engine_and_explanation.match_engine)

<module 'match_engine_and_explanation.match_engine' from 'C:\\Users\\Sebastien\\Desktop\\LLM and GenAI\\Smart_Resume_to_Job_Matcher\\match_engine_and_explanation\\match_engine.py'>

In [28]:
# Reload the preprocessing module so that edits to its source file are picked up
import ingestion.preprocess
importlib.reload(ingestion.preprocess)

from ingestion.preprocess import resumes_to_raw_text, jobs_to_raw_text

# Rebuild the raw-text resumes and jobs from the CSV files.
# The matching cells below read each item's 'text' entry to display the results.
resumes = resumes_to_raw_text("../data/resumes/resumes.csv")
jobs = jobs_to_raw_text("../data/jobs/job_postings.csv")

In [None]:
# TEMP - the different embeddings for resumes and job descriptions - TEMP
# Only the sizes are displayed: leaving the dictionaries themselves as the last
# expression of the cell would render every embedding vector in the output.
{
    "embedded_resumes_nomic_dict": len(embedded_resumes_nomic_dict),
    "embedded_jobs_nomic_dict": len(embedded_jobs_nomic_dict),
    "embedded_resumes_bge_dict": len(embedded_resumes_bge_dict),
    "embedded_jobs_bge_dict": len(embedded_jobs_bge_dict),
}

In [29]:
from match_engine_and_explanation.match_engine import match_jobs_to_resume, match_resumes_to_job

In [None]:
# Rank the resumes against one job description, using the BGE embeddings.
# Expected inputs:
#   embedded_resumes = {resume_id:resume_vector, ...}
#   embedded_jobs = {job_id:job_vector, ...}

job_id = 78
embedding = embedded_jobs_bge_dict[job_id]

results = match_resumes_to_job(
    job_id=job_id, job_embedding=embedding, resume_embeddings=embedded_resumes_bge_dict, top_k=5,
)

# Show the job first, then each matched resume with its score
section_rule = "=" * 30
print(f"Job ID: {job_id}")
print(section_rule, "Job description", section_rule)
# NOTE(review): assumes ids are 1-based positions in `jobs` / `resumes` - confirm
print(jobs[job_id-1]['text'])

for match in results:
    print("=" * 80)
    resume_id = match['resume_id']
    print(f"Resume ID: {resume_id}")
    print("Resume's text : ")
    print(resumes[resume_id-1]['text'])
    print(f"Similarity score: {match['score']:.3f}")

In [31]:
from match_engine_and_explanation.match_engine import score_jobs_for_resume

In [None]:
# Rank the job descriptions against one resume, using the BGE embeddings
resume_id = 456
embedding = embedded_resumes_bge_dict[resume_id]

results = match_jobs_to_resume(
    resume_id=resume_id, resume_embedding=embedding, job_embeddings=embedded_jobs_bge_dict, top_k=5,
)

# Show the resume first, then each matched job with its score
section_rule = "=" * 30
print(f"Resume ID: {resume_id}")
print(section_rule, "Resume's text", section_rule)
# NOTE(review): assumes ids are 1-based positions in `resumes` / `jobs` - confirm
print(resumes[resume_id-1]['text'])

for match in results:
    print("=" * 80)
    job_id = match['job_id']
    print(f"Job ID: {job_id}")
    print("Job description : ")
    print(jobs[job_id-1]['text'])
    print(f"Similarity score: {match['score']:.3f}")

# LLM-based explanation (TODO: redo with the new embeddings)

In [None]:
import match_engine_and_explanation.llm_explanation
# Reload the file to take into account the changes
importlib.reload(match_engine_and_explanation.llm_explanation)

In [None]:
from match_engine_and_explanation.llm_explanation import generate_match_explanation

# Shapes used below:
#   normalized_jobs    = [{"job_id": ..., "job_description": ...}, ...]
#   normalized_resumes = [{"resume_id": ..., "norm_text": ...}, ...]
#   results            = [{"job_id": ..., "resume_id": ..., "score": ...}, ...]
# NOTE(review): assumes each result carries both ids - confirm against match_engine
r_0 = results[0]
job_id = r_0['job_id']
resume_id = r_0['resume_id']
score = r_0['score']

# Fetch the normalized content of the best match. The previous version passed
# `normalized_job` / `normalized_resume`, two names that were never defined,
# so the cell stopped with a NameError.
job = next((j["job_description"] for j in normalized_jobs if j["job_id"] == job_id), None)
resume = next((r["norm_text"] for r in normalized_resumes if r["resume_id"] == resume_id), None)
if job is None or resume is None:
    raise KeyError(f"No normalized data found for job {job_id!r} / resume {resume_id!r}")

generate_match_explanation(job, resume, score)

# ???

In [12]:
from langchain_community.llms import Ollama

In [14]:
from langgraph.graph import StateGraph

In [15]:
import streamlit
import fastapi