## Load Data and Preprocessing

In [5]:
import pandas as pd
from src.db.jobs_db import get_all_jobs, update_jobs_bulk, get_jobs_by_date
import datetime

from ollama import ChatResponse, chat
from tqdm import tqdm

In [8]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

### Load Data from DB

In [9]:
# df = get_all_jobs("data/jobs.sqlite")
df = get_jobs_by_date(datetime.date.today() ,"data/jobs.sqlite")

### Preprocess Data

Extract skills required in the job description

In [10]:
mask = df["ai_skills_required"].isnull() & df["description"].notnull()

for idx, desc in tqdm(df[mask]["description"].items(), total=mask.sum()):
    
    try:
        response: ChatResponse = chat(model='gemma3:1b', messages=[
        {
            'role': 'system',
            'content': 'You are a helpful assistant that extracts skills from job descriptions. Giving only list separate by comma on one line.\nLike this:\nCommunication, team working, redaction, good attitude, fast learner',},
        { 
            'role': 'user',
            'content': f'Give me a list of skills required for the following job description :\n\n{desc}\n\nI want only the list, no other text. And the skills appear as they are in tshe description. 10 maximum. separate by commas.',
        },
        ])
        skills = response.message.content

        df.loc[idx, "ai_skills_required"] = skills
    except Exception as e:
        print(f"Error at row {idx}: {e}")
        df.loc[idx, "ai_skills_required"] = None

jobs_dicts = df.to_dict(orient="records")
update_jobs_bulk(jobs_dicts,"data/jobs.sqlite")

0it [00:00, ?it/s]


## Filter Jobs

In [11]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from sklearn.metrics import pairwise_distances

  from .autonotebook import tqdm as notebook_tqdm


Use embeddings to put the skills required for the job in a latent space. Where it's easy to compare job offers.

We use bge-m3 model, which is a small performant multiligual model.

In [12]:
embedding_model = SentenceTransformer("BAAI/bge-m3")
embeddings = embedding_model.encode(df["ai_skills_required"], show_progress_bar=True)

Batches: 100%|██████████| 4/4 [00:42<00:00, 10.65s/it]


We use a reference skill set that we will compare to the skills required by the job offers.

In [13]:
text = "data scientce, AI, artificial intelligence, python, Computer vision, NLP, LSTM, transformers, PyTorch, TensorFlow."

embedding_reference = embedding_model.encode([text])

Compute distance between all point and the filtering point

In [14]:
distances = pairwise_distances(
    embedding_reference.reshape(1, -1),
    embeddings
)[0,:]

Sort distances and print the 5 job offers that are closest and farthest from the reference. 

In [15]:
argmin_indices = distances.argsort()
closest = argmin_indices[:5]
farthest = argmin_indices[-5:]

for idx in closest:
    print("CLOSEST:", df.iloc[idx]["title"], df.iloc[idx]["id"])
    # print(df.iloc[idx]["description"])
    print("-----")

for idx in farthest:
    print("FaRTHEST:", df.iloc[idx]["title"], df.iloc[idx]["id"])
    # print(df.iloc[idx]["description"])
    print("-----")

CLOSEST: Data Scientist - Computer Vision li-4347039000
-----
CLOSEST: Staff AI Engineer li-4300371167
-----
CLOSEST: Junior Data Scientist - Casablanca li-3928980953
-----
CLOSEST: Stage - Data Scientist IA/LLM H/F li-4343315745
-----
CLOSEST: CDI - Data Scientist Exploration - H/F li-4343334855
-----
FaRTHEST: Stage  Automatisation des tests de validation F/H - DPP OPTRO li-4343190222
-----
FaRTHEST: Lead Testing Radio 5G NTN H/F Toulouse li-4212332048
-----
FaRTHEST: Développeur Linux embarqué (H/F) li-4015943338
-----
FaRTHEST: Sogeti Développeur React/React Native Confirmé - Toulouse li-4214565485
-----
FaRTHEST: Ingénieur vérification système et logiciel (H/F) li-4327203246
-----


## Compute the reference based on my previous applications

In [None]:
import re
# List of URLs I applied to (I'am interested in) each on a new line
urls = """
https://www.linkedin.com/...
https://www.linkedin.com/...
https://www.linkedin.com/...
...
"""

# keep only linkedin job view URLs and extract job IDs
lines = [line for line in urls.strip().split("\n") if line.startswith("http") and line.find("linkedin.com/jobs") != -1]
lines = [re.match(r'^https://www\.linkedin\.com/jobs/view/(\d+)/', line).group(1) for line in lines]

Use jobspy library to extract the description of my previous applications. 

In [None]:
from jobspy import ScraperInput, LinkedIn

li = LinkedIn()
li.scraper_input = ScraperInput(site_type=["linkedin"])
li.scraper_input.description_format = None

descs = [li._get_job_details(id_li) for id_li in lines]

In [None]:
from bs4 import BeautifulSoup
import json

descs_str = [{"description": BeautifulSoup(d["description"], "html.parser").get_text().strip()} for d in descs if "description" in d and d["description"] is not None]

with open("descs.json", "w") as f:
    json.dump(descs_str, f, indent=4)

We do the same process to generate the embeddings for each job offers.

In [None]:
import json
with open("descs.json", "r") as f:
    descs_str = json.load(f)
    
    
for desc in tqdm(descs_str):
    
    response: ChatResponse = chat(model='gemma3:1b', messages=[
    {
        'role': 'system',
        'content': 'You are a helpful assistant that extracts skills from job descriptions. Giving only list separate by comma on one line.\nLike this:\nCommunication, team working, redaction, good attitude, fast learner',},
    { 
        'role': 'user',
        'content': f'Give me a list of skills required for the following job description :\n\n{desc["description"]}\n\nI want only the list, no other text. And the skills appear as they are in tshe description. 10 maximum. separate by commas.',
    },
    ])
    skills = response.message.content
    
    desc['ai_skills_required'] = skills

with open("descs.json", "w") as f:
    json.dump(descs_str, f, indent=4)

  0%|          | 0/48 [00:00<?, ?it/s]

100%|██████████| 48/48 [10:09<00:00, 12.70s/it]


In [16]:
import json
from sentence_transformers import SentenceTransformer
from umap import UMAP
from sklearn.metrics import pairwise_distances

embedding_model = SentenceTransformer("BAAI/bge-m3")

with open("descs.json", "r") as f:
    descs_str = json.load(f)

embeddings = embedding_model.encode([desc['ai_skills_required'] for desc in descs_str],show_progress_bar=True)

Batches: 100%|██████████| 2/2 [00:21<00:00, 10.80s/it]


We take the mean of all the embeddings generated and save it.

In [18]:
emb_mean = embeddings.mean(axis=0)
emb_mean

array([-0.04858536, -0.00502254, -0.00985722, ...,  0.02500832,
        0.03933824,  0.00900446], dtype=float32)

In [60]:
# save the embedding in a file
import numpy as np
np.save("embeddings_skills.npy", emb_mean)

We run some small tests.

In [48]:

distances = pairwise_distances(
    emb_mean.reshape(1, -1),
    embeddings,
    metric='cosine'
)[0,:]
distances

array([0.16600156, 0.12664658, 0.21876037, 0.15413761, 0.19481057,
       0.22077358, 0.12990737, 0.21948099, 0.1871146 , 0.11598623,
       0.25076705, 0.14878333, 0.20470059, 0.13023359, 0.19564974,
       0.13320494, 0.22485948, 0.1520654 , 0.13870478, 0.18063092,
       0.18564492, 0.19887245, 0.17263687, 0.25756812, 0.14693153,
       0.24154228, 0.22289824, 0.18908226, 0.16129619, 0.17725205,
       0.14713573, 0.10943145, 0.14071292, 0.11436486, 0.17986691,
       0.17197591, 0.2737044 , 0.16459036, 0.17060041, 0.22556257,
       0.19210368, 0.21143746, 0.13525474, 0.22227693, 0.11117619,
       0.1819331 , 0.12489128, 0.14538383], dtype=float32)

In [None]:
test = ["Google Analytics, SQL, Power BI, Optimizely, AB Tasty, GA..","Analysis SOC/Vuln. Management, Data Flow, Risk Assessment, SIEM, Data Analysis, Report Generation, Key Risk Indicator (KRI), Data Visualization, Collaboration, Communication","Data Analyst, Data Engineering, Data Visualization, Project Management, Data Modeling, Statistical Analysis, Qlik Sense, ETL, Data Quality, BI, Python, SQL, Data Visualization Tools, Project Management, Agile Methodologies"]
test_emb = embedding_model.encode(test,show_progress_bar=True)

distances = pairwise_distances(
    emb_mean.reshape(1, -1),
    test_emb,
    metric='cosine'
)[0,:]
distances

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.16it/s]


array([0.3598075 , 0.2722252 , 0.16400194], dtype=float32)

In [None]:
distance = pairwise_distances(
    emb_mean.reshape(1, -1),
    embedding_reference.reshape(1, -1),
    metric='cosine'
)[0,0]
distance

0.1619919