In [3]:
import requests
import json
import os
import numpy as np
from openai import OpenAI
from typing import List, Dict
import time
import datetime

In [4]:
from Last_Refresh import get_last_refresh_date, update_last_refresh_date
from BDD import save_embeddings_numpy, load_embeddings, append_embeddings


# API to fetch all VIE offer IDs

In [3]:
def search_offers(limit=100, skip=0, timeout=30):
    """Query the Civiweb offer-search endpoint and return the decoded JSON body.

    Args:
        limit: Value sent in the ``limit`` field of the search payload.
        skip: Value sent in the ``skip`` field of the search payload.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without it, a stalled connection would block the notebook forever.

    Returns:
        The response body decoded by ``response.json()``.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    url = "https://civiweb-api-prd.azurewebsites.net/api/Offers/search"

    # Request body (search filters).
    payload = {
        "limit": limit,
        "skip": skip,
        "latest": ["true"],
        "method": ["null"],
        "activitySectorId": [],
        "missionsTypesIds": ["1"],
        "missionsDurations": [],
        # NOTE(review): key spelling kept exactly as in the original payload;
        # presumably this is what the API expects — confirm before "fixing" it.
        "gerographicZones": [],
        "countriesIds": [],
        "studiesLevelId": [],
        "companiesSizes": [],
        "specializationsIds": [],
        "entreprisesIds": [0],
        "missionStartDate": None,
        "query": None
    }

    headers = {
        "Content-Type": "application/json"
    }

    # POST the search; fail loudly on HTTP errors instead of parsing an error page.
    response = requests.post(url, json=payload, headers=headers, timeout=timeout)
    response.raise_for_status()
    print(f"Code de statut: {response.status_code}")

    return response.json()

#search_offers(1,0)
    

# API to fetch the full details of each offer ID

In [7]:
def get_offer_details(offer_id, timeout=30):
    """Fetch the details of one offer from the Civiweb API.

    Args:
        offer_id: Identifier interpolated into the details URL.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without it, a stalled connection would block the whole pipeline loop.

    Returns:
        The response body decoded by ``response.json()``.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    url = f"https://civiweb-api-prd.azurewebsites.net/api/Offers/details/{offer_id}"
    headers = {
        "Accept": "application/json"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Raise on 4xx/5xx so callers never parse an error payload as an offer.
    response.raise_for_status()
    print("   R√©cup√©ration des donn√©es r√©ussies")
    return response.json()

#get_offer_details(227934)

    

## Method to compute the application rate of a job offer

In [8]:
def enhance_offer_data(offer_data):
    """Return a copy of ``offer_data`` enriched with application metrics.

    Args:
        offer_data: Mapping that may contain ``candidateCounter`` and
            ``viewCounter``. Missing or ``None`` counters are treated as 0.

    Returns:
        A new dict with every key of ``offer_data`` plus:
        ``application_rate`` (candidates / views as a percentage, rounded to
        one decimal, 0 when there are no views), ``competition_level``,
        ``candidates_count`` and ``views_count``.
    """
    # ``dict.get(key, 0)`` only falls back when the key is absent; a key that
    # is present with a None value would break the comparison/division below.
    candidates = offer_data.get('candidateCounter') or 0
    views = offer_data.get('viewCounter') or 0

    # Application rate in percent; guarded against division by zero.
    application_rate = 0
    if views > 0:
        application_rate = round((candidates / views) * 100, 1)

    # Competition bucket: above 10% is the top level, above 5% the middle one.
    if application_rate > 10:
        competition_level = "√âLEV√âE"
    elif application_rate > 5:
        competition_level = "MOYENNE"
    else:
        competition_level = "FAIBLE"

    return {
        **offer_data,
        "application_rate": application_rate,
        "competition_level": competition_level,
        "candidates_count": candidates,
        "views_count": views
    }

## Method to clean the data (used by the chunk-creation method)

In [9]:
def clean_offer_data(offer_data):
    """Extract the fields used downstream and fill in placeholders for missing ones.

    Args:
        offer_data: Mapping holding the raw offer fields.

    Returns:
        A new dict with a fixed set of keys. A field that is absent from
        ``offer_data`` or present with a ``None`` value is replaced by its
        placeholder; any other value (including 0 or an empty string) is kept.
    """
    print("   Nettoyage des donn√©es")

    # Placeholder used for each field when the offer provides no value.
    defaults = {
        "reference": "N/A",
        "organizationName": "Entreprise non sp√©cifi√©e",
        "missionTitle": "Titre non sp√©cifi√©",
        "missionDescription": "Description non disponible",
        "missionProfile": "Profil non sp√©cifi√©",
        "countryNameEn": "Pays non sp√©cifi√©",
        "cityNameEn": "Ville non sp√©cifi√©e",
        "activitySectorN1": "Secteur non sp√©cifi√©",
        "missionDuration": "Dur√©e non sp√©cifi√©e",
        "indemnite": "Non sp√©cifi√©",
        "missionStartDate": "Date non sp√©cifi√©e",
        "creationDate": "Date non sp√©cifi√©e",
        "contactEmail": "Email non disponible",
    }

    cleaned = {}
    for field, fallback in defaults.items():
        value = offer_data.get(field)
        # ``dict.get(field, fallback)`` would let an explicit None through,
        # so the None check is done here instead.
        cleaned[field] = fallback if value is None else value
    return cleaned

# Creating chunks method

We are dividing our chunks into content and metadata.

We will only apply embedding on our content.

Metadata will allow us to apply filters when fetching the embeddings from the ChromaDB database.

In [10]:

def create_chunks_for_rag(offer_data):
    """Build the two RAG chunks (mission description and candidate profile) of one offer.

    The chunk text is built from the output of ``clean_offer_data`` so that a
    field the offer does not provide shows up as its placeholder rather than
    as the literal text "None" in the embedded content.

    Args:
        offer_data: Mapping holding the raw offer fields, as passed to
            ``clean_offer_data`` and ``enhance_offer_data``.

    Returns:
        A list of two dicts, each with a ``content`` string (the text to embed)
        and a ``metadata`` dict (shared offer metadata plus ``chunk_type`` and
        ``chunk_id``).
    """
    chunks = []

    # 1. Normalise the raw fields.
    cleaned_offer = clean_offer_data(offer_data)

    # 2. Compute the application-rate metrics from the raw counters.
    enhanced_data = enhance_offer_data(offer_data)

    # 3. Metadata shared by both chunks.
    common_metadata = {
        "offer_reference": cleaned_offer.get("reference"),
        "company": cleaned_offer.get("organizationName"),
        "title": cleaned_offer.get("missionTitle"),
        "country": cleaned_offer.get("countryNameEn"),
        "city": cleaned_offer.get("cityNameEn"),
        "sector": cleaned_offer.get("activitySectorN1"),
        "duration_months": cleaned_offer.get("missionDuration"),
        "salary_eur": cleaned_offer.get("indemnite"),
        "start_date": cleaned_offer.get("missionStartDate"),
        "creation_date": cleaned_offer.get("creationDate"),
        "contact_email": cleaned_offer.get("contactEmail"),
        # The remaining entries come from enhanced_data.
        "application_rate": enhanced_data["application_rate"],
        "competition_level": enhanced_data["competition_level"],
        "candidates_count": enhanced_data["candidates_count"],
        "views_count": enhanced_data["views_count"]
    }

    # CHUNK 1: mission description.
    # The continuation lines of the template keep their leading spaces in the
    # resulting text; the layout is left as it was so new chunks stay
    # formatted like previously generated ones.
    chunk1_content = f"""Offre VIE {cleaned_offer.get('reference')} - {cleaned_offer.get('missionTitle')}
    Entreprise: {cleaned_offer.get('organizationName')}
    Localisation: {cleaned_offer.get('cityNameEn')}, {cleaned_offer.get('countryNameEn')}
    Secteur d'activit√©: {cleaned_offer.get('activitySectorN1')}
    Dur√©e: {cleaned_offer.get('missionDuration')} mois
    Indemnit√©: {cleaned_offer.get('indemnite')} ‚Ç¨ par mois
    Description de la mission: {cleaned_offer.get('missionDescription')}"""

    new_chunk = {
        "content": chunk1_content,
        "metadata": {
            **common_metadata,
            "chunk_type": "mission_description",
            # The chunk id is derived from the raw offer's 'id' field.
            "chunk_id": f"{offer_data.get('id')}_description"
        }
    }

    print(f"   Taille du chunk de Description (caract√®res) : {len(new_chunk['content'])}")
    chunks.append(new_chunk)

    # CHUNK 2: candidate profile.
    chunk2_content = f"""Offre VIE {cleaned_offer.get('reference')} - {cleaned_offer.get('missionTitle')}
    Entreprise: {cleaned_offer.get('organizationName')}
    Localisation: {cleaned_offer.get('cityNameEn')}, {cleaned_offer.get('countryNameEn')}
    Secteur d'activit√©: {cleaned_offer.get('activitySectorN1')}
    Profil recherch√©: {cleaned_offer.get('missionProfile')}"""

    new_chunk2 = {
        "content": chunk2_content,
        "metadata": {
            **common_metadata,
            "chunk_type": "candidate_profile",
            "chunk_id": f"{offer_data.get('id')}_profile"
        }
    }

    chunks.append(new_chunk2)
    print(f"   Taille du chunk de Profil (caract√®res) : {len(new_chunk2['content'])}")

    return chunks

# Create the embeddings of the chunks

In [8]:
# OpenAI client setup: the key is read from the API_KEY_OPENAI environment
# variable (the lookup yields None when the variable is not set).
api_key = os.getenv("API_KEY_OPENAI")
client = OpenAI(api_key=api_key)

###  Method to embed the question that we will ask the RAG

In [6]:
def get_text_embedding(text: str) -> List[float]:
    """Embed a single text through the module-level OpenAI ``client``.

    Args:
        text: The text to embed.

    Returns:
        The embedding of ``text`` (the ``embedding`` attribute of the first
        item in the response's ``data``).
    """
    # Per the original author's note: 1536 dimensions, $0.02/1M tokens.
    request_arguments = {"input": text, "model": "text-embedding-3-small"}
    api_result = client.embeddings.create(**request_arguments)
    first_item = api_result.data[0]
    return first_item.embedding

###  Method to create our embeddings dataset

We batch the texts to optimize API calls and reduce latency.

This approach processes multiple texts in a single request instead of making individual calls for each one.

In [14]:
def create_embeddings_batch(chunks: List[Dict], batch_size: int = 100) -> List[Dict]:
    """Embed the ``content`` of every chunk, sending the texts in batches.

    Args:
        chunks: Chunks to embed; each one must have a ``content`` entry.
        batch_size: Number of chunks sent per embeddings request.

    Returns:
        A new list holding a shallow copy of each chunk with an added
        ``embedding`` entry, in the same order as ``chunks``.
    """
    results = []
    batch_starts = range(0, len(chunks), batch_size)

    for batch_number, start in enumerate(batch_starts, start=1):
        current = chunks[start:start + batch_size]
        print(f"Traitement du batch {batch_number} ({len(current)} chunks)...")

        # One request for the whole batch rather than one per chunk.
        api_result = client.embeddings.create(
            input=[item["content"] for item in current],
            model="text-embedding-3-small",
        )

        # Embeddings are matched to chunks by position in the response.
        for position, item in enumerate(current):
            results.append({**item, "embedding": api_result.data[position].embedding})

        # Short pause between requests as a basic rate limiter.
        time.sleep(0.1)

    print(f"‚úÖ {len(results)} embeddings cr√©√©s")
    return results




# Pipeline: build the embeddings dataset from scratch

In [None]:
# Full rebuild: fetch every offer, chunk it, embed the chunks and save the
# resulting dataset to "vie_embeddings.npz".
print("="*80)
print("PIPELINE RAG - OFFRES VIE")
print("="*80)

# STEP 1: search for the offers
print("\n[1/5] Recherche des offres VIE...")
offers_response = search_offers(limit=10000)  # adjust the limit to your needs

# Keep the id of every entry of the "result" list that has one.
offer_ids = [item.get("id") for item in offers_response["result"] if item.get("id") is not None]

print(f"‚úÖ {len(offer_ids)} offres trouv√©es")

# STEP 2: fetch the details of each offer and build its chunks
print("\n[2/5] R√©cup√©ration des d√©tails et cr√©ation des chunks...")
all_chunks = []

for i, offer_id in enumerate(offer_ids, 1):
    try:
        print(f"  Traitement offre {i}/{len(offer_ids)}: {offer_id}")
        offer_details = get_offer_details(offer_id)
        chunks = create_chunks_for_rag(offer_details)
        all_chunks.extend(chunks)
        time.sleep(0.2)  # rate limiting for the Civiweb API
    except Exception as e:
        # Best effort: one failing offer must not abort the whole run.
        print(f"  ‚ö†Ô∏è  Erreur pour l'offre {offer_id}: {e}")
        continue

# Printed once after the loop (it used to sit inside the loop body and was
# therefore repeated for every successfully processed offer).
print(f"‚úÖ {len(all_chunks)} chunks cr√©√©s\n")

# STEP 3: create the embeddings
print("\n[3/5] Cr√©ation des embeddings OpenAI...")
chunks_with_embeddings = create_embeddings_batch(all_chunks)

# STEP 4: save the dataset
print("\n[4/5] Sauvegarde des nouvelles donn√©es...")
save_embeddings_numpy(chunks_with_embeddings, "vie_embeddings.npz")

# Record the current date as the last refresh.
update_last_refresh_date()

# STEP 5: statistics
print("\n[5/5] Statistiques finales")
print("="*80)
print(f"Nombre de nouvelles offres trait√©es: {len(offer_ids)}")
print(f"Nombre de nouveaux chunks: {len(chunks_with_embeddings)}")
# Indexing [0] would raise IndexError when no chunk was produced.
if chunks_with_embeddings:
    print(f"Dimension des nouveaux embeddings: {len(chunks_with_embeddings[0]['embedding'])}")

# Rough size estimate: about 4 characters per token.
total_tokens = sum(len(c['content']) // 4 for c in chunks_with_embeddings)
print(f"Tokens estim√©s: ~{total_tokens:,}")
print(f"Co√ªt estim√©: ~${(total_tokens / 1_000_000) * 0.02:.4f}")
print("="*80)



[3/5] Cr√©ation des embeddings OpenAI...
Traitement du batch 1 (100 chunks)...
Traitement du batch 2 (100 chunks)...
Traitement du batch 3 (100 chunks)...
Traitement du batch 4 (100 chunks)...
Traitement du batch 5 (100 chunks)...
Traitement du batch 6 (100 chunks)...
Traitement du batch 7 (100 chunks)...
Traitement du batch 8 (100 chunks)...
Traitement du batch 9 (100 chunks)...
Traitement du batch 10 (100 chunks)...
Traitement du batch 11 (100 chunks)...
Traitement du batch 12 (100 chunks)...
Traitement du batch 13 (100 chunks)...
Traitement du batch 14 (100 chunks)...
Traitement du batch 15 (100 chunks)...
Traitement du batch 16 (100 chunks)...
Traitement du batch 17 (100 chunks)...
Traitement du batch 18 (100 chunks)...
Traitement du batch 19 (100 chunks)...
Traitement du batch 20 (100 chunks)...
Traitement du batch 21 (100 chunks)...
Traitement du batch 22 (100 chunks)...
Traitement du batch 23 (100 chunks)...
Traitement du batch 24 (100 chunks)...
Traitement du batch 25 (100 chu

# Creation of the Pipeline: add new embeddings to the current dataset

In [4]:
# Incremental update: embed only the offers created since the last refresh
# and add them to the existing dataset.
print("="*80)
print("PIPELINE RAG - Nouvelles OFFRES VIE")
print("="*80)

# STEP 1: search for the offers
print("\n[1/5] Recherche des offres VIE...")
offers_response = search_offers(limit=100000)  # adjust the limit to your needs

# NOTE(review): assumed to be a datetime comparable with the parsed
# creationDate values below — confirm against Last_Refresh.
last_refresh_date = get_last_refresh_date()
print(f"üìÖ Dernier rafra√Æchissement: {last_refresh_date}")

# `datetime` is imported as a module in this notebook, so the class must be
# reached as `datetime.datetime`. Entries without a creationDate are skipped
# instead of crashing the comprehension.
offer_ids = [
    item.get("id") for item in offers_response["result"]
    if (item.get("id") is not None and
        item.get("creationDate") and
        datetime.datetime.fromisoformat(item.get("creationDate")) > last_refresh_date)
]

print(f"‚úÖ {len(offer_ids)} nouvelles offres depuis le dernier rafra√Æchissement")

# STEP 2: fetch the details of each offer and build its chunks
print("\n[2/5] R√©cup√©ration des d√©tails et cr√©ation des chunks...")
all_chunks = []

for i, offer_id in enumerate(offer_ids, 1):
    try:
        print(f"  Traitement offre {i}/{len(offer_ids)}: {offer_id}")
        offer_details = get_offer_details(offer_id)
        chunks = create_chunks_for_rag(offer_details)
        all_chunks.extend(chunks)
        time.sleep(0.2)  # rate limiting for the Civiweb API
    except Exception as e:
        # Best effort: one failing offer must not abort the whole run.
        print(f"  ‚ö†Ô∏è  Erreur pour l'offre {offer_id}: {e}")
        continue

# Printed once after the loop rather than once per processed offer.
print(f"‚úÖ {len(all_chunks)} chunks cr√©√©s\n")

# STEP 3: create the embeddings
print("\n[3/5] Cr√©ation des embeddings OpenAI...")
chunks_with_embeddings = create_embeddings_batch(all_chunks)

# STEP 4: add the new chunks to the existing dataset
print("\n[4/5] Sauvegarde des donn√©es...")
if chunks_with_embeddings:
    # save_embeddings_numpy here would replace the whole dataset with only the
    # new chunks, so the append helper is used instead.
    # NOTE(review): assumes append_embeddings takes the same (chunks, path)
    # arguments as save_embeddings_numpy — TODO confirm in BDD.
    append_embeddings(chunks_with_embeddings, "vie_embeddings.npz")
else:
    print("   Aucun nouveau chunk: dataset existant conserve tel quel")

# Advance the refresh date only now, after processing, so a crash earlier in
# this cell does not mark unprocessed offers as already handled.
update_last_refresh_date()

# STEP 5: statistics
print("\n[5/5] Statistiques finales")
print("="*80)
print(f"Nombre d'offres trait√©es: {len(offer_ids)}")
print(f"Nombre de chunks: {len(chunks_with_embeddings)}")
# Indexing [0] would raise IndexError when there is nothing new.
if chunks_with_embeddings:
    print(f"Dimension des embeddings: {len(chunks_with_embeddings[0]['embedding'])}")

# Rough size estimate: about 4 characters per token.
total_tokens = sum(len(c['content']) // 4 for c in chunks_with_embeddings)
print(f"Tokens estim√©s: ~{total_tokens:,}")
print(f"Co√ªt estim√©: ~${(total_tokens / 1_000_000) * 0.02:.4f}")
print("="*80)


PIPELINE RAG - Nouvelles OFFRES VIE

[1/5] Recherche des offres VIE...


NameError: name 'search_offers' is not defined

# Now let's ask a question and embed it !!!

In [9]:
# Natural-language query for the RAG; it is embedded and compared with the
# stored chunk embeddings in the following cells. The commented-out line is an
# alternative wording of the same query.
question = "Je Cherche un VIE de minimum 1 an, je suis orient√©e en Data et IA mais j'aime aussi faire du d√©veloppement web. Peux tu me donner les meilleurs offres pour moi. Sachant que je cherche une offre pour Octobre 2025"
#question = "Je Cherche un VIE de minimum 1 an, je suis orient√©e en Data et IA mais j'aime aussi faire du d√©veloppement web. Peux tu me donner les meilleurs offres pour moi. Si possible priorise les offres en cor√©e du sud"

In [10]:
# Embed the question and store it as a one-row 2-D array, the layout passed to
# find_closest_embedding as `question_embeddings`.
question_embeddings = np.array(get_text_embedding(question), ndmin=2)

# Comparison between our embedding database and our question embedding

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
def find_closest_embedding(
    question_embeddings: np.ndarray,
    text_embeddings: np.ndarray,
    top_n: int = 3,
) -> tuple[np.ndarray, np.ndarray]:
    """Return the ``top_n`` rows of ``text_embeddings`` closest to the question.

    Closeness is the cosine similarity between the first row of
    ``question_embeddings`` and every row of ``text_embeddings``.

    Args:
        question_embeddings: 2-D array; only its first row is used.
        text_embeddings: 2-D array with one embedding per row.
        top_n: Number of results wanted. Values of 0 or less yield empty
            arrays; values above the number of rows yield every row.

    Returns:
        ``(top_scores, top_index)``: the similarity scores in decreasing order
        and the matching row indices into ``text_embeddings``.
    """
    # A slice such as `[-0:]` selects the whole array, so without this guard
    # top_n=0 would return every embedding instead of none.
    if top_n <= 0:
        return np.empty(0, dtype=float), np.empty(0, dtype=np.intp)

    # Similarities of the question against every stored embedding.
    similarites = cosine_similarity(question_embeddings, text_embeddings)[0]

    # argsort is ascending: take the last top_n indices, then reverse them.
    top_index = np.argsort(similarites)[-top_n:][::-1]
    top_scores = similarites[top_index]

    return top_scores, top_index

In [12]:
# Number of nearest chunks to retrieve; used by both the search call and the
# header message so the two cannot drift apart.
TOP_N = 10

loaded_embeddings = load_embeddings("vie_embeddings.npz")
text_embeddings = loaded_embeddings['embeddings']
top_scores, top_index = find_closest_embedding(question_embeddings, text_embeddings, TOP_N)

# Look the stored collections up once rather than on every loop iteration.
stored_metadata = loaded_embeddings['metadata']
stored_contents = loaded_embeddings['contents']

# Text of the retrieved chunks, best match first.
top_results = []

print(f"Top {TOP_N} embeddings les plus proches:\n ")
for score, idx in zip(top_scores, top_index):
    print(f"- Embedding {idx}: Score = {score:.4f}")
    print(f"metadata : {stored_metadata[idx]}")
    print(f"contents : {stored_contents[idx]}")
    # Only the chunk text is collected; the metadata is printed but not used
    # for now.
    top_results.append(stored_contents[idx])

Top 10 embeddings les plus proches:
 
- Embedding 2745: Score = 0.6519
metadata : {'offer_reference': 'VIE226785', 'company': 'EXTIA', 'title': 'DEVELOPPEUR FULLSTACK (H/F)', 'country': 'CANADA', 'city': 'MONTREAL', 'sector': "TECHNOLOGIES DE L'INFORMATION ET DES TELECOMMUNICATIONS", 'duration_months': 24, 'salary_eur': 2623, 'start_date': '2026-01-01T00:00:00', 'creation_date': '2025-04-24T21:13:10.497', 'contact_email': 'cboissy@extia.ca', 'application_rate': 5.1, 'competition_level': 'MOYENNE', 'candidates_count': 84, 'views_count': 1658, 'chunk_type': 'candidate_profile', 'chunk_id': '225785_profile'}
contents : Offre VIE VIE226785 - DEVELOPPEUR FULLSTACK (H/F)
    Entreprise: EXTIA
    Localisation: MONTREAL, CANADA
    Secteur d'activit√©: TECHNOLOGIES DE L'INFORMATION ET DES TELECOMMUNICATIONS
    Profil recherch√©: Profil recherch√©

- Vous justifiez d'au moins 5 ans d'exp√©rience r√©ussie en tant qu'Ing√©nieur(e) de donn√©es (alternances comprises)
- Bonnes connaissances des i

# We use an LLM to answer our question using the 10 closest embeddings

In [13]:

# Merge the retrieved chunks into a single context block and build the prompt.
context = "\n".join(top_results)
prompt = f"Contexte :\n{context}\n\nQuestion : {question}\nR√©ponse :"

# Conversation sent to the model: a system instruction followed by the
# context-plus-question prompt as the user turn.
system_message = {
    "role": "system",
    "content": "Tu es un assistant sp√©cialis√© dans la recherche d'information √† partir de documents fournis. Tes r√©ponses doivent absolument provenir du contexte fourni.",
}
user_message = {"role": "user", "content": prompt}

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[system_message, user_message],
)

# Show the text of the first completion choice.
print(response.choices[0].message.content)


Pour une mission VIE d'une dur√©e minimale de 1 an dans le domaine de la Data et de l'IA, et avec un int√©r√™t pour le d√©veloppement web, voici deux offres qui pourraient correspondre √† ton profil :

1. **Offre VIE VIE229576 - DATA ENGINEER DEVELOPPER W/ FRENCH (H/F)**
   - **Entreprise:** EXTIA
   - **Localisation:** BUCHAREST, ROUMANIE
   - **Dur√©e:** 12 mois
   - **Profil:** Tu auras l'opportunit√© de travailler sur des concepts de Big Data, ETL, Data Mining, et Machine Learning. Tu devras √©galement r√©diger des scripts en Python et/ou R, ce qui peut √™tre compl√©t√© par des connaissances en Java ou Scala, et travailler dans un environnement Linux. Une exp√©rience avec les infrastructures cloud et le stockage de fichiers volumineux sera utile.

2. **Offre VIE VIE226785 - DEVELOPPEUR FULLSTACK (H/F)**
   - **Entreprise:** EXTIA
   - **Localisation:** MONTREAL, CANADA
   - **Dur√©e:** 24 mois
   - **Profil:** Bien que cette offre soit orient√©e vers le d√©veloppement fullstack, el