# Analyse des performances du Keyword Extractor

Objectif : Diagnostiquer les problèmes actuels de l'extraction de keywords pour améliorer la qualité des suggestions.


## Setup


In [2]:
import json
from pathlib import Path
from typing import Any, Dict, List, Optional

import matplotlib.pyplot as plt
import pandas as pd

# Paths
# NOTE(review): absolute path into one user's home directory — adjust before running elsewhere.
RESUME_DATA_DIR = Path("/Users/hugovaillaud/Documents/code/JobSeekerAgent/src/jobseeker_agent/data/resume")

# Job ids to analyse; load_job_data() reads each one from RESUME_DATA_DIR / str(job_id)
JOB_IDS = [18, 71, 100, 270, 399, 405, 434, 771]


## Chargement des données


In [3]:
def load_job_data(job_id: int, base_dir: Optional[Path] = None) -> Dict[str, Any]:
    """Load the three JSON files stored for one job.

    Args:
        job_id: Identifier of the job; it is also the name of the job's
            sub-directory.
        base_dir: Directory that contains one sub-directory per job. When
            omitted, the module-level ``RESUME_DATA_DIR`` is used, which is
            the historical behaviour.

    Returns:
        A dict with the keys ``job_id``, ``highlights``, ``keywords`` and
        ``keywords_validated`` (in that order). Each of the last three holds
        the parsed content of ``<key>.json``, or ``None`` when that file does
        not exist.

    Raises:
        json.JSONDecodeError: If one of the files exists but is not valid JSON.
    """
    # Resolve the default lazily so the function also works when only
    # ``base_dir`` is available.
    root = RESUME_DATA_DIR if base_dir is None else Path(base_dir)
    job_dir = root / str(job_id)

    data: Dict[str, Any] = {"job_id": job_id}
    # Every entry follows the same convention: key "<name>" <- file "<name>.json".
    for key in ("highlights", "keywords", "keywords_validated"):
        json_file = job_dir / f"{key}.json"
        if json_file.exists():
            with open(json_file, 'r', encoding='utf-8') as f:
                data[key] = json.load(f)
        else:
            data[key] = None

    return data

# Load every job listed in JOB_IDS, keyed by job id
jobs_data = {}
for job_id in JOB_IDS:
    jobs_data[job_id] = load_job_data(job_id)

print(f"Chargé {len(jobs_data)} dossiers")
for job_id, data in jobs_data.items():
    print(f"Job {job_id}:")
    # An entry whose file was missing is None and is reported as 0
    for label, key in (
        ("Highlights", "highlights"),
        ("Keywords groups", "keywords"),
        ("Validated groups", "keywords_validated"),
    ):
        print(f"  - {label}: {len(data[key] or [])}")


Chargé 8 dossiers
Job 18:
  - Highlights: 13
  - Keywords groups: 10
  - Validated groups: 2
Job 71:
  - Highlights: 21
  - Keywords groups: 9
  - Validated groups: 5
Job 100:
  - Highlights: 13
  - Keywords groups: 11
  - Validated groups: 4
Job 270:
  - Highlights: 10
  - Keywords groups: 14
  - Validated groups: 3
Job 399:
  - Highlights: 11
  - Keywords groups: 9
  - Validated groups: 3
Job 405:
  - Highlights: 20
  - Keywords groups: 5
  - Validated groups: 4
Job 434:
  - Highlights: 43
  - Keywords groups: 14
  - Validated groups: 8
Job 771:
  - Highlights: 30
  - Keywords groups: 10
  - Validated groups: 6


## Étape 1 : Vue d'ensemble quantitative


In [4]:
# À compléter : calculer métriques de base


## Étape 2 : Analyse qualitative par échantillonnage


In [10]:
def _flatten_validated_keywords(validated_groups) -> List[str]:
    """Return, in order, every keyword listed under a group's 'keywords' entry."""
    flat: List[str] = []
    for group in (validated_groups or {}).values():
        if 'keywords' in group:
            flat.extend(group['keywords'])
    return flat


def _flatten_extracted_keywords(keyword_groups) -> List[str]:
    """Return, in order, every extracted keyword across all groups and categories."""
    flat: List[str] = []
    for group in (keyword_groups or {}).values():
        if isinstance(group, dict):
            # Structured group: keywords are spread over three category lists
            for category in ('match_present', 'match_absent', 'mismatch_absent'):
                flat.extend(group.get(category) or [])
        else:
            # Group stored as a plain list of keywords
            flat.extend(group)
    return flat


def _print_section_header(title: str, leading_blank: bool = True) -> None:
    """Print a section title framed by two 80-character rules."""
    rule = '─' * 80
    print(f"\n{rule}" if leading_blank else rule)
    print(title)
    print(rule)


def _print_numbered(items: List[str]) -> None:
    """Print the items as a 1-based numbered list."""
    for i, item in enumerate(items, 1):
        print(f"{i:2d}. {item}")


def display_job_analysis(job_id: int, jobs: Optional[Dict[int, Dict[str, Any]]] = None) -> None:
    """Print the highlights, validated keywords and rejected keywords of a job.

    A keyword is "rejected" when it was extracted but its normalised form
    (lower-cased, stripped) matches no validated keyword. The retention rate is
    the share of extracted keywords that were not rejected, so the printed
    numerator and the rejected count always add up to the extracted total.

    Args:
        job_id: Identifier of the job to display.
        jobs: Mapping of job id to a job dict with the keys ``highlights``,
            ``keywords`` and ``keywords_validated``. Defaults to the
            module-level ``jobs_data``.
    """
    source = jobs_data if jobs is None else jobs
    data = source.get(job_id)
    if not data:
        print(f"Job {job_id} non trouvé")
        return

    print(f"{'='*80}")
    print(f"JOB {job_id}")
    print(f"{'='*80}\n")

    # 1. Highlights
    highlights = data['highlights'] or []
    _print_section_header(f"HIGHLIGHTS ({len(highlights)})", leading_blank=False)
    if highlights:
        _print_numbered(highlights)
    else:
        print("Aucun highlight")

    # 2. Validated keywords (manual selection)
    _print_section_header("KEYWORDS VALIDÉS (sélection manuelle)")
    validated_keywords = _flatten_validated_keywords(data['keywords_validated'])
    if validated_keywords:
        _print_numbered(validated_keywords)
        print(f"\nTotal: {len(validated_keywords)} keywords")
    else:
        print("Aucun keyword validé")

    # 3. Rejected keywords (extracted but not validated)
    _print_section_header("KEYWORDS REJETÉS (extraits mais non sélectionnés)")
    all_keywords = _flatten_extracted_keywords(data['keywords'])

    # Compare on a normalised form so case and padding differences do not count
    validated_normalized = {kw.lower().strip() for kw in validated_keywords}
    rejected_keywords = [kw for kw in all_keywords if kw.lower().strip() not in validated_normalized]

    if rejected_keywords:
        _print_numbered(rejected_keywords)
        print(f"\nTotal: {len(rejected_keywords)} keywords rejetés")
        # Count only extracted keywords that survived: validated keywords that
        # were never extracted must not inflate the rate.
        retained = len(all_keywords) - len(rejected_keywords)
        print(f"Taux de rétention: {retained}/{len(all_keywords)} = {retained/len(all_keywords)*100:.1f}%")
    elif all_keywords:
        print("Tous les keywords extraits ont été validés")
    else:
        # Nothing was extracted, so claiming that everything was validated would mislead
        print("Aucun keyword extrait")

    print(f"\n{'='*80}\n")

# Example: display the analysis for JOB_IDS[2], i.e. job 100 (job 18 would be JOB_IDS[0])
display_job_analysis(JOB_IDS[2])


JOB 100

────────────────────────────────────────────────────────────────────────────────
HIGHLIGHTS (13)
────────────────────────────────────────────────────────────────────────────────
 1. data pipeline
 2. cross-functional team
 3. Machine Learning
 4. text-to-speech
 5. speech-to-text,
 6. reinforcement learning 
 7. vision
 8. GCP
 9. Python
10. Agentic LLM pipeline
11. ANN, rerankers, feedbackloops, knowledge graphs
12. Classification Algorithms
13. minimum cost flow problems

────────────────────────────────────────────────────────────────────────────────
KEYWORDS VALIDÉS (sélection manuelle)
────────────────────────────────────────────────────────────────────────────────
 1. combinatorial optimization
 2. heuristics
 3. Bayesian optimization
 4. experiment tracking (WandB)
 5. data pipelines
 6. data engineering
 7. LangChain
 8. agent frameworks
 9. tool-augmented workflow
10. LangGraph

Total: 10 keywords

──────────────────────────────────────────────────────────────────────

## Génération du rapport complet


In [11]:
# Generate the report for all jobs
import sys  # kept: no longer needed here, but other cells may rely on it
from contextlib import redirect_stdout
from io import StringIO

output_file = Path("keywords_analysis_report.txt")

with open(output_file, 'w', encoding='utf-8') as f:
    for job_id in JOB_IDS:
        # display_job_analysis() prints its result, so capture stdout per job.
        # redirect_stdout restores sys.stdout even if the call raises, whereas
        # a manual save/restore would leave the session's stdout redirected.
        buffer = StringIO()
        with redirect_stdout(buffer):
            display_job_analysis(job_id)

        f.write(buffer.getvalue())

print(f"Rapport généré : {output_file}")


Rapport généré : keywords_analysis_report.txt


## Analyse des tendances - Keywords rejetés


In [12]:
# Gather the rejected keywords of every job
from collections import Counter

all_rejected = []

for job_id in JOB_IDS:
    job = jobs_data[job_id]

    # Validated keywords: each group keeps them under its 'keywords' entry
    validated_keywords = []
    for group in (job['keywords_validated'] or {}).values():
        if 'keywords' in group:
            validated_keywords += group['keywords']

    # Extracted keywords: a group is either a dict of category lists or a plain list
    all_keywords = []
    for group in (job['keywords'] or {}).values():
        if not isinstance(group, dict):
            all_keywords += group
            continue
        for category in ('match_present', 'match_absent', 'mismatch_absent'):
            if group.get(category):
                all_keywords += group[category]

    # Rejected = extracted keywords whose normalised form was not validated
    validated_normalized = {kw.lower().strip() for kw in validated_keywords}
    all_rejected += [
        kw for kw in all_keywords
        if kw.lower().strip() not in validated_normalized
    ]

# Report the trends
print(f"Total de keywords rejetés (tous jobs): {len(all_rejected)}")
print(f"\nTop 20 keywords les plus souvent rejetés:")
print("=" * 80)

most_common = Counter(all_rejected).most_common(20)
for kw, count in most_common:
    print(f"{count:2d}x - {kw}")

unique_rejected = set(all_rejected)
print(f"\n\nKeywords rejetés uniques: {len(unique_rejected)}")


Total de keywords rejetés (tous jobs): 334

Top 20 keywords les plus souvent rejetés:
 5x - vector databases
 5x - semantic search
 4x - APIs
 3x - root cause analysis
 3x - PyTorch
 3x - feedback loops
 3x - Python
 3x - RAG
 3x - embeddings
 2x - JAX
 2x - AWS
 2x - CI/CD
 2x - BigQuery
 2x - context management
 2x - vector search
 2x - scalable infrastructure
 2x - cloud infrastructure
 2x - containers
 2x - CI/CD workflows
 2x - production deployment of GenAI features


Keywords rejetés uniques: 293


Testing on job: 18 - Applied ML/AI Engineer - Monitoring

Extracting keywords with simplified agent...
✅ Chargement du modèle OpenAI : gpt-4o
Keyword extraction took 8.88 seconds

✅ Extraction réussie!
Nombre de keywords extraits: 53

Premiers keywords:
   1. data observability
   2. data quality
   3. machine learning
   4. time series forecasting
   5. intelligent alerting
   6. generative AI
   7. data profiling
   8. root cause analysis
   9. metadata
  10. Python 3
  ... et 43 autres


## Structure d'évaluation et métriques


In [None]:
# Smoke test of the evaluation structure and of the metric helpers
from jobseeker_agent.customizer.evaluation.keyword_evaluator import (
    KeywordEvaluationResult,
    calculate_metrics,
    add_metrics_to_result,
    print_evaluation_summary
)

# Hand-written evaluation result (simulated, for testing only)
true_positives = [
    {"proposed": "agentic workflows", "matched_with": "AI agents", "confidence": 0.9},
    {"proposed": "LangChain", "matched_with": "LangChain", "confidence": 1.0},
    {"proposed": "prompt engineering", "matched_with": "prompt engineering", "confidence": 1.0},
]
false_positives = [
    {"proposed": "AWS", "reason": "not in ground truth"},
    {"proposed": "APIs", "reason": "not in ground truth"},
    {"proposed": "vector databases", "reason": "not in ground truth"},
]
false_negatives = [
    {"ground_truth": "context injection", "reason": "not proposed by agent"},
    {"ground_truth": "tracing", "reason": "not proposed by agent"},
]
example_result: KeywordEvaluationResult = {
    "true_positives": true_positives,
    "false_positives": false_positives,
    "false_negatives": false_negatives,
    "metrics": {},  # filled in by add_metrics_to_result below
}

# Compute the metrics, then show the summary
result_with_metrics = add_metrics_to_result(example_result)
print_evaluation_summary(result_with_metrics, job_id=999)

# Cross-check by hand: the example above holds 3 TP, 3 FP and 2 FN
expected_precision = 3 / (3 + 3)
expected_recall = 3 / (3 + 2)

print("Vérification des calculs:")
for label, bucket in (
    ("TP", "true_positives"),
    ("FP", "false_positives"),
    ("FN", "false_negatives"),
):
    print(f"{label}: {len(example_result[bucket])}")
print(f"Precision attendue: {expected_precision:.4f} = {result_with_metrics['metrics']['precision']}")
print(f"Recall attendu: {expected_recall:.4f} = {result_with_metrics['metrics']['recall']}")


In [13]:
# Manual categorisation of the rejection patterns
# To be refined after reading the results above

# Suspect patterns (examples, to adjust according to the results)
patterns = {
    "Infrastructure/DevOps générique": ["CI/CD", "containers", "cloud infrastructure", "Docker", "Kubernetes", "AWS", "GCP"],
    "Soft skills vagues": ["collaboration", "best practices", "user interactions"],
    "Termes trop généraux": ["scalability", "monitoring", "APIs", "performance", "maintainability"],
    "Outils spécifiques": ["Prometheus", "Loki", "Grafana", "Sentry", "BigQuery", "Snowflake"],
    "Déploiement/Production": ["deploy", "deployment", "production", "serving", "MLOps", "LLMOps"],
}

print("Analyse par catégories:\n")
print("=" * 80)

for category, keywords in patterns.items():
    # Case-insensitive substring match: a rejected keyword belongs to the
    # category as soon as it contains one of the category's patterns.
    needles = [pattern.lower() for pattern in keywords]
    matches = []
    for kw in all_rejected:
        lowered = kw.lower()
        if any(needle in lowered for needle in needles):
            matches.append(kw)

    if not matches:
        continue

    print(f"\n{category} ({len(matches)} occurrences):")
    # Show the ten most frequent matching keywords
    for kw, count in Counter(matches).most_common(10):
        print(f"  {count}x - {kw}")


Analyse par catégories:


Infrastructure/DevOps générique (16 occurrences):
  2x - AWS
  2x - CI/CD
  2x - cloud infrastructure
  2x - containers
  2x - CI/CD workflows
  1x - Kubernetes (AWS EKS)
  1x - MySQL (AWS RDS)
  1x - GCP
  1x - AWS Bedrock
  1x - CI/CD integration

Soft skills vagues (4 occurrences):
  1x - best practices
  1x - user interactions
  1x - cross-functional collaboration with product and design
  1x - Collaboration with electronics/software/system teams

Termes trop généraux (22 occurrences):
  4x - APIs
  2x - model monitoring
  1x - monitoring engine
  1x - model evaluation / monitoring
  1x - metadata (monitoring-related)
  1x - scalability
  1x - write APIs
  1x - evaluation & quality monitoring
  1x - quality monitoring
  1x - performance analysis post-release

Outils spécifiques (7 occurrences):
  2x - BigQuery
  1x - Prometheus
  1x - Loki
  1x - Grafana
  1x - Sentry
  1x - Snowflake

Déploiement/Production (18 occurrences):
  2x - production deployment o

## Étape 3 : Identification des problèmes


In [None]:
# À compléter : synthétiser les patterns d'erreurs


## Notes et observations

### Patterns d'erreurs identifiés

1. ...
2. ...
3. ...

### Hypothèses

- ...
- ...
