In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np 
import json
# Charger le modèle et le tokenizer
MODEL_NAME = "thenlper/gte-small"

In [2]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [6]:
def get_embeddings(text, method="cls"):
    """Extrait l'embedding d'une phrase selon la méthode spécifiée."""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    
    last_hidden_state = outputs.last_hidden_state  # (batch_size, seq_len, hidden_dim)
    
    if method == "cls":
        embeddings = last_hidden_state[:, 0, :]
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

        return  embeddings # Embedding du token [CLS]
    elif method == "mean":
        attention_mask = inputs["attention_mask"].unsqueeze(-1)
        embeddings = (last_hidden_state * attention_mask).sum(dim=1) / attention_mask.sum(dim=1)

        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
        return embeddings  # Moyenne des embeddings des tokens
    else:
        raise ValueError("Méthode non reconnue : choisir 'cls' ou 'mean'.")

In [7]:
# Charger le JSON avec un encodage spécifique
with open('../outputs/categories.json', 'r', encoding='utf-8-sig') as f:
    data = json.load(f)

# Créer un dictionnaire pour stocker les embeddings
embeddings_dict = {}

# Extraire les embeddings pour chaque catégorie
for category, subcategories in data.items():
    for subcategory, details in subcategories.items():
        text = f"{category} {subcategory} {details['Scope']} {details['sentences']}"
        embedding = get_embeddings(text, method="cls")  # Ou "cls" selon ce que tu préfères
        embeddings_dict[f"{category}_{subcategory}"] = embedding.numpy()

In [8]:
# Sauvegarder les embeddings dans un fichier .npy
np.save('../outputs/category_gte-Qwen2-1.5B-instruct.npy', embeddings_dict)

In [9]:
category_embeddings = np.load('../outputs/category_gte-Qwen2-1.5B-instruct.npy', allow_pickle=True).item()

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

def classify_question_top_5(question, category_embeddings, method="cls"):
    """Classifie la question et retourne les 5 catégories les plus similaires."""
    # Extraire l'embedding de la question
    question_embedding = get_embeddings(question, method=method)
    
    # Comparer la question avec chaque catégorie
    similarities = {}
    for category, category_embedding in category_embeddings.items():
        similarity = cosine_similarity(question_embedding.numpy().reshape(1, -1), category_embedding.reshape(1, -1))[0][0]
        
        similarities[category] = similarity

    # Trier les catégories par similarité décroissante et récupérer les 5 meilleures
    
    sorted_categories = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:15]
    return sorted_categories

In [11]:

with open('../outputs/test_categories.json', 'r', encoding='utf-8') as f:
    test_categoies = json.load(f)

with open('../outputs/2022_PreEx_open.json', 'r', encoding='utf-8') as f:
    PreEx_open = json.load(f)

score_1 = 0
score_2 = 0
score_3 = 0
score_4 = 0
failed = 0

global_score_1 = 0
global_score_2 = 0
global_score_3 = 0
global_score_4 = 0
global_failed = 0

for question_id, question_text in PreEx_open.items():
    category = test_categoies.get("2022_PreEx_open", {}).get(f"{question_id}", {}).get("Category")

    if category is None:
        print(f"Warning: Subcategory not found for question ID: {question_id}")
        failed +=1
        continue

    top_5_categories = classify_question_top_5(question_text, category_embeddings, method="cls")
  
    top_5_category_names = [item[0] for item in top_5_categories]

    if top_5_category_names[0].split("_")[0] == category:
        score_1 += 1
    elif top_5_category_names[1].split("_")[0] == category:
        score_2 += 1
    elif top_5_category_names[2].split("_")[0] == category:
        score_3 += 1
    elif top_5_category_names[3].split("_")[0] == category:
        score_4 += 1
    else:
        print(category)
        failed += 1

print(f"🥇:{score_1} 🥈:{score_2} 🥉:{score_3} 🏅:{score_4} ❌:{failed}")
print("--- Fin des résultats ---")




global_score_1 += score_1
global_score_2 += score_2
global_score_3 += score_3
global_score_4 += score_4
global_failed += failed

Entitlement and transfers
Divisional applications
🥇:6 🥈:1 🥉:1 🏅:0 ❌:2
--- Fin des résultats ---


In [12]:
with open('../outputs/2021_PreEx_open.json', 'r', encoding='utf-8') as f:
    PreEx_open = json.load(f)

score_1 = 0
score_2 = 0
score_3 = 0
score_4 = 0
failed = 0

for question_id, question_text in PreEx_open.items():
    category = test_categoies.get("2021_PreEx_open", {}).get(f"{question_id}", {}).get("Category")

    if category is None:
        print(f"Warning: Subcategory not found for question ID: {question_id}")
        failed +=1
        continue

    top_5_categories = classify_question_top_5(question_text, category_embeddings, method="cls")

    # Extract the category names from the tuples
    top_5_category_names = [item[0] for item in top_5_categories]

    if top_5_category_names[0].split("_")[0] == category:
        score_1 += 1
    elif top_5_category_names[1].split("_")[0] == category:
        score_2 += 1
    elif top_5_category_names[2].split("_")[0] == category:
        score_3 += 1
    elif top_5_category_names[3].split("_")[0] == category:
        score_4 += 1
    else:
        print(category)
        failed += 1

print(f"🥇:{score_1} 🥈:{score_2} 🥉:{score_3} 🏅:{score_4} ❌:{failed}")
print("--- Fin des résultats ---")


global_score_1 += score_1
global_score_2 += score_2
global_score_3 += score_3
global_score_4 += score_4
global_failed += failed

print(f"🥇:{global_score_1} 🥈:{global_score_2} 🥉:{global_score_3} 🏅:{global_score_4} ❌:{global_failed}")
print("--- Fin des résultats ---")

Fees, payment methods, and time limits
Priority claims and right of priority
🥇:5 🥈:0 🥉:3 🏅:0 ❌:2
--- Fin des résultats ---
🥇:11 🥈:1 🥉:4 🏅:0 ❌:4
--- Fin des résultats ---


In [13]:
with open('../outputs/OEB_SUP_mcq.json', 'r', encoding='utf-8') as f:
    PreEx_open = json.load(f)

score_1 = 0
score_2 = 0
score_3 = 0
score_4 = 0
failed = 0  # Corrected typo: faild -> failed
for question_id, question in PreEx_open.items():
    category = test_categoies.get("OEB_SUP_mcq", {}).get(f"{question_id}", {}).get("Category")
    if category is None:
        print(f"Warning: Subcategory not found for question ID: {question_id}")
        failed +=1
        continue
    question_text = f"{question.get("question", {})} {question.get("options", {})}" 
    top_5_categories = classify_question_top_5(question_text, category_embeddings, method="mean")

    # Extract the category names from the tuples
    top_5_category_names = [item[0] for item in top_5_categories]

    if top_5_category_names[0].split("_")[0] == category:
        score_1 += 1
    elif top_5_category_names[1].split("_")[0] == category:
        score_2 += 1
    elif top_5_category_names[2].split("_")[0] == category:
        score_3 += 1
    elif top_5_category_names[3].split("_")[0] == category:
        score_4 += 1
    else:
        failed += 1

print(f"🥇:{score_1} 🥈:{score_2} 🥉:{score_3} 🏅:{score_4} ❌:{failed}")
print("--- Fin des résultats ---")


global_score_1 += score_1
global_score_2 += score_2
global_score_3 += score_3
global_score_4 += score_4
global_failed += failed
print(f"🥇:{global_score_1} 🥈:{global_score_2} 🥉:{global_score_3} 🏅:{global_score_4} ❌:{global_failed}")
print("--- Fin des résultats ---")

🥇:18 🥈:11 🥉:3 🏅:3 ❌:13
--- Fin des résultats ---
🥇:29 🥈:12 🥉:7 🏅:3 ❌:17
--- Fin des résultats ---
