In [21]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np 
import json
# Checkpoint identifier passed to `from_pretrained` for both the tokenizer and the model.
MODEL_NAME = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"

In [22]:
# Load the tokenizer and the model for MODEL_NAME. Both objects are read as
# module-level globals by get_embeddings().
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [23]:
def get_embeddings(text, method="cls"):
    """Embed ``text`` with the module-level ``tokenizer`` and ``model``.

    Args:
        text: Input handed directly to ``tokenizer`` (padding and truncation
            are enabled).
        method: Pooling strategy.
            ``"cls"``  -- hidden state at sequence position 0, returned as is
            (no normalisation is applied on this path).
            ``"mean"`` -- attention-mask-weighted mean over token positions,
            followed by L2 normalisation.

    Returns:
        torch.Tensor of shape (batch_size, hidden_dim).

    Raises:
        ValueError: If ``method`` is neither ``"cls"`` nor ``"mean"``. Raised
            before any tokenisation or model call.
    """
    # Validate first so a typo in `method` fails immediately instead of after
    # a full tokenisation + forward pass.
    if method not in ("cls", "mean"):
        raise ValueError("Méthode non reconnue : choisir 'cls' ou 'mean'.")

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    last_hidden_state = outputs.last_hidden_state  # (batch_size, seq_len, hidden_dim)

    if method == "cls":
        # Hidden state of the first position of every sequence.
        # NOTE(review): whether position 0 holds a dedicated classification
        # token depends on the tokenizer/model in use -- confirm for MODEL_NAME.
        return last_hidden_state[:, 0, :]

    # method == "mean": average only over positions the attention mask keeps.
    attention_mask = inputs["attention_mask"].unsqueeze(-1).to(last_hidden_state.dtype)
    # Clamp so a sequence whose mask sums to zero yields a zero vector rather
    # than 0/0 = NaN.
    token_count = attention_mask.sum(dim=1).clamp(min=1)
    embeddings = (last_hidden_state * attention_mask).sum(dim=1) / token_count

    return torch.nn.functional.normalize(embeddings, p=2, dim=1)

In [None]:
# Read the category taxonomy; the 'utf-8-sig' codec skips a leading BOM when present.
with open('../outputs/categories.json', 'r', encoding='utf-8-sig') as f:
    data = json.load(f)

# Maps "<category>_<subcategory>" to the NumPy embedding of that subcategory's description.
embeddings_dict = {}

for category in data:
    subcategories = data[category]
    for subcategory, details in subcategories.items():
        # The embedded text is the category name, the subcategory name, its
        # 'Scope' field and its 'sentences' field, separated by single spaces.
        text = f"{category} {subcategory} {details['Scope']} {details['sentences']}"
        embedding = get_embeddings(text, method="mean")  # "cls" is the other available pooling
        embeddings_dict.update({f"{category}_{subcategory}": embedding.numpy()})

In [25]:
# Persist the embeddings to a .npy file. `embeddings_dict` is a plain dict, so
# NumPy stores it as a pickled object; reading it back requires allow_pickle=True.
np.save('../outputs/category_gte-Qwen2-1.5B-instruct.npy', embeddings_dict)

In [26]:
# Reload the saved embeddings; `.item()` unwraps the 0-d object array back into the dict.
# allow_pickle=True unpickles the file contents, so only load files produced by this notebook.
category_embeddings = np.load('../outputs/category_gte-Qwen2-1.5B-instruct.npy', allow_pickle=True).item()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def classify_question_top_5(question, category_embeddings, method="mean", top_k=5):
    """Rank categories by cosine similarity to ``question``.

    Args:
        question: Text to classify; embedded with ``get_embeddings``.
        category_embeddings: Mapping from a key to a NumPy embedding. Only the
            part of each key after its last underscore is kept as the result
            name, so keys sharing that final part overwrite one another.
        method: Pooling method forwarded to ``get_embeddings``.
        top_k: Number of best matches to return (default 5, the original
            fixed cut-off).

    Returns:
        List of ``(name, similarity)`` tuples sorted by decreasing similarity,
        at most ``top_k`` long.
    """
    # The question vector does not change between categories: convert and
    # reshape it once instead of on every loop iteration.
    question_vector = get_embeddings(question, method=method).numpy().reshape(1, -1)

    similarities = {}
    for category, category_embedding in category_embeddings.items():
        score = cosine_similarity(question_vector, category_embedding.reshape(1, -1))[0][0]
        similarities[category.split("_")[-1]] = score

    # Highest similarity first, then keep the requested number of entries.
    ranked = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
    return ranked[:top_k]

In [None]:

# Ground-truth labels, looked up below by dataset name and then by question id.
with open('../outputs/test_categories.json', 'r', encoding='utf-8') as f:
    test_categoies = json.load(f)

with open('../outputs/2022_PreEx_open.json', 'r', encoding='utf-8') as f:
    PreEx_open = json.load(f)

# Tallies for this dataset: label ranked first, label at ranks 2-5, everything else.
score_1 = 0
score_other = 0
failed = 0

# Running totals across datasets; they are reset to zero here.
global_score_1 = 0
global_score_other = 0
global_failed = 0

for question_id, question_text in PreEx_open.items():
    subcategory = test_categoies.get("2022_PreEx_open", {}).get(f"{question_id}", {}).get("Subcategory")

    if subcategory is None:
        print(f"Warning: Subcategory not found for question ID: {question_id}")
        failed += 1  # questions without a label are counted as failures
        continue

    top_5_categories = classify_question_top_5(question_text, category_embeddings, method="mean")

    # Keep only the names from the (name, similarity) tuples.
    top_5_category_names = [item[0] for item in top_5_categories]
    print(subcategory)
    print(top_5_category_names)

    # Compare case-insensitively: in the recorded output the predicted names
    # are lower-cased ("... article 100 epc") while the labels keep "EPC", so
    # an exact string comparison can never match such labels.
    expected_name = str(subcategory).casefold()
    ranked_names = [str(name).casefold() for name in top_5_category_names]

    # Slicing (instead of indexing [0]) keeps an empty ranking from raising.
    if ranked_names[:1] == [expected_name]:
        score_1 += 1
    elif expected_name in ranked_names[1:5]:
        score_other += 1
    else:
        failed += 1

    print(score_1, score_other, failed)

print(f"Score 1: {score_1}")
print(f"Score Other: {score_other}")
print(f"Failed: {failed}")
global_score_1 += score_1
global_score_other += score_other
global_failed += failed

Entitlement disputes (article 61 EPC)
['Grounds for opposition (article 100 epc)', 'Opposition procedure and admissibility', 'Appeal proceedings', 'Sequence listing filing and format', 'Added subject-matter in biotech claims']
0 0 1
Opposition procedure and admissibility
['Minimum requirements for a filing date', 'Filing methods and locations', 'Types and calculation of fees', 'Payment mechanisms', 'Fee deadlines and late payment consequences']
0 0 2
Appeal proceedings
['Examination procedure and communications', 'Claim amendments and article 123 epc', 'Grant stage (rule 71(3) epc) and post-grant publication', 'Novelty analysis', 'Inventive step analysis']
0 0 3
Transfers and assignments
['Formality examination', 'Filing requirements', 'Subject-matter and scope', 'Substantive requirements for priority', 'Time limits and restoration']
0 0 4
Language of filing and procedural language
['Grounds for opposition (article 100 epc)', 'Opposition procedure and admissibility', 'Appeal proceeding

In [None]:
with open('../outputs/2021_PreEx_open.json', 'r', encoding='utf-8') as f:
    PreEx_open = json.load(f)

# Tallies for this dataset: label ranked first, label at ranks 2-5, everything else.
score_1 = 0
score_other = 0
failed = 0

for question_id, question_text in PreEx_open.items():
    subcategory = test_categoies.get("2021_PreEx_open", {}).get(f"{question_id}", {}).get("Subcategory")

    if subcategory is None:
        print(f"Warning: Subcategory not found for question ID: {question_id}")
        failed += 1  # questions without a label are counted as failures
        continue

    top_5_categories = classify_question_top_5(question_text, category_embeddings, method="mean")

    # Keep only the names from the (name, similarity) tuples.
    top_5_category_names = [item[0] for item in top_5_categories]
    print(subcategory)
    print(top_5_category_names)

    # Compare case-insensitively: in the recorded output the predicted names
    # are lower-cased ("... article 123 epc") while labels may carry
    # upper-case acronyms, so an exact comparison would miss those matches.
    expected_name = str(subcategory).casefold()
    ranked_names = [str(name).casefold() for name in top_5_category_names]

    # Slicing (instead of indexing [0]) keeps an empty ranking from raising.
    if ranked_names[:1] == [expected_name]:
        score_1 += 1
    elif expected_name in ranked_names[1:5]:
        score_other += 1
    else:
        failed += 1

    print(score_1, score_other, failed)


print(f"Score 1: {score_1}")
print(f"Score Other: {score_other}")
print(f"Failed: {failed}")
# NOTE: these totals are only incremented here, never reset in this cell, so
# running the cell twice adds this dataset's counts twice.
global_score_1 += score_1
global_score_other += score_other
global_failed += failed

Fee deadlines and late payment consequences
['Examination procedure and communications', 'Claim amendments and article 123 epc', 'Grant stage (rule 71(3) epc) and post-grant publication', 'Novelty analysis', 'Inventive step analysis']
0 0 1
Time limits and restoration
['Formality examination', 'Filing requirements', 'Subject-matter and scope', 'Substantive requirements for priority', 'Time limits and restoration']
0 1 1
Filing requirements
['Formality examination', 'Filing requirements', 'Subject-matter and scope', 'Substantive requirements for priority', 'Time limits and restoration']
0 2 1
International filing and search
['Formality examination', 'Filing requirements', 'Subject-matter and scope', 'Substantive requirements for priority', 'Time limits and restoration']
0 2 2
Opposition procedure and admissibility
['Language of filing and procedural language', 'Translation requirements on grant or other stages', 'Effects of language on costs and procedural rights', 'Formality examinatio

In [None]:
with open('../outputs/OEB_SUP_mcq.json', 'r', encoding='utf-8') as f:
    PreEx_open = json.load(f)

# Tallies for this dataset: label ranked first, label at ranks 2-5, everything else.
score_1 = 0
score_other = 0
failed = 0

for question_id, question in PreEx_open.items():
    subcategory = test_categoies.get("OEB_SUP_mcq", {}).get(f"{question_id}", {}).get("Subcategory")

    if subcategory is None:
        print(f"Warning: Subcategory not found for question ID: {question_id}")
        failed += 1  # questions without a label are counted as failures
        continue

    # Each entry is a mapping; the classified text is its "question" value
    # followed by its "options" value. Single quotes inside the f-string keep
    # this line valid on Python versions older than 3.12.
    question_text = f"{question.get('question', {})} {question.get('options', {})}"
    top_5_categories = classify_question_top_5(question_text, category_embeddings, method="mean")

    # Keep only the names from the (name, similarity) tuples.
    top_5_category_names = [item[0] for item in top_5_categories]
    print(subcategory)
    print(top_5_category_names)

    # Compare case-insensitively: in the recorded output the predicted names
    # are lower-cased ("... article 123 epc") while labels may carry
    # upper-case acronyms, so an exact comparison would miss those matches.
    expected_name = str(subcategory).casefold()
    ranked_names = [str(name).casefold() for name in top_5_category_names]

    # Slicing (instead of indexing [0]) keeps an empty ranking from raising.
    if ranked_names[:1] == [expected_name]:
        score_1 += 1
    elif expected_name in ranked_names[1:5]:
        score_other += 1
    else:
        failed += 1

    print(score_1, score_other, failed)


print(f"Score 1: {score_1}")
print(f"Score Other: {score_other}")
print(f"Failed: {failed}")
# NOTE: these totals are only incremented here, never reset in this cell, so
# running the cell twice adds this dataset's counts twice.
global_score_1 += score_1
global_score_other += score_other
global_failed += failed


# Totals accumulated over all evaluated datasets (same labels as the
# per-dataset lines printed just above).
print(f"Score 1: {global_score_1}")
print(f"Score Other: {global_score_other}")
print(f"Failed: {global_failed}")

Effects of language on costs and procedural rights
['Examination procedure and communications', 'Claim amendments and article 123 epc', 'Grant stage (rule 71(3) epc) and post-grant publication', 'Unity of invention', 'Unity in pct applications']
0 0 1
Loss of rights and remedies
['Formality examination', 'Filing requirements', 'Subject-matter and scope', 'Examination procedure and communications', 'Claim amendments and article 123 epc']
0 0 2
Loss of rights and remedies
['Examination procedure and communications', 'Claim amendments and article 123 epc', 'Grant stage (rule 71(3) epc) and post-grant publication', 'Unity of invention', 'Unity in pct applications']
0 0 3
International filing and search
['Examination procedure and communications', 'Claim amendments and article 123 epc', 'Grant stage (rule 71(3) epc) and post-grant publication', 'Unity of invention', 'Unity in pct applications']
0 0 4
Transfers and assignments
['Examination procedure and communications', 'Claim amendments an