In [43]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np 
import json
# Checkpoint identifier used to load both the tokenizer and the model.
MODEL_NAME = "nlpaueb/bert-base-uncased-eurlex"

In [44]:
# Load the tokenizer and the encoder for MODEL_NAME.
# Both names are read as globals by get_embeddings(), so they must exist before it is called.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [45]:
def get_embeddings(text, method="cls"):
    """Return one embedding vector per input sequence.

    Args:
        text: Input passed unchanged to the module-level ``tokenizer``
            (called with padding and truncation enabled).
        method: Pooling strategy. ``"cls"`` returns the hidden state at
            position 0 of each sequence (the [CLS] token); ``"mean"`` averages
            the hidden states of the positions kept by the attention mask.

    Returns:
        torch.Tensor of shape (batch_size, hidden_dim).

    Raises:
        ValueError: If ``method`` is neither ``"cls"`` nor ``"mean"``.
    """
    # Fail fast: reject an unknown method before paying for tokenization
    # and a forward pass through the model.
    if method not in ("cls", "mean"):
        raise ValueError("Méthode non reconnue : choisir 'cls' ou 'mean'.")

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    last_hidden_state = outputs.last_hidden_state  # (batch_size, seq_len, hidden_dim)

    if method == "cls":
        return last_hidden_state[:, 0, :]

    # Mean pooling: zero out padded positions, then divide by the number of
    # real tokens in each sequence.
    mask = inputs["attention_mask"].unsqueeze(-1).to(last_hidden_state.dtype)
    summed = (last_hidden_state * mask).sum(dim=1)
    # The clamp only matters for an all-zero mask row, where it avoids a 0/0 NaN.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / token_counts

In [71]:
# Load the category definitions; 'utf-8-sig' also accepts a file that starts with a BOM.
with open('../outputs/categories.json', 'r', encoding='utf-8-sig') as f:
    data = json.load(f)

# Maps "<category>_<subcategory>" to the embedding of that subcategory's description.
embeddings_dict = {}

# Compute one embedding per (category, subcategory) pair.
for category, subcategories in data.items():
    for subcategory, details in subcategories.items():
        sentences = details['sentences']
        # NOTE(review): 'sentences' is presumably a list of strings — confirm against
        # categories.json. Interpolating a list directly would feed its Python repr
        # (brackets, quotes, commas) to the model, so join the items instead.
        # A plain string is used unchanged.
        if isinstance(sentences, (list, tuple)):
            sentences = " ".join(str(sentence) for sentence in sentences)
        text = f"{category} {subcategory} {details['Scope']} {sentences}"
        embedding = get_embeddings(text, method="mean")  # "cls" is the other supported method
        embeddings_dict[f"{category}_{subcategory}"] = embedding.numpy()

In [72]:
# Save the embeddings to a .npy file.
# NOTE: embeddings_dict is a plain dict, so NumPy wraps it in a 0-d object array and
# pickles it; reading it back requires np.load(..., allow_pickle=True).item().
np.save('../outputs/category_embeddings.npy', embeddings_dict)

In [73]:
# Reload the saved dict: allow_pickle=True is required because the file stores a pickled
# Python object, and .item() unwraps the 0-d object array back into the dict.
# NOTE(review): unpickling can execute arbitrary code — only load files you produced yourself.
category_embeddings = np.load('../outputs/category_embeddings.npy', allow_pickle=True).item()

In [74]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def classify_question_top_5(question, category_embeddings, method="mean", top_k=5):
    """Rank categories by cosine similarity to ``question`` and keep the best ones.

    Args:
        question: Text to classify; it is embedded with ``get_embeddings``.
        category_embeddings: Mapping of category label -> NumPy array. Each
            array is flattened to a single row vector before comparison.
        method: Pooling method forwarded to ``get_embeddings``.
        top_k: Maximum number of categories to return. Defaults to 5, which
            matches the function's original behavior.

    Returns:
        A list of ``(label, similarity)`` tuples sorted by decreasing
        similarity, containing at most ``top_k`` entries.

    Raises:
        ValueError: If ``top_k`` is negative (a negative slice bound would
            silently drop results from the end instead of limiting them).
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative")

    # Embed the question once and reuse the same row vector for every comparison.
    question_vector = get_embeddings(question, method=method).numpy().reshape(1, -1)

    # cosine_similarity returns a 1x1 matrix for two row vectors; [0][0] extracts the scalar.
    similarities = {
        label: cosine_similarity(question_vector, category_embedding.reshape(1, -1))[0][0]
        for label, category_embedding in category_embeddings.items()
    }

    # sorted() is stable, so categories with equal scores keep their mapping order.
    ranked = sorted(similarities.items(), key=lambda item: item[1], reverse=True)
    return ranked[:top_k]

# Example question
# NOTE(review): the output saved with this cell shows a different question (about renewal
# fees during a stay of proceedings), so it comes from an earlier run — re-run the cell
# to refresh it.
question = "You represent a client whose European patent has been opposed. The opposition division decides to maintain the patent in amended form. The invitation with the three-month time limit under Rule 82(2) EPC is sent on 16 August 2021, but proceedings are interrupted under Rule 142 EPC on 20 October 2021. The problems are resolved and the proceedings resume on 16 January 2022. When does the time limit under Rule 82(2)EPC end?"
# Classify the question and get the 5 best-matching categories
top_5_categories = classify_question_top_5(question, category_embeddings, method="mean")

# Print the results, ranked from 1
print(f"Question: {question}")
for i, (category, similarity_score) in enumerate(top_5_categories, 1):
    print(f"{i}. Category: {category} with similarity score: {similarity_score}")

Question: When proceedings for grant of a European patent application are stayed following the institution of entitlement proceedings, renewal fees which fall due during the period of stay can be paid up to the date of resumption of the proceedings for grant.
1. Category: Entitlement and transfers_Entitlement disputes (article 61 EPC) with similarity score: 0.8565102815628052
2. Category: Entitlement and transfers_Procedural consequences with similarity score: 0.8402036428451538
3. Category: Procedural remedies and legal effect_Re-establishment of rights (article 122 epc) with similarity score: 0.8284813761711121
4. Category: Opposition and appeals_Appeal proceedings with similarity score: 0.8225505352020264
5. Category: Procedural remedies and legal effect_Further processing (rule 135 epc) with similarity score: 0.8224268555641174
