In [1]:
import json
from sklearn.metrics.pairwise import cosine_similarity

In [2]:

# Parse case_law_of_the_board.json into `data`. The encoding is passed
# explicitly so the read does not depend on the platform's default locale
# encoding (JSON text is UTF-8).
with open('data/clean/json/case_law_of_the_board.json', encoding='utf-8') as f:
    data = json.load(f)


In [3]:
def flatten_json(json_obj, parent_title='', parent_number=''):
    """
    Flatten a nested JSON structure into a list of context-prefixed strings.

    Dictionaries are read through the keys ``article_title``,
    ``article_number``, ``main_article`` and ``sub_articles``. For every
    dictionary with a non-empty ``main_article`` one string of the form
    ``"<full title> : <main_article>"`` is emitted, where the full title is
    the parent's full title followed by this node's number and title.
    Sub-articles are visited recursively, in order, right after their parent.

    :param json_obj: Dict or list (possibly nested) to flatten. Any other
        type contributes nothing to the result.
    :param parent_title: Full title of the enclosing article, used as context.
    :param parent_number: Number of the enclosing article. It is only passed
        along through the recursion and does not affect the output.
    :return: List of flattened strings with their title context.
    """
    items = []

    if isinstance(json_obj, dict):
        article_title = json_obj.get("article_title", "")
        article_number = json_obj.get("article_number", "")
        main_article = json_obj.get("main_article", "")

        # Join only the parts that are actually present, so that a missing
        # number or title does not leave a doubled space (or a literal
        # "None") inside the context prefix.
        title_parts = (parent_title, article_number, article_title)
        full_title = " ".join(
            str(part) for part in title_parts if part not in ("", None)
        ).strip(", ")
        if main_article:
            items.append(f"{full_title} : {main_article}")

        # "sub_articles" may be absent or null; both are treated as empty.
        for sub_article in json_obj.get("sub_articles") or []:
            items.extend(flatten_json(sub_article, full_title, article_number))

    elif isinstance(json_obj, list):
        for item in json_obj:
            items.extend(flatten_json(item, parent_title, parent_number))

    return items

In [4]:
# Flatten the parsed JSON into context-prefixed strings. The guard keeps this
# cell safe to re-run: flatten_json() returns [] for a list of plain strings,
# so flattening the already-flattened list would silently empty `data`.
if not (isinstance(data, list) and all(isinstance(entry, str) for entry in data)):
    data = flatten_json(data)




In [5]:
# Preview the first ten flattened paragraphs, each followed by a blank line.
# Slicing (instead of indexing 0..9) avoids an IndexError when fewer than ten
# paragraphs are available.
for paragraph in data[:10]:
    print(paragraph)
    print()

I. PATENTABILITY 1. Patent protection for technical inventions :  The four essential preconditions governing the patentability of inventions under the EPC are laid down inArt. 52(1) EPC, which reads: "European patents shall be granted for any inventions, in all fields of technology, provided that they are new, involve an inventive step and are susceptible of industrial application." Art. 52(1) EPCexpresses the fundamental principle of a general entitlement to patent protection for any invention in all technical fields (seeG 5/83,OJ 1985, 64, point 21 et seq. of the Reasons;G 1/98,OJ 2000, 111, point 3.9 of the Reasons;G 1/03,OJ 2004, 413, point 2.2.2 of the Reasons;G 1/04,OJ 2006, 334, point 6 of the Reasons;T 154/04;OJ 2008, 46, 62, point 6 of the Reasons). Any limitation to the general entitlement to patent protection is thus not a matter of administrative or judicial discretion, but must have a clear legal basis in the EPC (seeG 2/12, of 25.03.2015,OJ 2016, A28;T 154/04).

I. PATENT

In [13]:
import ollama
import h5py
import nltk
# nltk.download('stopwords')
# from nltk.corpus import stopwords
import numpy as np


In [None]:
# stop_words = set(stopwords.words('english'))

# import re

# data_clean = []

# def clean_text(text):
#     """
#     Custom function to clean text by removing special characters, extra spaces, and digits.
#     """
#     text = re.sub(r'[^a-zA-Z\s]', '', text)  
#     text = re.sub(r'\s+', ' ', text).strip()
#     return text

# for line in data:
#     line = line.lower()
#     line = clean_text(line)
#     line = ' '.join([word for word in line.split() if word not in stop_words])  # Remove stopwords
#     data_clean.append(line)


In [None]:
# for i in range(10):
#     print(data_clean[i])
#     print()

patentability patent protection technical inventions four essential preconditions governing patentability inventions epc laid inart epc reads european patents shall granted inventions fields technology provided new involve inventive step susceptible industrial application art epcexpresses fundamental principle general entitlement patent protection invention technical fields seeg oj point et seq reasonsg oj point reasonsg oj point reasonsg oj point reasonst oj point reasons limitation general entitlement patent protection thus matter administrative judicial discretion must clear legal basis epc seeg oj

patentability whether invention art epc art epccontains nonexhaustive list ofnoninventions subjectmatter activities regarded inventions within meaning paragraph exclusion patentability subjectmatter activities referred applies extent european patent application european patent relates subjectmatter activitiesas suchart epc see alsooj se art epccovers subjectmatter whose common feature su

In [16]:

# Embed every flattened paragraph with the Ollama 'nomic-embed-text' model,
# one request per paragraph, then persist the resulting matrix.
embeddings_list = []
for line in data:
    response = ollama.embeddings(model='nomic-embed-text', prompt=line)
    embeddings_list.append(response.embedding)

embeddings_array = np.array(embeddings_list)

# `data` was loaded from case_law_of_the_board.json and the retrieval cell
# reads case_law_of_the_board.h5, so the embeddings are stored under that
# name. Writing them to the guidelines file would overwrite another corpus'
# embeddings with rows that do not correspond to its paragraphs.
with h5py.File('data/clean/h5/case_law_of_the_board.h5', 'w') as h5_file:
    h5_file.create_dataset('embeddings', data=embeddings_array)


KeyboardInterrupt: 

In [11]:
# Sanity check of the embedding matrix dimensions. `embeddings_array` is only
# bound once an earlier cell that assigns it has finished running in the
# current kernel; otherwise this raises NameError.
print(embeddings_array.shape)

NameError: name 'embeddings_array' is not defined

In [19]:


# Load the stored paragraph embeddings for the case-law corpus.
with h5py.File('data/clean/h5/case_law_of_the_board.h5', 'r') as h5_file:
    embeds = h5_file['embeddings'][:]

# Row i of `embeds` is mapped back to data[i] below, so both must have the
# same length; otherwise the printed paragraphs are not the ones that scored.
if len(data) != embeds.shape[0]:
    raise ValueError(
        f"{len(data)} paragraphs but {embeds.shape[0]} embeddings: "
        "`data` does not match case_law_of_the_board.h5"
    )

prompt = "When is an appeal considered inadmissible under Article 107 of the EPC?"

# Echo the query text so the output records which question produced the results.
print(prompt)

prompt_embedding_response = ollama.embeddings(model='nomic-embed-text', prompt=prompt)
prompt_embedding = np.array(prompt_embedding_response.embedding).reshape(1, -1)

# Compute the similarities between the prompt and every paragraph.
similarities = cosine_similarity(prompt_embedding, embeds)

# Indices of the 5 most similar paragraphs, most similar first.
top_5_indices = similarities.argsort()[0][-5:][::-1]

# Display the 5 most similar paragraphs.
top_5_paragraphs = [data[i] for i in top_5_indices]
for i, paragraph in enumerate(top_5_paragraphs, 1):
    print(f"Top {i}: {paragraph}\n")


prompt
Top 1: V. PROCEEDINGS BEFORE THE BOARDS OF APPEAL 3. Petition for review under Article 112a EPC 3.2. Transitional provisions : The decision of the Administrative Council of 28 June 2001 on the transitional provisions underArt. 7 of the Act revising the EPCof 29 November 2000 provides that "Article 112a EPCshall apply to decisions of the Boards of Appeal taken as from the date of its entry into force" (seeR 2/08). Referring toG 12/91(OJ 1994, 285), the Enlarged Board inR 5/08interpreted "taken" to be the date a decision given orally is pronounced, not the date of notification. A petition based on a decision dated prior to, yet not notified to the parties until after entry into force of the EPC 2000, is therefore inadmissible.

Top 2: V. PROCEEDINGS BEFORE THE BOARDS OF APPEAL 2. Filing and admissibility of the appeal 2.4. Entitlement to appeal 2.4.1 Formal aspects under Article 107 EPC  b) Appeal filed in name of representative : An appeal filed in the name of the representative 

In [None]:

# Each JSON corpus is paired, by position, with the HDF5 file that holds the
# embeddings of its flattened paragraphs.
json_files = ['data/clean/json/guidelines_examination_articles.json', 'data/clean/json/epc_rules.json', 'data/clean/json/epc_articles.json', "data/clean/json/case_law_of_the_board.json"]
h5_files = ['data/clean/h5/guidelines_examination_articles.h5', 'data/clean/h5/epc_rules.h5', 'data/clean/h5/epc_articles.h5', "data/clean/h5/case_law_of_the_board.h5"]

SIMILARITY_THRESHOLD = 0.6  # minimum cosine similarity for a paragraph to be reported
MAX_RESULTS_PER_FILE = 10   # at most this many paragraphs are shown per corpus

# Loop variables are deliberately not called `data`, so this cell does not
# rebind the notebook-level `data` that other cells index into.
data_array = []
for json_path in json_files:
    with open(json_path, encoding='utf-8') as f:
        data_array.append(flatten_json(json.load(f)))

embeddings_array = []
for h5_path in h5_files:
    with h5py.File(h5_path, 'r') as h5_file:
        embeddings_array.append(h5_file['embeddings'][:])

# The prompt is embedded exactly as written. The previous clean_text() /
# stop-word step relied on names that only exist in commented-out code
# (NameError on a fresh kernel), and the embedding cell of this notebook
# embeds the unmodified paragraphs, so the query is left unmodified as well.
prompt = "Je suis en train de lire un document sur la brevetabilité des inventions techniques selon la Convention sur le brevet européen (CBE). Pourriez-vous m'expliquer les principales conditions de brevetabilité mentionnées dans l'article 52(1) de la CBE et comment elles sont interprétées par les chambres de recours ? De plus, pourriez-vous clarifier la notion de caractère technique requis pour une invention et comment elle est évaluée sans référence à l'état de la technique ?"

prompt_embedding_response = ollama.embeddings(model='nomic-embed-text', prompt=prompt)
prompt_embedding = np.array(prompt_embedding_response.embedding).reshape(1, -1)

for paragraphs, embeddings in zip(data_array, embeddings_array):
    similarities = cosine_similarity(prompt_embedding, embeddings)[0]
    # Rank best-first before truncating, so the limit keeps the most similar
    # paragraphs rather than the first ones in document order.
    ranked_indices = np.argsort(similarities)[::-1][:MAX_RESULTS_PER_FILE]
    top_paragraphs = [
        paragraphs[i]
        for i in ranked_indices
        if similarities[i] >= SIMILARITY_THRESHOLD
    ]
    print(top_paragraphs)


In [20]:
from transformers import pipeline

# Build a "translation" pipeline backed by the Helsinki-NLP/opus-mt-fr-en
# checkpoint (French -> English, going by the model name).
pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en")

Device set to use cuda:0


In [27]:
# NOTE(review): the text passed here is already English, while the pipeline was
# created with the opus-mt-fr-en model; presumably the French prompt was meant
# to be translated here -- confirm.
res = pipe("I am currently reading a document on the patentability of technical inventions under the European Patent Convention (EPC). Could you explain to me the main conditions of patentability mentioned in Article 52(1) EPC and how they are interpreted by the Boards of Appeal? Furthermore, could you clarify the concept of the technical character required for an invention and how it is assessed without reference to prior art?")

In [30]:
# Show the 'translation_text' field of the first entry in the pipeline result.
print(res[0]['translation_text'])

I am currently reading a document on the patentability of technical inventions under the European Patent Convention (EPC). Could you explain to me the main conditions of patentability mentioned in Article 52(1) EPC and how they are interpreted by the Boards of Appeal? Furthermore, could you clarify the concept of the technical character required for an invention and how it is assessed without reference to prior art?
