In [14]:
import json
import random

In [15]:

# Load the rules JSON. JSON text is UTF-8, so decode it explicitly instead of
# relying on the platform's locale default encoding.
with open('data/clean/json/epc_rules.json', encoding='utf-8') as f:
    data = json.load(f)


In [16]:
def flatten_json(json_obj, parent_title='', parent_number=''):
    """
    Flatten a nested JSON structure into a list of context-prefixed text strings.

    :param json_obj: Dict (one article) or list (several articles) to flatten.
        Any other type yields an empty list.
    :param parent_title: Accumulated title of the enclosing articles, used as prefix.
    :param parent_number: Number of the enclosing article; only forwarded through
        list recursion, it does not appear in the produced strings.
    :return: List of strings shaped like "<full title> : <main_article>".
    """
    # A list carries no context of its own: flatten each element in order.
    if isinstance(json_obj, list):
        return [
            text
            for element in json_obj
            for text in flatten_json(element, parent_title, parent_number)
        ]

    # Anything that is neither a dict nor a list contributes nothing.
    if not isinstance(json_obj, dict):
        return []

    number = json_obj.get("article_number", "")
    title = json_obj.get("article_title", "")
    body = json_obj.get("main_article", "")

    # Prefix with the parent's title; strip the leading/trailing ", " left over
    # when the parent title (or the trailing part) is empty.
    heading = f"{parent_title}, {number} {title}".strip(", ")

    # Only articles that have a main text produce an entry of their own.
    flattened = [f"{heading} : {body}"] if body else []

    # Children inherit this article's full heading as their context.
    for child in json_obj.get("sub_articles", []):
        flattened.extend(flatten_json(child, heading, number))

    return flattened

In [17]:
# Replace the nested structure with its flat list of context-prefixed strings.
# NOTE: this rebinds `data`. Running the cell a second time passes the already
# flattened strings to flatten_json, which returns [] for them — reload the JSON
# before re-running.
data = flatten_json(json_obj=data)




In [18]:
# Print 10 randomly drawn flattened entries, each followed by a blank line.
for _draw in range(10):
    print(random.choice(data), end="\n\n")

Rule. 34 53 New deposit of biological material : If biological material deposited in accordance with Rule 31 ceases to be available from the recognised depositary institution, an interruption in availability shall be deemed not to have occurred if a new deposit of that material is made with a recognised depositary institution on the same terms as those laid down in the Budapest Treaty on the International Recognition of the Deposit of Microorganisms for the Purposes of Patent Procedure of 28 April 1977, and if a copy of the receipt of the new deposit issued by the depositary institution is forwarded to the European Patent Office within four months of the date of the new deposit, stating the number of the European patent application or of the European patent. 53 See notice from the EPO of 07.07.2010 ( OJ EPO 2010, 498 ).

Rule. 58 Correction of deficiencies in the application documents : If the European patent application does not comply with the requirements of Rule 57(a) to Rule 57 (d

In [19]:
# Preview the first flattened entries. Slicing (instead of indexing 0..9)
# avoids an IndexError when fewer than 10 entries exist.
for entry in data[:10]:
    print(entry)
    print()

Rule. 1 1 Written proceedings : In written proceedings before the European Patent Office, the requirement to use the written form shall be satisfied if the content of the documents can be reproduced in a legible form on paper . 1 Amended by decision of the Administrative Council CA/D 26/23 of 14.12.2023 ( OJ EPO 2024, A16 ), which entered into force on 01.04.2024.

Rule. 2 2 Filing of and formal requirements for documents : In proceedings before the European Patent Office, documents may be filed by delivery by hand, by postal services or by means of electronic communication. The President of the European Patent Office shall lay down the details and conditions and, where appropriate, any special formal or technical requirements for the filing of documents. In particular, he may specify that confirmation must be supplied. If such confirmation is not supplied in due time, the European patent application shall be refused; documents filed subsequently shall be deemed not to have been receiv

In [20]:
import ollama
import h5py
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import numpy as np


[nltk_data] Downloading package stopwords to /home/leo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
import re
# English stopwords from NLTK, plus "art" (what "Art." becomes once lowercased
# and stripped of punctuation).
stop_words = set(stopwords.words('english')) | {"art"}

# Filled below with one cleaned string per entry of `data`.
data_clean = []

def clean_text(text):
    """
    Keep only ASCII letters and whitespace, then normalise the spacing.

    Every character that is neither a-z/A-Z nor whitespace (digits, punctuation,
    accented letters, ...) is deleted without inserting a replacement, runs of
    whitespace are collapsed to a single space, and the result is trimmed.

    :param text: Input string.
    :return: The cleaned string.
    """
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', text)
    single_spaced = re.sub(r'\s+', ' ', letters_and_spaces)
    return single_spaced.strip()

# Lowercase and clean each entry, then drop stopwords before storing it.
for raw_line in data:
    tokens = clean_text(raw_line.lower()).split()
    data_clean.append(' '.join(token for token in tokens if token not in stop_words))


In [22]:
# Preview the first cleaned entries. Slicing (instead of indexing 0..9)
# avoids an IndexError when fewer than 10 entries exist.
for cleaned_entry in data_clean[:10]:
    print(cleaned_entry)
    print()

rule written proceedings written proceedings european patent office requirement use written form shall satisfied content documents reproduced legible form paper amended decision administrative council cad oj epo entered force

rule filing formal requirements documents proceedings european patent office documents may filed delivery hand postal services means electronic communication president european patent office shall lay details conditions appropriate special formal technical requirements filing documents particular may specify confirmation must supplied confirmation supplied due time european patent application shall refused documents filed subsequently shall deemed received convention provides document must signed authenticity document may confirmed handwritten signature appropriate means use permitted president european patent office document authenticated means shall deemed meet legal requirements signature way document bearing handwritten signature filed paper form

rule langua

In [23]:

# Embed every cleaned entry with the local Ollama embedding model.
embeddings_list = []
total = len(data_clean)
for count, line in enumerate(data_clean):
    response = ollama.embeddings(model='nomic-embed-text', prompt=line)
    embeddings_list.append(response.embedding)
    print(f'{count}/{total}')

# One row per entry of data_clean, in the same order.
embeddings_array = np.array(embeddings_list)

# Save under the name that the retrieval step loads ('data/clean/h5/epc_rules.h5').
# The previous target, 'guidelines_examination_articles.h5', belongs to a different
# dataset: writing there overwrote it and left the reader with a stale or missing file.
with h5py.File('data/clean/h5/epc_rules.h5', 'w') as h5_file:
    h5_file.create_dataset('embeddings', data=embeddings_array)


0/176
1/176
2/176
3/176
4/176
5/176
6/176
7/176
8/176
9/176
10/176
11/176
12/176
13/176
14/176
15/176
16/176
17/176
18/176
19/176
20/176
21/176
22/176
23/176
24/176
25/176
26/176
27/176
28/176
29/176
30/176
31/176
32/176
33/176
34/176
35/176
36/176
37/176
38/176
39/176
40/176
41/176
42/176
43/176
44/176
45/176
46/176
47/176
48/176
49/176
50/176
51/176
52/176
53/176
54/176
55/176
56/176
57/176
58/176
59/176
60/176
61/176
62/176
63/176
64/176
65/176
66/176
67/176
68/176
69/176
70/176
71/176
72/176
73/176
74/176
75/176
76/176
77/176
78/176
79/176
80/176
81/176
82/176
83/176
84/176
85/176
86/176
87/176
88/176
89/176
90/176
91/176
92/176
93/176
94/176
95/176
96/176
97/176
98/176
99/176
100/176
101/176
102/176
103/176
104/176
105/176
106/176
107/176
108/176
109/176
110/176
111/176
112/176
113/176
114/176
115/176
116/176
117/176
118/176
119/176
120/176
121/176
122/176
123/176
124/176
125/176
126/176
127/176
128/176
129/176
130/176
131/176
132/176
133/176
134/176
135/176
136/176
137/176
138/17

In [30]:
from sklearn.metrics.pairwise import cosine_similarity

# Read the stored embedding matrix fully into memory before the file closes.
with h5py.File('data/clean/h5/epc_rules.h5', 'r') as embeddings_file:
    embeddings_array = embeddings_file['embeddings'][:]

# Sanity check: print the shape of the loaded matrix.
print(embeddings_array.shape)

(176, 768)


In [31]:



prompt = "Quelles sont les conditions dans lesquelles une partie à une procédure orale devant l'Office européen des brevets peut utiliser une langue autre que la langue de la procédure ?"

# Embed the question with the same model and shape it as a single-row matrix.
prompt_embedding_response = ollama.embeddings(model='nomic-embed-text', prompt=prompt)
prompt_embedding = np.array(prompt_embedding_response.embedding).reshape(1, -1)

# Cosine similarity between the prompt and every stored embedding (one row of scores).
similarities = cosine_similarity(prompt_embedding, embeddings_array)
scores = similarities[0]

# Indices of the 10 highest-scoring entries, best first.
best_indices = scores.argsort()[::-1][:10]
# Keep only those whose similarity is above 0.5.
best_indices = [i for i in best_indices if scores[i] > 0.5]

# Join the matching original (uncleaned) texts into one newline-separated string.
best_results = "\n".join(data[i] for i in best_indices)

print(best_results)
# Show the similarity of each retained result.
print(scores[best_indices])

Rule. 4 Language in oral proceedings : Any party to oral proceedings before the European Patent Office may use an official language of the European Patent Office other than the language of the proceedings, if such party gives notice to the European Patent Office at least one month before the date of such oral proceedings or provides for interpretation into the language of the proceedings. Any party may use an official language of a Contracting State, if he provides for interpretation into the language of the proceedings. The European Patent Office may permit derogations from these provisions. In the course of oral proceedings, employees of the European Patent Office may use an official language of the European Patent Office other than the language of the proceedings. Where evidence is taken, any party, witness or expert to be heard who is unable to express himself adequately in an official language of the European Patent Office or of a Contracting State may use another language. Where 

In [None]:
import requests
import json
from IPython.display import clear_output
import time

# Define the URL for the Ollama API endpoint
url = "http://localhost:11434/api/generate"

# Define the payload with the model and prompt (retrieved passages + question)
payload = {
    "model": "mistral",
    "prompt": "Based on this RAG output, the following are the key points to consider when drafting your response: " + best_results + "\n\n Answer the following question: " + prompt
}

# Define the headers for the request
headers = {
    "Content-Type": "application/json"
}

# Send the POST request to the API with stream=True
with requests.post(url, headers=headers, data=json.dumps(payload), stream=True) as response:
    # Check if the request was successful
    if response.status_code == 200:
        complete_response = ""
        # The endpoint streams one JSON object per line. Iterating by line (rather
        # than by fixed-size byte chunks) guarantees that each json.loads call sees
        # exactly one complete object: a chunk holding several objects no longer
        # fails to parse, and multi-byte UTF-8 characters are never split.
        for raw_line in response.iter_lines():
            if not raw_line:
                # Skip empty keep-alive lines
                continue
            # Parse into a local name; the global `data` (list of rule texts used
            # by the retrieval step) must not be overwritten here.
            event = json.loads(raw_line.decode('utf-8'))
            # Extract the response fragment and append it
            complete_response += event.get("response", "")
            # Clear the output and print the response accumulated so far
            clear_output(wait=True)
            print(complete_response)
    else:
        print(f"Error: Received status code {response.status_code}")


 A party to oral proceedings before the European Patent Office may use a language other than the language of the proceedings if any of the following conditions are met:

1. The parties and the European Patent Office agree on using any language during the oral proceedings.
2. If a party gives notice to the European Patent Office at least one month before the date of such oral proceedings, or provides for interpretation into the language of the proceedings if they use an official language of the European Patent Office or a Contracting State.
3. If the party is unable to express themselves adequately in an official language of the European Patent Office or of a Contracting State, they may use another language during the evidence-taking process. However, if this request is made, the party must provide for interpretation into the language of the proceedings.
4. Employees of the European Patent Office may use an official language of the European Patent Office other than the language of the p