In [None]:
# Umsetzung des XML+HTML-Verarbeitung und Vorverarbeitung + Erster Versuch der ChatGpt Datenvorverarbeitung (dies wurde jedoch in dem oneRungptPP ausgearbeitet)

# Xml-Preprocessing

In [1]:
# Artificial xml preprocessing zum Testen des Trainings
from lxml import etree
import os
import re
import json

def extract_text_from_element(element):
    texts = []
    for elem in element.iter():
        if elem.text:
            texts.append(elem.text.strip())
        if elem.tail:
            texts.append(elem.tail.strip())
    return ' '.join(texts)

def process_xml_file(file_path):
    tree = etree.parse(file_path)
    root = tree.getroot()

    # Extract Titelzeile
    titelzeile_elem = root.xpath('.//titelzeile//p')
    titelzeile = titelzeile_elem[0].text.strip() if titelzeile_elem else ""

    rest_text = extract_text_from_element(root)

    rest_text = re.sub(r'\s+', ' ', rest_text).strip()

    return {"input": titelzeile, "output": rest_text}

def process_xml_files_in_directory(directory_paths):
    for directory_single_path in directory_paths:
        data = []
        for filename in os.listdir(directory_single_path):
            if filename.endswith('.xml'):
                file_path = os.path.join(directory_single_path, filename)
                result = process_xml_file(file_path)
                data.append(result)
        with open(directory_single_path + ".json", 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, ensure_ascii=False, indent=4)

In [2]:
directories = ["data/bund"]

process_xml_files_in_directory(directories)

# Html-Preprocessing

In [4]:
pip install beautifulsoup4


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [5]:
from bs4 import BeautifulSoup

def process_html_files_in_directory(directory_paths):
    for directory_single_path in directory_paths:
        data = []
        for filename in os.listdir(directory_single_path):
            if filename.endswith('.html'):
                file_path = os.path.join(directory_single_path, filename)
                # Read the HTML file
                with open(file_path, 'r', encoding='utf-8') as file:
                    html_content = file.read()

                # Parse the HTML content with BeautifulSoup
                soup = BeautifulSoup(html_content, 'html.parser')

                # Find all <p> tags
                p_tags = soup.find_all('p')

                # Extract text from each <p> tag and join them into a single string
                fluent_string = ' '.join(p.get_text() for p in p_tags)
                data.append({"input": "", "output": fluent_string})
        with open(directory_single_path + ".json", 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, ensure_ascii=False, indent=4)

In [6]:
directories = ["data/be"]

process_html_files_in_directory(directories)

# Adjust for Gpt

In [7]:
# Adjust for chatgpt
import json

# Gpt system definition
definition_string = """
You are a data preprocessing tool.
You should generate a fluent human german question from the input string.
The question should be as if a person has a problem in this law sector and wants to find the answer.
You also should generate a german fluent answer for this question from the content of the ouput string.
The question should be written from the perspective of a person who doesn't has much knowledge in the associated field.
The answer should be precise, true and there shouldn't be added any imaginary context to the outputs context.
Remove unnecessary numbers or symbols, which doesnt add anything to the context of the text.
The structure of the response should always be Question: text || Answer: text.
Replace Antwort with Answer.
"""
#The question should be around 200 token.
#The answer should be around 800 token.

def gpt_obj_adjusting(input_file_paths, output_file_path):
  all_data = []
  # Read the input JSON file
  for input_file_path in input_file_paths:
    with open(input_file_path, 'r') as input_file:
        data = json.load(input_file)
        if isinstance(data, list):  # If data is a list, extend the list
            all_data.extend(data)
        else:  # If data is not a list, append it as an element
            all_data.append(data)

  entry_id_count = 0
  # Write the updated data to the output JSON file
  with open(output_file_path, 'w') as output_file:
    for entry in all_data:
      entry["custom_id"] = "request-" + str(entry_id_count)
      entry["method"] = "POST"
      entry["url"] = "/v1/chat/completions"
      entry["body"] = {"model": "gpt-3.5-turbo-0125",
                      "messages": [{"role": "system", "content": definition_string}, # You are a helpful assistant.
                                    {"role": "user", "content": f"Input: {entry['input']}; Output: {entry['output']}"}],
                      "max_tokens": 2000}
      entry_id_count += 1
      del entry['input']
      del entry['output']
      json.dump(entry, output_file)
      output_file.write('\n')

In [8]:
# Specify the path to your input JSON file
input_file_paths = ['data/be.json', 'data/bund.json']
output_file_path = 'adjustedForGpt.jsonl'

gpt_obj_adjusting(input_file_paths, output_file_path)
print("Data has been updated and saved to", output_file_path)

Data has been updated and saved to adjustedForGpt.jsonl


# Combined preprocess

In [9]:
xml_directories = ["data/bund"]
html_directories = ["data/mv", "data/rp", "data/sh", "data/sl", "data/st", "data/th"]
input_file_paths = [s + ".json" for s in xml_directories + html_directories]
output_file_path = 'adjustedForGpt.jsonl'
print(input_file_paths)

process_xml_files_in_directory(xml_directories)
process_html_files_in_directory(html_directories)


gpt_obj_adjusting(input_file_paths, output_file_path)
print("Data has been updated and saved to", output_file_path)

['data/bund.json', 'data/mv.json', 'data/rp.json', 'data/sh.json', 'data/sl.json', 'data/st.json', 'data/th.json']
Data has been updated and saved to adjustedForGpt.jsonl


# ChatGpt

In [2]:
pip install openai

Collecting openai
  Downloading openai-1.43.0-py3-none-any.whl.metadata (22 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.43.0-py3-none-any.whl (365 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.7/365.7 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.2-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K   [90m━━

In [10]:
from openai import OpenAI

client = OpenAI(
    api_key="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"
)

batch_input_file = client.files.create(
  file=open("adjustedForGpt.jsonl", "rb"),
  purpose="batch"
)


In [12]:
batch_input_file_id = batch_input_file.id

batch_job = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": "nightly eval job"
    }
)

In [13]:
# Checking preprocessing status (don't forget to update batch_id <3)
# Rausfinden warum so viele failen??
batch_job = client.batches.retrieve(batch_job.id)
print(batch_job)
print(batch_job.status)
print(batch_job.request_counts)

Batch(id='batch_fQHDJmO1D4W2QzfWaWSAXzrX', completion_window='24h', created_at=1726313757, endpoint='/v1/chat/completions', input_file_id='file-SVnyvVN8E4wlCPhexgLAMVw7', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1726400157, failed_at=None, finalizing_at=None, in_progress_at=1726313760, metadata={'description': 'nightly eval job'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=1356))
in_progress
BatchRequestCounts(completed=0, failed=0, total=1356)


In [14]:
pip install playsound

Collecting playsound
  Downloading playsound-1.3.0.tar.gz (7.7 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: playsound
  Building wheel for playsound (setup.py) ... [?25ldone
[?25h  Created wheel for playsound: filename=playsound-1.3.0-py3-none-any.whl size=7020 sha256=d366132efd13fcfff894e1513418c5f73746a16a6909c45b816f88f51659b883
  Stored in directory: /home/ps2024/.cache/pip/wheels/cf/42/ff/7c587bae55eec67b909ca316b250d9b4daedbf272a3cbeb907
Successfully built playsound
Installing collected packages: playsound
Successfully installed playsound-1.3.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
!pip3 install pygobject

In [None]:
import time
from playsound import playsound

# Sound check
while True:
    batch_job = client.batches.retrieve(batch_job.id)
    status = batch_job.status
    print(status)
    print(str(batch_job.request_counts.completed + batch_job.request_counts.failed) + " / " + str(batch_job.request_counts.total))
    if status == "completed":
        print("Batch process completed!")
        playsound("sucess.mp3")  # Play sound when complete
        break
    else:
        print("Batch still in progress, checking again in 60 seconds...")
        time.sleep(60)  # Wait for 60 seconds before checking again

In [191]:
# Getting results (don't forget to update file_id <3)

file_response = client.files.content(batch_job.output_file_id)
print(file_response.text)

{"id": "batch_req_Xu1oDtLUDF1d16oALIB4uxXN", "custom_id": "request-0", "response": {"status_code": 200, "request_id": "10be1f02badd16d136961879a02a870e", "body": {"id": "chatcmpl-A2kVP7TNwpPVa53BWeqlPvcw3n7Wz", "object": "chat.completion", "created": 1725218787, "model": "gpt-3.5-turbo-0125", "choices": [{"index": 0, "message": {"role": "assistant", "content": "Question: Was bedeutet die Bindungswirkung eines Strafurteils und wie beeinflusst sie ein gerichtliches Disziplinarverfahren? || Answer: Die Bindungswirkung eines Strafurteils besagt, dass die festgestellten Tatsachen des rechtskr\u00e4ftigen Strafurteils im Disziplinarverfahren als bindend gelten. Das bedeutet, dass die Verwaltungsgerichte grunds\u00e4tzlich an die Tatsachenfeststellungen des Strafurteils gebunden sind, es sei denn, es liegen bestimmte Voraussetzungen vor, die eine neue Pr\u00fcfung erforderlich machen.", "refusal": null}, "logprobs": null, "finish_reason": "stop"}], "usage": {"prompt_tokens": 2673, "completion

In [None]:
# Get error file to look into errors (vielleicht hilft das was) -> ja es hilft <3 manche inputs müssen gekürzt werden!!
error_response = client.files.content(batch_job.error_file_id)
print(error_response.text)

In [None]:
# Cancel batch (don't forget to update batch_id <3)

client.batches.cancel(batch_job.id)

# Finish Up

In [17]:
# Finish up to file

result = client.files.content(batch_job.output_file_id).content

result_file_name = "preprocessing_results.jsonl"

with open(result_file_name, 'wb') as file:
    file.write(result)

In [18]:
final_training_data_path = "final_training_data.json"

# Loading data from saved file

results = []
with open(result_file_name, 'r') as file:
    for line in file:
        # Parsing the JSON string into a dict and appending to the list of results
        json_object = json.loads(line.strip())
        results.append(json_object)

final_data = []
# Reading only the first results
for res in results:
    result = res['response']['body']['choices'][0]['message']['content']
    # Split the string into two parts using '\n' as the delimiter
    parts = result.split("Answer:")

    print(parts)
    # Extract the question and answer by further splitting the parts by ':'
    question_text = parts[0].strip()
    answer_text = parts[1].strip()
    obj = {"input": question_text, "output": answer_text}
    final_data.append(obj)
    print(f"{result}")
    print("\n\n----------------------------\n\n")


with open(final_training_data_path + ".json", 'w', encoding='utf-8') as json_file:
            json.dump(final_data, json_file, ensure_ascii=False, indent=4)

['Question: Kannst du mir bitte erklären, wieso die Anhörungsrüge abgelehnt wurde und was die Konsequenzen für den Kläger sind? || ', ' Die Anhörungsrüge wurde abgelehnt, da der Kläger nicht dargelegt hat, dass sein Anspruch auf rechtliches Gehör in entscheidungserheblicher Weise verletzt wurde. Der Kläger trägt die Kosten des Rügeverfahrens und Gerichtskosten werden nicht erhoben. Die Anhörungsrüge ist unanfechtbar und eine erneute Anhörungsrüge ist ausgeschlossen.']
Question: Kannst du mir bitte erklären, wieso die Anhörungsrüge abgelehnt wurde und was die Konsequenzen für den Kläger sind? || Answer: Die Anhörungsrüge wurde abgelehnt, da der Kläger nicht dargelegt hat, dass sein Anspruch auf rechtliches Gehör in entscheidungserheblicher Weise verletzt wurde. Der Kläger trägt die Kosten des Rügeverfahrens und Gerichtskosten werden nicht erhoben. Die Anhörungsrüge ist unanfechtbar und eine erneute Anhörungsrüge ist ausgeschlossen.


----------------------------


['Question: Was sind d

IndexError: list index out of range