In [None]:
import openai
import os
import json
import re

from typing import List, Dict, Any
from pathlib import Path
from dataclasses import dataclass

from dotenv import load_dotenv, find_dotenv
# Load variables from the .env file located by find_dotenv() (if one is found);
# the return value of load_dotenv is deliberately discarded.
_ = load_dotenv(find_dotenv())

# Pass the API key from the environment to the openai module.
# os.getenv returns None when OPENAI_API_KEY is not set.
openai.api_key = os.getenv('OPENAI_API_KEY')

def print_lines(text: str, max_line_length: int = 120):
    """Print ``text`` word-wrapped at about ``max_line_length`` characters.

    The text is split on whitespace and the words are re-joined with single
    spaces; every printed line keeps one trailing space after its last word.
    A word longer than ``max_line_length`` is printed on a line of its own
    instead of being split.

    Args:
        text: Text to wrap. Any run of whitespace (including newlines) is
            treated as a word separator.
        max_line_length: Width at which a new output line is started.
    """
    lines: List[str] = []
    current = ""
    for word in text.split():
        # Break only when the current line already holds something; otherwise
        # an over-long first word would flush a spurious empty line.
        if current and len(current + word) > max_line_length:
            lines.append(current)
            current = ""
        current += word + " "
    lines.append(current)

    for wrapped in lines:
        print(wrapped)


@dataclass
class TermData:
    area: str | None
    termDef: str

    @property
    def term(self) -> str:
        return self.termDef.split(" - ")[0]

    @property
    def definition(self) -> str:
        return " - ".join(self.termDef.split(" - ")[1:])

    @property
    def has_definition(self) -> bool:
        return len(self.definition) > 0

    def term_def_without_uuids(self) -> str:
        # Replaces |[a-f0-9-]{36}\} with "\}"
        return re.sub(r"\|[a-f0-9-]{36}\}", "}", self.termDef)

    def is_english(self) -> bool:
        # Check with regex if text contains only english letters, and some special characters
        return re.match(r"^[a-zA-Z0-9\s{}\+\-—\.,\|\(\)/';]+$", self.termDef) is not None

    def has_russian(self) -> bool:
        return re.search(r"[а-яА-Я]+", self.termDef) is not None

    def is_balanced(self) -> bool:
        counter = 0
        for char in self.termDef:
            if char == "{":
                counter += 1
            elif char == "}":
                counter -= 1

            if counter > 1:
                return False

        return counter == 0



class StaticStorage:
    """In-memory view of one JSON term-storage file.

    The file is read as a JSON object with a ``"name"`` key and a ``"terms"``
    list whose items carry ``"termDef"`` and, optionally, ``"area"``.
    """

    def __init__(self, path: Path):
        """Load the storage name and all terms from ``path``.

        Raises:
            OSError: If the file cannot be read.
            json.JSONDecodeError: If the file is not valid JSON.
            KeyError: If ``"name"``, ``"terms"`` or a term's ``"termDef"`` is
                missing.
        """
        self.path = path
        # The file is written with ensure_ascii=False (see save), so decode it
        # as UTF-8 explicitly instead of relying on the platform default.
        json_data = json.loads(self.path.read_text(encoding="utf-8"))
        self.name = json_data['name']
        self.terms: List[TermData] = [
            TermData(term.get('area'), term['termDef'])
            for term in json_data['terms']
        ]

    def save(self):
        """Write the storage back to ``self.path`` as indented UTF-8 JSON.

        The ``"area"`` key is emitted only for terms whose area is truthy, so
        ``None`` and ``""`` are both stored as "no area".
        """
        terms_out = []
        for term in self.terms:
            term_dict = {}
            if term.area:
                term_dict["area"] = term.area
            term_dict["termDef"] = term.termDef
            terms_out.append(term_dict)

        dict_data = {
            "name": self.name,
            "terms": terms_out
        }

        # Non-ASCII text is written verbatim, so the encoding must be pinned;
        # the platform default may be unable to encode it.
        self.path.write_text(
            json.dumps(dict_data, indent=4, ensure_ascii=False) + "\n",
            encoding="utf-8",
        )

    def remove_all_uuids(self):
        """Strip ``|<uuid>`` suffixes from the tags of every term, in place."""
        for term in self.terms:
            term.termDef = term.term_def_without_uuids()

    def terms_list(self) -> List[str]:
        """Return the term names (without definitions) in storage order."""
        return [item.term for item in self.terms]

    def add_term(self, term: str, definition: str, area: str = ""):
        """Append a new ``"term - definition"`` entry; does not save to disk."""
        data = TermData(area, f"{term} - {definition}")
        self.terms.append(data)

    def all_non_eng_terms(self, skip_empty: bool = True) -> List[TermData]:
        """Return the terms that fail ``TermData.is_english``.

        Args:
            skip_empty: When true, terms without a definition are left out.
        """
        return [
            term for term in self.terms
            if not (skip_empty and not term.has_definition)
            and not term.is_english()
        ]

    def all_russian_terms(self, skip_empty: bool = True) -> List[TermData]:
        """Return the terms that contain Russian letters.

        Args:
            skip_empty: When true, terms without a definition are left out.
        """
        return [
            term for term in self.terms
            if not (skip_empty and not term.has_definition)
            and term.has_russian()
        ]

    def check_prepositions(self) -> None:
        """Print terms in which a tag starts or ends with a listed short word."""
        preps = ['an', 'a' ,'of', 'the', 'by']
        for term in self.terms:
            for prep in preps:
                # "{<word>" followed by a word boundary, or "<word>}" preceded
                # by one, i.e. the word sits at the very edge of a tag.
                bad_start = r"\{" + re.escape(prep) + r"\b"
                bad_end = r"\b" + re.escape(prep) + r"\}"
                if re.search(bad_start, term.termDef) or re.search(bad_end, term.termDef):
                    print(f"Term: {term.term} has {prep} in definition")

    def check_tag_start_end_with_space(self) -> None:
        """Print terms containing ``"{ "`` or ``" }"``."""
        for term in self.terms:
            if "{ " in term.termDef or " }" in term.termDef:
                print(f"Term: {term.term} has tag start or end with space")

    def check_tags_balance(self) -> None:
        """Print terms whose curly-brace tags are not balanced."""
        for term in self.terms:
            if not term.is_balanced():
                print(f"Term: {term.term} has unbalanced tags")

    def check_not_ends_with_dot(self) -> None:
        """Print terms whose raw text ends with a dot."""
        for term in self.terms:
            # endswith copes with an empty termDef, where [-1] would raise.
            if term.termDef.endswith("."):
                print(f"Term: {term.term} ends with dot")

    def validate(self) -> None:
        """Run every check; problems are only printed, nothing is modified."""
        self.check_prepositions()
        self.check_tag_start_end_with_space()
        self.check_tags_balance()
        self.check_not_ends_with_dot()


class ChatEngine:
    """Model name and sampling temperature for chat completion requests."""

    # Class-level fallbacks; each instance overrides both in __init__.
    model = ""
    temp = 0

    def __init__(self, model: str = "gpt-4o-mini", temp: float = 0):
        """Remember the model identifier and the temperature on the instance."""
        self.model, self.temp = model, temp

@dataclass
class Message:
    """A single chat message: who is speaking and what is said."""

    role: str
    content: str

    def to_dict(self) -> Dict[str, str]:
        """Return the message as a ``{"role": ..., "content": ...}`` mapping."""
        return dict(role=self.role, content=self.content)

class TermChat:
    """One conversation with the chat model, led by a fixed system message.

    Attributes:
        system_message: System prompt placed first in every request.
        messages: User/assistant history of this conversation, oldest first.
        engine: Model name and temperature used for every request.
    """

    system_message: Message
    messages: List[Message]
    engine: ChatEngine

    def __init__(self, engine: ChatEngine, system_message: str):
        """Start an empty conversation with the given system prompt."""
        self.engine = engine
        self.system_message = Message("system", system_message)
        # The history must live on the instance: a class-level ``[]`` default
        # is one list shared by every TermChat, so messages appended by one
        # chat would be sent along with the requests of all later chats.
        self.messages = []

    def _get_messages(self) -> List[Dict[str, str]]:
        """Return the system message followed by the history, as plain dicts."""
        return [
            self.system_message.to_dict(),
            *(message.to_dict() for message in self.messages),
        ]

    def get_answer(self, user_message: str = "") -> str | None:
        """Send the conversation to the model and return its reply.

        Args:
            user_message: Text appended to the history as a user message before
                the request; an empty string sends the history unchanged.

        Returns:
            The reply text, or ``""`` when the response carries no content.
            The reply is also appended to the history as an assistant message.

        Errors raised by the openai client are not caught here.
        """
        if user_message != "":
            self.messages.append(Message("user", user_message))

        response = openai.chat.completions.create(
            model=self.engine.model,
            messages=self._get_messages(),
            temperature=self.engine.temp,
        )
        # content may be None; normalise to an empty string.
        answer = response.choices[0].message.content or ""
        self.messages.append(Message("assistant", answer))
        return answer

    def clear_messages(self):
        """Forget the user/assistant history; the system message is kept."""
        self.messages = []


# Directory holding the storage JSON files, resolved to an absolute path.
# NOTE(review): the relative path is resolved against the current working
# directory — presumably the notebook's own folder; confirm before running elsewhere.
data_path: Path = Path("../../source/staticDataStorage/data/").resolve()

# bio_storage: StaticStorage = StaticStorage(data_path / "Biochemistry.json")
# Global term storage; Global.json is read from disk as soon as this line runs.
glb_storage: StaticStorage = StaticStorage(data_path / "Global.json")


In [None]:
# Prompts

# System prompt used by predict_next_term: the user message is the list of
# existing terms and the model is asked to answer with one new, related term.
continue_term_prompt = """
You are a terms helper robot
your task is to guess, based on the list of terms,
which one term, not in the list,
is the most relevant to already given terms. Answer with one term please.
"""

# System prompt template used by give_definition. It is filled with str.format:
# {0} receives the joined list of existing terms, {1} the requested term.
# Any other literal curly brace added to this text would have to be doubled.
give_definition_prompt = """
You are a terms definition robot
your task is to give a best possible definition to the user requested term.

You should give a definition step by step, according to the this rules:

1. Definition should contain only information about what the term intrinsically is,
without any information about what the term is not, and what the term is related to.
Every entity that can be removed from definition without changing the meaning, should be removed.
2. Definition should based on the existing list of terms (list would be provided to you).
3. Check if there exist terms that also fit this definition and are not synonymous to the term.
4. Definition should be as short as possible.

List of existing terms: '''{0}'''

User requested definition for term: '''{1}'''

Format of answer should be step by step thoughts:
Raw term definition: <try to define user term for the first time. format 'term - definition'> <new line>
Thoughts about definition: <your thoughts on how this term should be redefined according to rules> <new line>

First retry: <try to define user term for the second time. format 'term - definition'> <new line>
Thoughts about definition: <your thoughts on how this term should be redefined according to rules> <new line>

Final try: <try to define user term for the third time. format 'term - definition'> <new line>
"""

# - If there is no provided definition on russian, you should provide a definition in english based on the term itself
# System prompt used by translate_terms: the user message is one raw
# "term - definition" entry, to be translated from Russian to English with its
# formatting (capitalisation, bracket types) preserved.
translate_term_prompt = """
You are a GPT that is specified in term translation.
Your task is to translate terms and definitions from russian to english, saving their format up to the last detail.

Format description:
- TermDef format: each term should start with a capital letter, and the definition should start with a lowercase letter.
- Preserve brackets: If in definition some terms are in brackets, you should save them in brackets in the translation. Brackets type is important!!! Use the same type of brackets as in the original definition.
- Keyword Usage: Each definition is carefully crafted with specific details
- Scientific answers: each definition should be as precise, scientific and dry as possible. Expect that your user knows difficult conceptions and you should use them. your task is just with definition show how this conceptions are related.
- Your translation should be exact, without any additional information
- If there is no provided definition on russian, you should also not provide a definition in english - just translate the term itself
"""

In [None]:
def predict_next_term():
    """Ask the model which term should be added next and print its answer.

    The names of all terms in the global storage are sent as the user message,
    with ``continue_term_prompt`` as the system prompt.
    """
    known_terms = ' ,'.join(glb_storage.terms_list())
    suggestion_chat = TermChat(ChatEngine(temp=0.5), continue_term_prompt)
    print(suggestion_chat.get_answer(known_terms))
    suggestion_chat.clear_messages()

In [None]:
def give_definition(new_term):
    """Ask the model for a step-by-step definition of ``new_term`` and print it.

    The system prompt is ``give_definition_prompt`` filled with the names of
    all terms in the global storage and with ``new_term``; no user message is
    sent.
    """
    existing_terms = ' ,'.join(glb_storage.terms_list())
    system_prompt = give_definition_prompt.format(existing_terms, new_term)

    definition_chat = TermChat(ChatEngine(temp=0.2), system_prompt)
    print(definition_chat.get_answer())
    definition_chat.clear_messages()

In [None]:
def add_term(term, definition, area=""):
    """Append a new entry to the global storage and write it to disk at once."""
    storage = glb_storage
    storage.add_term(term, definition, area)
    storage.save()

In [None]:
# Transfer all terms from Biochemistry to Global with translation

# bio_storage.remove_all_uuids()
# bio_storage.save()

# r_terms = bio_storage.all_russian_terms(skip_empty=False)
# print(f"Russian terms count: {len(r_terms)}")

# for term_data in bio_storage.all_russian_terms(skip_empty=False):
    # print(f"Term: {term_data.termDef}")

def translate_terms(storage: StaticStorage, limit: int = 50):
    """Translate entries of ``storage`` that contain Russian text, in place.

    Each matching entry is sent to the model in its own fresh chat with
    ``translate_term_prompt`` as the system prompt; a non-empty answer replaces
    the entry's ``termDef``. The storage is saved after every translated entry
    so an interrupted run keeps the work done so far.

    Args:
        storage: Storage whose terms are translated and which is saved to disk.
        limit: Maximum number of entries to translate in this call; a value of
            zero or less translates nothing.
    """
    if limit <= 0:
        return

    engine = ChatEngine(temp=0.0)
    translated_count = 0
    for i, term_data in enumerate(storage.terms):
        if not term_data.has_russian():
            continue

        translator = TermChat(engine, translate_term_prompt)
        # Make sure this chat starts with an empty history so that earlier
        # translations are never sent along with the current entry.
        translator.clear_messages()
        print(f"Src term {i}       : {term_data.termDef}")
        answer = translator.get_answer(term_data.termDef)
        # Keep the original text when the model returns nothing.
        term_data.termDef = answer if answer else term_data.termDef
        print(f"Translated term {i}: {term_data.termDef}")

        translated_count += 1
        # Save before the limit check: breaking first would leave the last
        # translated entry unsaved.
        storage.save()

        # Report after counting, so the message states the real total.
        if translated_count % 10 == 0:
            print(f"Translated {translated_count} terms")

        if translated_count >= limit:
            break

# translate_terms(bio_storage, 100)

In [None]:
# Move terms from bio to global

# for i in range(len(bio_storage.terms)):
#     term_data = bio_storage.terms.pop(0)
#     term_data.area = 'chem'
#     glb_storage.terms.append(term_data)

# bio_storage.save()
# Print any formatting problems found in the global storage, then write it back
# to disk. validate() only reports; it does not prevent the save.
glb_storage.validate()
glb_storage.save()