In [1]:
import os
from IPython.display import display, Markdown
import openai
from openai import OpenAI

def pretty(obj):
    display(Markdown(str(obj)))



'vocabulary/dict/english.txt'

In [114]:
file_path = os.path.join('../vocabulary', 'dict', 'english.txt')
file_path

### Models

In [3]:
import langchain_openai
from langchain_openai import ChatOpenAI # type: ignore

api_key = os.getenv("OPENAI_API_KEY")

chat_default = ChatOpenAI(model='gpt-4o-mini', temperature=0.7, openai_api_key=api_key)
chat_reliable = ChatOpenAI(model='gpt-4o-mini', temperature=0.0, openai_api_key=api_key)
chat_creative = ChatOpenAI(model='gpt-4o-mini', temperature=1.5, openai_api_key=api_key)

all_chats = [chat_default, chat_reliable, chat_creative]

In [4]:
def ask_all(prompt, chats: list[ChatOpenAI]=all_chats):
    for chat in all_chats:
        print(chat.invoke(prompt).content + '\n')

In [5]:
ask_all('hi')

Hello! How can I assist you today?

Hello! How can I assist you today?

Hello! How can I assist you today?



### Templates

In [120]:
from langchain.prompts import ChatPromptTemplate, FewShotPromptTemplate # type: ignore

basic_prompt_template = ChatPromptTemplate.from_template(
        """
        explain word ```{word}``` using Cambridge Dictionary in the following format:
        word [part of speech]
        ()
        1. explanation
        (if there is more than one explanation you can list them using subsequent numbers)

        I will give you some example for word \'battered\':
        battered [adjective]
        ()
        1. hurt by being repeatedly hit
        2. damaged, especially by being used a lot

        If You cannot find word please tell me that i probably made mistake and don't try to forcefully come up with something. 
        You may give me instead suggestions with similar words in case I just misspelled it. 
        For example if i write word \'beffled\' You can respond:
        Cannot find \'beffled\' but I have found word(s) with similar spelling:
        1. baffled
        """
)

pretty(basic_prompt_template.messages[0].prompt.template)


        explain word ```{word}``` using Cambridge Dictionary in the following format:
        word [part of speech]
        ()
        1. explanation
        (if there is more than one explanation you can list them using subsequent numbers)

        I will give you some example for word 'battered':
        battered [adjective]
        ()
        1. hurt by being repeatedly hit
        2. damaged, especially by being used a lot

        If You cannot find word please tell me that i probably made mistake and don't try to forcefully come up with something. 
        You may give me instead suggestions with similar words in case I just misspelled it. 
        For example if i write word 'beffled' You can respond:
        Cannot find 'beffled' but I have found word(s) with similar spelling:
        1. baffled
        

### Structured response

In [7]:
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

In [8]:
response_schemas = [
    ResponseSchema(name="word", description="base form of the word used in dictionary"),
    ResponseSchema(name="part_of_speech", description="what part of speech it is"), 
    ResponseSchema(name="translations", description="list of most relevant translations. You can use more than one but don't put to many if it's not necessary.", type='array'),
    ResponseSchema(name="comment", description="if You find relevant words You can keep this part empty. But if You cannot find anything please tell me that I probably made mistake and don't try to forcefully come up with something. You may suggest similar words instead. ")
]

output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

structured_format_instructions = output_parser.get_format_instructions()

In [9]:
structured_prompt_template_str = """
        explain following word using Cambridge Dictionary in the following format:
        
        word: base form of the word used in dictionary
        part_of_speech: what part of speech it is
        translations: list of most relevant translations. You can use more than one but don't put to many if it's not necessary.
        comment: if You find relevant words You can keep this part empty. But if You cannot find anything please tell me that I probably made mistake and don't try to forcefully come up with something. You may suggest similar words instead. 

        For example if I write word \'battered\' You can respond:
        word: battered
        part_of_speech: adjective
        translations: ['hurt by being repeatedly hit', 'damaged, especially by being used a lot']
        comment: 

        For example if I write word \'beffled\' You can respond:
        word: 
        part_of_speech: 
        translations: 
        comment: Cannot find 'beffled' but I have found word(s) with similar spelling: [baffled; muffled]

        word: {word}
        format: {format_instructions}
        """

structured_prompt_template = ChatPromptTemplate.from_template(
    template=structured_prompt_template_str
)

In [10]:
def ask_generic(word: str, 
                prompt: ChatPromptTemplate = structured_prompt_template, 
                format_instructions: str = structured_format_instructions, output_parser: StructuredOutputParser | None = output_parser, 
                model: langchain_openai.ChatOpenAI | openai.OpenAI = chat_default
                ) -> dict | str: # return type depends on output_parser
    
    messages = prompt.format_messages(word=word, format_instructions=format_instructions)

    if type(model) == langchain_openai.ChatOpenAI:
        response = model.invoke(messages)
    elif type(model) == openai.OpenAI:
        completion = model.chat.completions.create(
        model="gpt-4o-mini", # why don't take it from a model put in arguments ?
        messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {
                    "role": "user",
                    "content": basic_prompt_template.format_messages(word='harness', format_instructions=None)[0].content
                }
            ]
        )

        response = completion.choices[0].message    
    else:
        raise Exception(f'only {langchain_openai.ChatOpenAI} and {openai.OpenAI} are acceptable as model types!')
    
    if output_parser is None:
        return response.content
    
    return output_parser.parse(response.content)


In [11]:
def ask(word: str, prompt: ChatPromptTemplate = structured_prompt_template, format_instructions: str = structured_format_instructions, output_parser: StructuredOutputParser | None = output_parser, model = chat_default) -> dict:
    messages = prompt.format_messages(word=word, format_instructions=format_instructions)

    response = model.invoke(messages)

    if output_parser is None:
        return response.content
    
    return output_parser.parse(response.content)

In [12]:
ask_generic('harness', prompt=basic_prompt_template, format_instructions=None, output_parser=None, model=chat_default)

'harness [noun]\n()\n1. a set of straps and fittings by which a horse or other animal is fastened to a cart, plow, etc., and which is also used to control the animal.\n2. a similar set of straps used for holding something in place or for supporting someone, especially a safety harness.\n  \nharness [verb]\n()\n1. to put a harness on a horse or other animal.\n2. to control something, usually in order to use its power or to achieve something.'

In [13]:
ask_generic('harness', prompt=basic_prompt_template, format_instructions=None, output_parser=None, model=chat_openai)

'harness [noun]\n()\n1. a set of straps used to attach a person or an animal to something, especially a vehicle, so that they can pull it\n2. a piece of equipment to control a horse or other animal\n\nharness [verb]\n()\n1. to put a harness on an animal\n2. to control and make use of (natural resources, human resources, etc.)'

In [14]:
ask_generic('harness', prompt=structured_prompt_template, format_instructions=structured_format_instructions, output_parser=output_parser, model=chat_default)

{'word': 'harness',
 'part_of_speech': 'noun, verb',
 'translations': ['a device for holding something in place',
  'to control and make use of (natural resources, etc.)'],
 'comment': ''}

#### prompt for modifying schema
i have such schema in langchain:
response_schemas = [
    ResponseSchema(name="word", description="base form of the word used in dictionary"),
    ResponseSchema(name="part_of_speech", description="what part of speech it is"), 
    ResponseSchema(name="translations", description="list of most relevant translations. You can use more than one but don't put to many if it's not necessary.", type='array'),
    ResponseSchema(name="comment", description="if You find relevant words You can keep this part empty. But if You cannot find anything please tell me that I probably made mistake and don't try to forcefully come up with something. You may suggest similar words instead. ")
]

it gives me output like this:
{'word': 'harness',
 'part_of_speech': 'noun',
 'translations': ['a set of straps and belts',
  'to control and make use of (something)'],
 'comment': ''}

now i'd like to modify it so the output will be list of such dictionaries. for example for 'harness' it may be:
{'harness': [
    {
        'part_of_speech': 'noun',
        'translations': ['a set of straps and belts',
        'to control and make use of (something)'],
        'comment': ''
    }, 
    {
        'part_of_speech': 'verb',
        'translations': ['to put a harness on a horse or other animal',
        'to use something for a particular purpose, especially for energy or power'],
        'comment': ''
    }
    ]
}


after few more prompts we arrived at this:
from langchain.output_parsers import JsonOutputKeyParser

json_schema = {
    "type": "object",
    "properties": {
        "words": {
            "type": "object",
            "additionalProperties": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "part_of_speech": {
                            "type": "string",
                            "description": "The grammatical category of the word (e.g., noun, verb)."
                        },
                        "translations": {
                            "type": "array",
                            "items": {"type": "string"},
                            "description": "List of most relevant translations."
                        },
                        "comment": {
                            "type": "string",
                            "description": "Optional comment about the translations. If the word is unclear, suggest alternatives."
                        }
                    },
                    "required": ["part_of_speech", "translations"]  # Ensure mandatory fields
                }
            }
        }
    },
    "required": ["words"]
}

output_parser = JsonOutputKeyParser(key_name="words", json_schema=json_schema)

needs to be checked

generally structured can be more complex - include ex. lang, only_plural etc. 
or maybe different schemas for different languages

In [15]:
ask('baffled')

{'word': 'baffled',
 'part_of_speech': 'adjective',
 'translations': ['confused', 'puzzled'],
 'comment': ''}

In [16]:
ask('baffled', prompt=basic_prompt_template, format_instructions=None, output_parser=None)

'baffled [adjective]  \n()  \n1. unable to understand or explain something; confused or perplexed.  \n2. surprised and confused; unable to think clearly.'

In [17]:
ask('beffled')

{'word': 'beffled',
 'part_of_speech': '',
 'translations': [],
 'comment': "Cannot find 'beffled' but I have found word(s) with similar spelling: [baffled; muffled]"}

In [18]:
ask('beffled', prompt=basic_prompt_template, format_instructions=None, output_parser=None)

"Cannot find 'beffled' but I have found word(s) with similar spelling:\n1. baffled"

In [19]:
ask('harness')

{'word': 'harness',
 'part_of_speech': 'noun',
 'translations': ['a set of straps and fittings by which a horse or other animal is fastened to a vehicle',
  'a device used to control or manage something'],
 'comment': ''}

In [20]:
ask('harness', prompt=basic_prompt_template, format_instructions=None, output_parser=None)

'harness [noun]  \n()  \n1. a set of straps and belts used to control a horse or other animal, or to hold something in place  \n2. a piece of equipment that is worn to help support or protect the body, often used in activities like climbing or skydiving  \n\nharness [verb]  \n()  \n1. to control and make use of something, especially a resource or an energy source  '

### Q&A over documents (RAG)

In [28]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
import os

In [22]:
file_path = os.path.join('../vocabulary', 'dicts', 'english.txt')
loader = TextLoader(file_path)
docs = loader.load()

In [81]:
# neither of them ensures one dict entry per chunk!

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,   # Doesn't matter much since we are enforcing strict splitting
    chunk_overlap=0,   # No overlap needed
    separators=["\n\n"]  # Ensure splitting only happens at double newlines
)

# text_splitter = CharacterTextSplitter(
#     separator="\n\n",
#     chunk_size=10000,    # Large enough to not split within entries
#     chunk_overlap=0,
#     strip_whitespace=True,
#     is_separator_regex=False
# )

# text_splitter = CharacterTextSplitter(
#     separator="\n\n",
#     chunk_size=1000,      # Smaller chunk size
#     chunk_overlap=0,
#     length_function=len,
#     is_separator_regex=False,
#     strip_whitespace=True
# )

# text_splitter = CharacterTextSplitter(
#     separator="\n\n",
#     chunk_size=100000,
#     chunk_overlap=0
# )

docs = splitter.split_documents(docs)

In [82]:
len(docs)

49

In [83]:
docs[0].page_content[-100:]

'cy it was)\n\nfancy [adj]\n(I wanted a simple black dress, nothing fancy) \n1. decorative or complicated'

In [84]:
from langchain.docstore.document import Document

with open(file_path, 'r') as file:
    content = file.read()

# Split on double newlines and filter out empty entries
entries = [entry.strip() for entry in content.split('\n\n') if entry.strip()]

# Convert each entry to a Document
docs = [
    Document(page_content=entry, metadata={"source": file_path}) 
    for entry in entries
]

In [85]:
len(docs)

240

In [86]:
pretty(docs[0].page_content)

impel [verb]
(I never read medicine advertisement without being impelled to the conclusion that I am suffering from the particular disease) [Three Men in a Boat (to say nothing of the dog)]
1. to make someone feel that they must do something
2. to force someone to do something

In [87]:
pretty(docs[1].page_content)

a touch of [phrase]
(Slight ailment of which I had a touch) [Three Men in a Boat (to say nothing of the dog)] 
1. a small amount of (something) : a hint or trace of (something)

In [70]:
from langchain_openai import OpenAIEmbeddings
embedder = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=api_key)

In [71]:
embedded_query = embedder.embed_query("hello world")

In [72]:
len(embedded_query)

1536

In [73]:
embedded_query[:10]

[-0.01609949767589569,
 0.0013686870224773884,
 -0.01948472298681736,
 -0.0336947925388813,
 -0.026005873456597328,
 0.007675800006836653,
 -0.024890584871172905,
 -0.0003144945949316025,
 -0.013002936728298664,
 -0.021689055487513542]

In [74]:
long_str = "lorep ipsum " * 3000
len(long_str)

36000

In [75]:
long_query_embedded = embedder.embed_query(long_str)

In [88]:
len(long_query_embedded)
long_query_embedded[:2]

[-0.005008402904363089, 0.016765096406017768]

In [89]:
embedded_query[:2]

[-0.01609949767589569, 0.0013686870224773884]

In [131]:
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.chains import RetrievalQA

db = DocArrayInMemorySearch.from_documents(
    docs, 
    embedder
)

retriever = db.as_retriever()

def print_retrieved_content(query):
    for x in db.similarity_search(query):
        pretty(x.page_content)

qa_chain = RetrievalQA.from_chain_type(
    llm=chat_default, 
    chain_type='stuff',
    retriever=retriever,
    verbose=True
)

def print_qa_content(query):
    pretty(qa_chain(query)['result'])

In [123]:
query1 = 'becoming weaker'
query2 = 'becoming less strong'

In [None]:
print_retrieved_content('becoming weaker')

feeble [adj]
("Get me out of this" was the feeble reply)
1. weak and without energy, strength, or power

frail [adjective]
()
1. weak and delicate
2. easily damaged or broken

senescence [noun]
(We have done theoretical work on the evolution of trade-offs, senescence and morality)
1. the fact of becoming older, and therefore being in less good condition and less able to function well

impair [verb]
(Why do we choose to impair and disrupt our own cognition ?) [A hunter-gatherer's guide to the 21st century]
1. to spoil something or make it weaker so that it is less effective

In [126]:
print_retrieved_content('becoming less strong')

feeble [adj]
("Get me out of this" was the feeble reply)
1. weak and without energy, strength, or power

subside [verb]
(This subsided but interest was again revived)
1. to become less strong or loud

frail [adjective]
()
1. weak and delicate
2. easily damaged or broken

senescence [noun]
(We have done theoretical work on the evolution of trade-offs, senescence and morality)
1. the fact of becoming older, and therefore being in less good condition and less able to function well

In [132]:
print_qa_content('becoming weaker')



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


The term "becoming weaker" can relate to several concepts, such as "feeble," which describes something that is weak and without energy, or "frail," which refers to being weak and delicate. Additionally, "senescence" refers to the process of aging, which often involves becoming less effective or in a less good condition over time.

In [133]:
print_qa_content('becoming less strong')



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


The term that describes becoming less strong is "subside."

### Evaluation