In [11]:
import guidance
from guidance import models, gen, any_char, any_char_but, regex, substring, substring_no_empty, with_temperature, select

In [2]:
mistral = models.LlamaCpp("/Users/nicholasking/code/models/mixtral-8x7b-instruct-v0.1.Q3_K_M.gguf", n_gpu_layers=-1, n_ctx=4096)

# Dictionary Entry
A typical dictionary entry consists of the following components:

- Headword: The word being defined, usually in bold or a different color.
- Part of speech: The grammatical category of the word (e.g., noun, verb, adjective).
- Pronunciation: The way the word is pronounced, often using the International Phonetic Alphabet (IPA).
- Etymology: The origin and historical development of the word.
- Definition(s): The meaning(s) of the word, often with multiple senses listed numerically.
- Example sentences: Sentences illustrating the word in context.
- Synonyms: Words with similar meanings.
- Antonyms: Words with opposite meanings.
- Derived forms: Variations of the word (e.g., plurals, verb tenses).

## Ideas
- Monolingual entries for a few words
- Force too many entries for a single word
- Given a corpus of new slang from tiktok, generate a dictionary entry for the slang word
- Bilingual dictionary entry

In [10]:
newline_char = "\n"
dict_entry_instructions = mistral + f"""# Instructions
You are an expert lexicographer tasked with writing dictionary entries. Your dictionary entry should have the following components:

- Headword: The word being defined.
- Part of speech: The grammatical category of the word (e.g., noun, verb, adjective).
- Pronunciation: The way the word is pronounced, using the International Phonetic Alphabet (IPA).
- Etymology: The origin and historical development of the word.
- Definition(s): The meaning(s) of the word, often with multiple senses listed numerically.
- Example sentences: Sentences illustrating the word in context.
- Synonyms: Words with similar meanings.
- Antonyms: Words with opposite meanings.
- Inflections: Variations of the word (e.g., plurals, verb tenses).

You are writing a dictionary targeted toward an audience with a high school English level. Your entry should be at an appropriate level
for this audience.

"""

dict_entry_gen1 = dict_entry_instructions + f"""# Dictionary entry

Headword: solid
Part of speech: {gen(max_tokens=5, stop=newline_char)}
Pronunciation: {gen(max_tokens=10, stop=newline_char)}
Etymology: {gen(max_tokens=50, stop=newline_char)}
Definition:
  1. {gen(max_tokens=100, stop=newline_char)}
  2. {gen(max_tokens=100, stop=newline_char)}
Example sentences:
  1. {gen(max_tokens=100, stop=newline_char)}
  2. {gen(max_tokens=100, stop=newline_char)}
Synonyms: {gen(max_tokens=100, stop=newline_char)}
Antonyms: {gen(max_tokens=100, stop=newline_char)}
Inflections: {gen(max_tokens=100, stop=newline_char)}
"""

### Examples of reusable guidance functions

```python
@guidance
def quoted_list(lm, name, n):
    for i in range(n):
        if i > 0:
            lm += ", "
        lm += '"' + gen(name, list_append=True, stop='"') + '"'
    return lm

@guidance
def generate_character(
    lm,
    character_one_liner,
    weapons: list[str] = sample_weapons,
    armour: list[str] = sample_armor,
    n_items: int = 3
):
    lm += f'''\
    {{
        "description" : "{character_one_liner}",
        "name" : "{gen("character_name", stop='"')}",
        "age" : {gen("age", regex="[0-9]+")},
        "armour" : "{select(armour, name="armor")}",
        "weapon" : "{select(weapons, name="weapon")}",
        "class" : "{gen("character_class", stop='"')}",
        "mantra" : "{gen("mantra", stop='"')}",
        "strength" : {gen("age", regex="[0-9]+")},
        "quest_items" : [{quoted_list("quest_items", n_items)}]
    }}'''
    return lm
```

In [14]:
@guidance
def dict_entry_gen(lm, word: str):
    lm += f"""# Dictionary entry
Headword: {word}
Part of speech: {gen(max_tokens=5, stop=newline_char)}
Pronunciation: {gen(max_tokens=10, stop=newline_char)}
Etymology: {gen(max_tokens=50, stop=newline_char)}
Definition:
  1. {gen(max_tokens=100, stop=newline_char)}
  2. {gen(max_tokens=100, stop=newline_char)}
Example sentences:
  1. {gen(max_tokens=100, stop=newline_char)}
  2. {gen(max_tokens=100, stop=newline_char)}
Synonyms: {gen(max_tokens=100, stop=newline_char)}
Antonyms: {gen(max_tokens=100, stop=newline_char)}
Inflections: {gen(max_tokens=100, stop=newline_char)}
"""
    return lm

dict_entry_gen2 = mistral + dict_entry_gen("platinum")

In [15]:
dict_entry_gen3 = mistral + dict_entry_gen("force")

In [16]:
dict_entry_gen4 = mistral + dict_entry_gen("solar")

In [24]:
nl = "\n"
ipa_regex = r'[\u0250-\u02AF\u1D00-\u1D7F\u1D80-\u1DBF]+'

# Not working because guidance regex doesn't support multi-byte chars yet
@guidance
def dict_entry_guided_ipa_regex(lm, word: str, num_defs=1):
    if num_defs < 1:
        num_defs = 1
    parts_of_speech = ["noun", "verb", "adjective", "adverb", "pronoun", "preposition", "conjunction", "interjection", "article", "numeral"]
    lm += f"""# Dictionary entry
Headword: {word}
Part of speech: {select(parts_of_speech, recurse=True)}
Pronunciation: /{regex(ipa_regex)}/
Etymology: {gen(max_tokens=50, stop=newline_char)}
Definition:"""
    for i in range(num_defs):
        lm += f"{nl}  {i+1}. {gen(max_tokens=100, stop=newline_char)}"
    lm += f"""{nl}Example sentences:"""
    for i in range(num_defs):
        lm += f"{nl}  {i+1}. {gen(max_tokens=100, stop=newline_char)}"
    lm += f"""{nl}Synonyms: {gen(max_tokens=100, stop=newline_char)}
Antonyms: {gen(max_tokens=100, stop=newline_char)}
Inflections: {gen(max_tokens=100, stop=newline_char)}
"""
    return lm

In [25]:
dict_entry_guided1 = mistral + dict_entry_guided("solar")

ValueError: invalid literal for int() with base 16: '|0|2'

In [30]:
ipa_chars = [chr(i) for i in range(0x0250, 0x02AF + 1)] + [chr(i) for i in range(0x1D00, 0x1D7F + 1)] + [chr(i) for i in range(0x1D80, 0x1DBF + 1)]

@guidance
def dict_entry_guided_ipa_select(lm, word: str, num_defs=1):
    if num_defs < 1:
        num_defs = 1
    parts_of_speech = ["noun", "verb", "adjective", "adverb", "pronoun", "preposition", "conjunction", "interjection", "article", "numeral"]
    lm += f"""# Dictionary entry
Headword: {word}
Part of speech: {select(parts_of_speech, recurse=True, name="part_of_speech")}
Pronunciation: /{select(ipa_chars, recurse=True, name="ipa")}/
Etymology: {gen(max_tokens=50, stop=nl, temperature=0.5)}
Definition:"""
    for i in range(num_defs):
        lm += f"{nl}  {i+1}. {gen(max_tokens=100, stop=nl, temperature=0.5)}"
    lm += f"""{nl}Example sentences:"""
    for i in range(num_defs):
        lm += f"{nl}  {i+1}. {gen(max_tokens=100, stop=nl, temperature=0.5)}"
    lm += f"""{nl}Synonyms: {gen(max_tokens=100, stop=nl, temperature=0.5)}
Antonyms: {gen(max_tokens=50, stop=nl, temperature=0.5)}
Inflections: {gen(max_tokens=50, stop=nl, temperature=0.5)}
"""
    return lm

### Interesting note about parts of speech
Had to add (s) to part(s) of speech so that the model would possibly generate multiple parts of speech for polysemous words.

In [44]:
@guidance
def dict_entry_guided(lm, word: str, num_defs=1, temperature=0.5):
    if num_defs < 1:
        num_defs = 1
    parts_of_speech = ["noun", "verb", "adjective", "adverb", "pronoun", "preposition", "conjunction", "interjection", "article", "numeral", ", "]
    lm += f"""# Dictionary entry
Headword: {word}
Part(s) of speech: {select(parts_of_speech, recurse=True, name="part_of_speech")}
Pronunciation: /{gen(max_tokens=20, stop=[nl, "/"])}/
Etymology: {gen(max_tokens=50, stop=nl, temperature=temperature)}
Definition:"""
    for i in range(num_defs):
        lm += f"{nl}  {i+1}. {gen(max_tokens=100, stop=nl, temperature=temperature)}"
    lm += f"""{nl}Example sentences:"""
    for i in range(num_defs):
        lm += f"{nl}  {i+1}. {gen(max_tokens=100, stop=nl, temperature=temperature)}"
    lm += f"""{nl}Synonyms: {gen(max_tokens=100, stop=nl, temperature=temperature)}
Antonyms: {gen(max_tokens=50, stop=nl, temperature=temperature)}
Inflections: {gen(max_tokens=50, stop=nl, temperature=temperature)}
"""
    return lm

In [40]:
dict_entry_g1 = mistral + dict_entry_guided("solar")

In [41]:
dict_entry_g2 = mistral + dict_entry_guided("fast", num_defs=5)

In [43]:
dict_entry_g2 = mistral + dict_entry_guided("fast", num_defs=30)

In [45]:
dict_entry_g2 = mistral + dict_entry_guided("fast", num_defs=30, temperature=0.95)

What happens if we try to give a nonsense word?

In [48]:
dict_entry_g3 = mistral + dict_entry_guided("fapharough", num_defs=5, temperature=0.75)

What happens if we give a non-English, romance language word?

In [49]:
dict_entry_g5 = mistral + dict_entry_guided("incroyable", num_defs=5, temperature=0.75)

In [50]:
# Try giving a french verb to test inflections
dict_entry_g5 = mistral + dict_entry_guided("vivre", num_defs=5, temperature=0.75)

In [51]:
# an interesting verb in german (according to Claude)
dict_entry_g6 = mistral + dict_entry_guided("verschlimmbessern", num_defs=5, temperature=0.75)

The above result probably output english sentences because the metaprompt specifies that we're writing an english dictionary

In [52]:
# Now let's try a non-western language
dict_entry_g6 = mistral + dict_entry_guided("漂亮", num_defs=2, temperature=0.75)

In [53]:
dict_entry_g6 = mistral + dict_entry_guided("美しい", num_defs=2, temperature=0.75)

## Idea - write a kanji using hiragana

In [None]:
@guidance
