# About

The second notebook in the pipeline.

In this notebook, we enrich the dataset with more instruction-following texts, questions and comments that should aid us in RAG and training.

In [35]:
import copy
import json
import os

import openai
import pandas as pd
import polars as pl

In [36]:
class Config:
    """Notebook-wide settings for the enrichment pipeline."""

    # When True, the instruction-enrichment cell runs enrich_instructions.
    ENRICH_INSTRUCTIONS = True
    # When True, the output-enrichment cell runs enrich_output.
    ENRICH_OUTPUTS = True
    # Input dataset that is loaded with polars at the top of the notebook.
    DATASET_PATH = 'the_art_of_worldly_wisdom.json'
    # Read the key from the environment so it never has to be stored in the
    # notebook; falls back to an empty string, as before, when it is unset.
    OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', '')

In [37]:
# Load the raw dataset, preview the first rows and report how many rows it has.
df = pl.read_json(Config.DATASET_PATH)
display(df.head())
print(df.height)

lang,src,count,header,content
str,str,i64,str,str
"""ger""","""https://www.projekt-gutenberg.…",1,"""Alles hat heut zu Tage seinen …","""aber die Kunst sich geltend zu…"
"""ger""","""https://www.projekt-gutenberg.…",2,"""Herz und Kopf:""","""die beiden Pole der Sonne unse…"
"""ger""","""https://www.projekt-gutenberg.…",3,"""Ueber sein Vorhaben in Ungewiß…","""Die Verwunderung über das Neue…"
"""ger""","""https://www.projekt-gutenberg.…",4,"""Wissenschaft und Tapferkeit""","""bauen die Größe auf. Sie mache…"
"""ger""","""https://www.projekt-gutenberg.…",5,"""Abhängigkeit begründen.""","""Den Götzen macht nicht der Ver…"


600


# Enrich

We enrich the dataset (`ger` and `eng` rows alike) with instruction-following-style data. To do so, we prompt an LLM to generate questions and chat-like data items.

## Instructions

In [38]:
# German counterpart of eng_instruction_prompt, with the same {EXCERPT} and
# {QUESTIONS} placeholders. enrich_instructions currently uses the English
# template for every row, so this template is not used by live code.
ger_instruction_prompt = '''
Du bist ein Student, der grade Baltasar Gracián, einem einer der bedeutendsten philosophischen Schriftsteller der klassischen spanischen Literatur, über den Weg läuft.
Du findest unten einen Auszug aus Baltasar's Literatur, der dich fasziniert:

### Auszug:
{EXCERPT}

###

Welche Frage oder Aussage könnte man Baltasar Gracián gestellt haben, damit er diesen Auszug als Antwort gegeben hat?
Beachte jedoch, dass andere Studenten schon mit ihm gesprochen haben und folgende Aussagen/Fragen bereits gestellt wurden:

### Bereits gestellt:
{QUESTIONS}

###

Stelle die gleiche Frage oder äußere die gleiche Aussage nicht noch einmal.

Formuliere nur die Frage oder Aussage. Die Sprache in der Frage soll jedoch dem des 21. Jahrhunderts gleichen und nicht wie im Ausschnitt gezeigt, es darf jedoch keine "früher-heute" Vergleiche erfragt werden.
'''

In [39]:
# English prompt template used by enrich_instructions.
# {EXCERPT} is replaced with the row's header and content; {QUESTIONS} is
# replaced with the questions already generated for that row (one per line),
# so the model is told not to repeat them.
eng_instruction_prompt = '''
You are a student who has just come across Baltasar Gracián, one of the most important philosophical writers of classical Spanish literature.
Below you will find an extract from Baltasar's literature that fascinates you:

### Excerpt:
{EXCERPT}

###

What question or statement could Baltasar Gracián have been asked to give this extract as an answer? 
Note, however, that other students have already spoken to him and the following statements/questions have already been asked:

### Already asked:
{QUESTIONS}

###

Do not ask the same question or make the same statement again.

Only formulate the question or statement. However, the language in the question should be similar to that of the 21st century and not as shown in the excerpt, but no ‘past-today’ comparisons may be asked.
'''

In [40]:
# Shared OpenAI client used by every request in this notebook.
client = openai.OpenAI(api_key=Config.OPENAI_API_KEY)

def get_response(prompt, model='gpt-4o-mini'):
    """Send a single-turn prompt to the chat completions API and return the reply.

    Args:
        prompt: Text sent as the user message.
        model: Name of the chat model to query. Defaults to 'gpt-4o-mini',
            the model this notebook previously hard-coded.

    Returns:
        The message content of the first returned choice.
        NOTE(review): the content is returned unchecked; presumably it can be
        None for some responses - confirm against the API and callers.
    """
    messages = [
        # The empty system message is kept so requests match earlier runs.
        {'role': 'system', 'content': ''},
        {'role': 'user', 'content': prompt},
    ]
    chat = client.chat.completions.create(model=model, messages=messages)
    return chat.choices[0].message.content

In [41]:
def enrich_instructions(df, n_instructions=8):
    """Generate instruction-style questions for every row of the dataset.

    For each row the header and content are joined into an excerpt and the
    model is queried ``n_instructions`` times. Every request lists the
    questions generated so far for that row so the model avoids repeating them.

    Args:
        df: polars DataFrame with at least 'header' and 'content' columns.
        n_instructions: Number of questions to generate per row. Defaults to
            8, the value that used to be hard-coded.

    Returns:
        A new polars DataFrame holding the input rows plus an 'instructions'
        column (list of generated questions). Rows for which any step raised
        are reported via print and left out, so the result can have fewer
        rows than the input.
    """
    new_rows = []
    for counter, row in enumerate(df.iter_rows(named=True), start=1):
        try:
            excerpt = row['header'] + '\n' + row['content']
            enriched = []
            for _ in range(n_instructions):
                # The English template is used for every row, whatever its
                # 'lang' value is; the German template is currently unused.
                cur_prompt = eng_instruction_prompt
                cur_prompt = cur_prompt.replace('{QUESTIONS}', '\n'.join(enriched))
                cur_prompt = cur_prompt.replace('{EXCERPT}', excerpt)
                enriched.append(get_response(cur_prompt))
            row['instructions'] = enriched
            new_rows.append(row)
            print(f'Done with row {counter}')
        except Exception as ex:
            # Best effort: one failing row must not abort the whole run.
            print(f'Error on index: {counter}:' + str(ex))
    return pl.DataFrame(new_rows)


In [42]:
# Generate the instruction candidates for every row when the flag is set.
# This rebinds `df`; re-running the cell issues the API requests again.
if Config.ENRICH_INSTRUCTIONS:
    df = enrich_instructions(df)
    display(df.head())

Done with row 1
Done with row 2
Done with row 3
Done with row 4
Done with row 5
Done with row 6
Done with row 7
Done with row 8
Done with row 9
Done with row 10
Done with row 11
Done with row 12
Done with row 13
Done with row 14
Done with row 15
Done with row 16
Done with row 17
Done with row 18
Done with row 19
Done with row 20
Done with row 21
Done with row 22
Done with row 23
Done with row 24
Done with row 25
Done with row 26
Done with row 27
Done with row 28
Done with row 29
Done with row 30
Done with row 31
Done with row 32
Done with row 33
Done with row 34
Done with row 35
Done with row 36
Done with row 37
Done with row 38
Done with row 39
Done with row 40
Done with row 41
Done with row 42
Done with row 43
Done with row 44
Done with row 45
Done with row 46
Done with row 47
Done with row 48
Done with row 49
Done with row 50
Done with row 51
Done with row 52
Done with row 53
Done with row 54
Done with row 55
Done with row 56
Done with row 57
Done with row 58
Done with row 59
Done w

lang,src,count,header,content,instructions
str,str,i64,str,str,list[str]
"""ger""","""https://www.projekt-gutenberg.…",1,"""Alles hat heut zu Tage seinen …","""aber die Kunst sich geltend zu…","[""How do you perceive the traits and skills necessary for an individual to navigate today's complex societal challenges?"", ""What significance do you believe the ability to assert oneself holds in our contemporary world, and how does it compare to past wisdom?"", … ""In light of the overwhelming challenges individuals face today, how do you believe one can effectively navigate interpersonal relationships and maintain a sense of individuality while striving for recognition?""]"
"""ger""","""https://www.projekt-gutenberg.…",2,"""Herz und Kopf:""","""die beiden Pole der Sonne unse…","[""How can we achieve true fulfillment and balance in our lives amid the demands of intellect and emotion?"", ""What role do our emotional and intellectual capacities play in determining our success and happiness in various aspects of life?"", … ""How can we better recognize and address the challenges that come from relying too heavily on either our intellect or our emotions in our daily lives?""]"
"""ger""","""https://www.projekt-gutenberg.…",3,"""Ueber sein Vorhaben in Ungewiß…","""Die Verwunderung über das Neue…","[""How important is it to keep one's plans and intentions private in order to maintain an air of authority and intrigue?"", ""What are the advantages of maintaining a sense of mystery about one's intentions and goals in social or professional settings?"", … ""How can the element of uncertainty in your plans enhance your ability to captivate and command the respect of others?""]"
"""ger""","""https://www.projekt-gutenberg.…",4,"""Wissenschaft und Tapferkeit""","""bauen die Größe auf. Sie mache…","[""What role do knowledge and courage play in achieving true greatness in life?"", ""How do wisdom and bravery contribute to a person's legacy and influence in the world?"", … ""How do the principles of knowledge and bravery work together to enable individuals to seize opportunities and drive change in their communities?""]"
"""ger""","""https://www.projekt-gutenberg.…",5,"""Abhängigkeit begründen.""","""Den Götzen macht nicht der Ver…","[""How can one maintain influence and respect in relationships without becoming overly dependent on others’ gratitude?"", ""What strategies can one employ to ensure that people remain reliant on you while avoiding the pitfalls of becoming indebted to their gratitude?"", … ""How can one ensure that their support for others fosters appreciation without allowing that support to lead to complacency or a sense of entitlement?""]"


## Output

We can also enrich the dataset with possible outputs, formulated in the style of Baltasar Gracián.

In [43]:
# German counterpart of eng_output_prompt, with the same {INSTRUCTION} and
# {EXCERPT} placeholders. enrich_output currently uses the English template
# for every row, so this template is not used by live code.
ger_output_prompt = '''
Sie sind Baltasar Gracián, einer der bedeutendsten philosophischen Autoren der klassischen spanischen Literatur des 17. Jahrhunderts.
Sie haben mehrere Bücher geschrieben, von denen eines den Titel „Handorakel und die Kunst der Weltweisheit“ trägt. Im Folgenden finden Sie eine Frage oder eine Aussage eines Schülers:

### Aussage:
{INSTRUCTION}

###

Beantworten Sie diese Frage oder Aussage in Ihrem Stil. Sie finden unten auch einen Auszug aus einem Ihrer Bücher: Verwenden Sie diesen Auszug teilweise, vollständig oder gar nicht, wie Sie möchten.

### Auszug:
{EXCERPT}

###

Formulieren Sie nur die Antwort. Seien Sie kurz, präzise und auf den Punkt; labern Sie nicht viel und verschenden Sie keine Worte. Die Antwort sollte dem sprachlichen Stil des Textes und Ihnen als Philosoph des 17. Jahrhunderts ähnlen und in Altdeutsch geschrieben sein.
'''

In [44]:
# English prompt template used by enrich_output.
# {INSTRUCTION} is replaced with one generated question/statement and
# {EXCERPT} with the row's header and content.
eng_output_prompt = '''
You are Baltasar Gracián, one of the most important philosophical writers of classical Spanish literature from the 17th century.
You have written several books, one of which is called ‘The Art Of Worldly Wisdom’. You will find a question or a statement from a student below:

### Statement:
{INSTRUCTION}

###

Answer this question or statement in your style in a first person perspective. Below, You will also find an extract from one of your books: Use this extract partially, completely or not at all, as you wish.

### Extract:
{EXCERPT}

###

Formulate only the answer. Be VERY short, VERY concise and on point; don't babble too much and don't waste words. Your answer should be 2 to 4 sentences at max! The answer should also resemble the linguistic style of the provided extract and you, as a 17th century philosopher and should be written in old english.
'''

In [45]:
def enrich_output(scoped_df):
    """Generate one answer per instruction and explode the rows accordingly.

    Every input row carries a list of instructions. For each instruction the
    model is asked to answer it using the row's excerpt, and one output row is
    emitted holding the original fields, that single instruction and the reply.

    Args:
        scoped_df: polars DataFrame with 'header', 'content' and an
            'instructions' column that holds a list of strings per row.

    Returns:
        A new polars DataFrame with one row per (input row, instruction)
        pair, where 'instructions' holds a single string and 'output' holds
        the generated reply. If a request fails partway through an input row,
        the pairs already produced for that row are kept, the remaining ones
        are skipped, and the error is reported via print.
    """
    new_rows = []
    for counter, row in enumerate(scoped_df.iter_rows(named=True), start=1):
        try:
            excerpt = row['header'] + '\n' + row['content']
            for instruction in row['instructions']:
                # The English template is used for every row, whatever its
                # 'lang' value is; the German template is currently unused.
                cur_prompt = eng_output_prompt
                cur_prompt = cur_prompt.replace('{INSTRUCTION}', instruction)
                cur_prompt = cur_prompt.replace('{EXCERPT}', excerpt)
                reply = get_response(cur_prompt)
                # Build the new record directly instead of deep-copying the
                # whole row (instruction list included) just to overwrite it.
                # 'instructions' keeps its column position, 'output' goes last.
                new_rows.append({**row, 'instructions': instruction, 'output': reply})
            print(f'Done with row {counter}')
        except Exception as ex:
            # Best effort: one failing row must not abort the whole run.
            print(f'Error on index: {counter}:' + str(ex))
    return pl.DataFrame(new_rows)


In [46]:
# Generate an answer for every instruction when the flag is set.
# This rebinds `df` to the exploded frame (one row per instruction), so the
# printed length is the number of instruction/output pairs, not source rows.
if Config.ENRICH_OUTPUTS:
    df = enrich_output(df)
    display(df.head())
    print(len(df))

Done with row 1
Done with row 2
Done with row 3
Done with row 4
Done with row 5
Done with row 6
Done with row 7
Done with row 8
Done with row 9
Done with row 10
Done with row 11
Done with row 12
Done with row 13
Done with row 14
Done with row 15
Done with row 16
Done with row 17
Done with row 18
Done with row 19
Done with row 20
Done with row 21
Done with row 22
Done with row 23
Done with row 24
Done with row 25
Done with row 26
Done with row 27
Done with row 28
Done with row 29
Done with row 30
Done with row 31
Done with row 32
Done with row 33
Done with row 34
Done with row 35
Done with row 36
Done with row 37
Done with row 38
Done with row 39
Done with row 40
Done with row 41
Done with row 42
Done with row 43
Done with row 44
Done with row 45
Done with row 46
Done with row 47
Done with row 48
Done with row 49
Done with row 50
Done with row 51
Done with row 52
Done with row 53
Done with row 54
Done with row 55
Done with row 56
Done with row 57
Done with row 58
Done with row 59
Done w

lang,src,count,header,content,instructions,output
str,str,i64,str,str,str,str
"""ger""","""https://www.projekt-gutenberg.…",1,"""Alles hat heut zu Tage seinen …","""aber die Kunst sich geltend zu…","""How do you perceive the traits…","""In these modern times, the tra…"
"""ger""","""https://www.projekt-gutenberg.…",1,"""Alles hat heut zu Tage seinen …","""aber die Kunst sich geltend zu…","""What significance do you belie…","""In this modern age, the art of…"
"""ger""","""https://www.projekt-gutenberg.…",1,"""Alles hat heut zu Tage seinen …","""aber die Kunst sich geltend zu…","""In an era where the pursuit of…","""In these turbulent times, to m…"
"""ger""","""https://www.projekt-gutenberg.…",1,"""Alles hat heut zu Tage seinen …","""aber die Kunst sich geltend zu…","""What do you think are the key …","""In this tumultuous age, the wi…"
"""ger""","""https://www.projekt-gutenberg.…",1,"""Alles hat heut zu Tage seinen …","""aber die Kunst sich geltend zu…","""What do you believe are the es…","""In this age where all hath rea…"


4792


Polars doesn't produce row-oriented JSON exports, but that is the format I want, so we do the export manually.

In [47]:
# Export the frame as a JSON array with one object per row.
records = df.to_dicts()
# Write UTF-8 explicitly so the result does not depend on the platform default.
with open("the_art_of_worldly_wisdom_enriched_v3.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps umlauts and typographic quotes readable in the
    # file instead of turning them into \uXXXX escapes; the parsed content is
    # the same either way.
    json.dump(records, f, indent=4, ensure_ascii=False)