# Data Processing Pipeline for [The Life You Were Born to Live](https://www.peacefulwarrior.com/the-life-you-were-born-to-live/) by *Dan Millman*

In [None]:
import os
import time

import pymupdf
import google.generativeai as genai

import google.generativeai.types as gtypes
from google.ai.generativelanguage_v1beta.types import content

## Stage 0: Preparations

First we define a few constants, configure the Google Gemini client, and create the helper functions and models used by the later stages.

In [None]:
# Constants
# One directory tree per language, one sub-folder per artifact kind.
DIR_TR, DIR_EN = "./tr", "./en"
PDF_TR, PDF_EN = (f"{root}/PDFs" for root in (DIR_TR, DIR_EN))
MD_TR, MD_EN = (f"{root}/MDs" for root in (DIR_TR, DIR_EN))
SUMM_TR, SUMM_EN = (f"{root}/Summarizations" for root in (DIR_TR, DIR_EN))
JSON_TR, JSON_EN = (f"{root}/JSONs" for root in (DIR_TR, DIR_EN))
JSON_TR_EXT = f"{DIR_TR}/JSONs_Extended"

# Source PDF of the book.
BOOK = "./millman_1995.pdf"

# Number of seconds to sleep between status polls and between requests.
TIMEOUT_IN_SECONDS = 15

In [None]:
# Setup Google Gemini
# The API key is read from the GEMINI_API_KEY environment variable; indexing
# os.environ raises KeyError when that variable is not set.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

In [None]:
def upload_to_gemini(path, mime_type=None) -> gtypes.file_types.File:
    """Upload the file at ``path`` to Gemini and return the resulting handle.

    The display name and URI of the uploaded file are printed as a progress
    message. ``mime_type`` is forwarded unchanged to ``genai.upload_file``.

    See https://ai.google.dev/gemini-api/docs/prompting_with_media
    """
    uploaded = genai.upload_file(path, mime_type=mime_type)
    print(f"Uploaded file '{uploaded.display_name}' as: {uploaded.uri}")
    return uploaded

In [None]:
def wait_for_files_active(files) -> None:
    """Block until every given Gemini file has finished processing.

    Some files uploaded to the Gemini API need to be processed before they can be
    used as prompt inputs. The status can be seen by querying the file's "state"
    field.

    This implementation uses a simple blocking polling loop that sleeps
    TIMEOUT_IN_SECONDS between polls and has no upper bound on the total wait.
    Production code should probably employ a more sophisticated approach.

    Args:
        files: Iterable of uploaded file handles; only each handle's ``name``
            attribute is used, to re-query the file's current state.

    Raises:
        RuntimeError: If a file stops being "PROCESSING" in any state other
            than "ACTIVE".
    """
    print("Waiting for file processing...")
    for name in (file.name for file in files):
        file = genai.get_file(name)
        while file.state.name == "PROCESSING":
            print(".", end="", flush=True)
            time.sleep(TIMEOUT_IN_SECONDS)
            file = genai.get_file(name)
        if file.state.name != "ACTIVE":
            # RuntimeError is still caught by callers using `except Exception`;
            # the final state is included to make failures diagnosable.
            raise RuntimeError(
                f"File {file.name} failed to process (state: {file.state.name})"
            )
    print("...all files ready")
    print()

In [None]:
# Create the Markdown model
# Cleans up raw PDF text and emits Markdown; none of the listed harm
# categories is blocked (threshold BLOCK_NONE for each).
md_model = genai.GenerativeModel(
    model_name="gemini-2.0-pro-exp-02-05",
    safety_settings=[
        gtypes.SafetySettingDict(
            category=harm_category,
            threshold=gtypes.HarmBlockThreshold.BLOCK_NONE,
        )
        for harm_category in (
            gtypes.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            gtypes.HarmCategory.HARM_CATEGORY_HARASSMENT,
            gtypes.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            gtypes.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        )
    ],
    generation_config=gtypes.GenerationConfig(
        max_output_tokens=8192,
        temperature=1.0,
        top_p=0.95,
        top_k=64,
        response_mime_type="text/plain",
    ),
    system_instruction="""You are a specialized Markdown converter designed to process and repair text files. Your task is to take potentially corrupted text, clean it, and convert it into well-formatted Markdown.  You should perform the following steps in order:

1.  **Text Extraction and Preservation:**  Extract the complete text from the input.  Crucially, preserve the original character order and any unusual characters *even if they appear to be errors*.  Do not attempt to "correct" anything at this stage.

2.  **De-hyphenation:** Carefully de-hyphenate the raw text, paying close attention to English hyphenation rules.  This involves:
    *   Identifying hyphens at the end of lines.
    *   Determining if the hyphen represents a true word break (requiring removal and joining of the word parts) or a hyphenated word (requiring the hyphen to be retained).  Use English linguistic rules and context to make this determination. *Prioritize accurately joining words that were split across lines.*  Be conservative; if unsure, it's better to leave a hyphen than to incorrectly join unrelated words.

3.  **Corruption Repair (English-Specific):** This is the most complex step and requires a deep understanding of English orthography and common OCR/scanning errors. Address the following types of corruption:
    *   **Typos:** Correct common English typographical errors, including incorrect characters, transpositions, and omissions. Use a English spellchecker or language model (internally, if possible) to assist, but *prioritize corrections that are highly likely to be accurate*.  Avoid making speculative changes.
    *   **Spacing Errors:**
        *   **Missing Spaces:** Insert spaces between words where they are missing (e.g., "spacesbetweenwords").
        *   **Extra Spaces:** Remove extraneous spaces within words (e.g., "w o r d") or between characters.
        *   **Incorrect Spaces around Punctuation:** Ensure correct spacing around English punctuation marks (periods, commas, question marks, etc.).
    *   **Paragraph Reconstruction:**  Identify and correct incorrect paragraph breaks caused by page endings or scanning artifacts.  Use contextual clues (sentence structure, topic shifts) to determine true paragraph boundaries.  Combine fragments of sentences that were split across lines.
    *   **Character Corruption:** Correct corrupted UTF-8 or other encoding issues. The goal is to correct the text to accurate English spelling.

4.  **Markdown Conversion:** Convert the cleaned and corrected text to Markdown, adhering to the following rules:
    *   **Headings:**  Identify potential headings based on context and capitalization. Use appropriate Markdown heading levels (`#`, `##`, `###`, etc.).  Be conservative; if unsure, prefer a lower heading level or plain text.
    *   **Lists:**  Identify and format bulleted or numbered lists. Look for common list indicators (e.g., numbers, bullets, dashes).
    *   **Paragraphs:** Separate paragraphs with *two* newline characters (`\n\n`). This is crucial for proper Markdown rendering.
    *   **Other Elements:** If you confidently identify other Markdown elements (e.g., bold, italics, blockquotes), format them appropriately. However, *prioritize accuracy over completeness*.  It's better to have plain text than incorrect Markdown.
    * **Do not add any elements not supported in Markdown**

5. **Output**
    *   **Markdown Only:** Output *only* the resulting Markdown text. Do not include any explanations, comments, or additional information.
    * **No additional changes:** Do not provide additional output.""",
)

In [None]:
# Schema helper for the JSON model
def _life_path_schema() -> content.Schema:
    """Build the response schema describing one life path as structured data.

    The top-level object requires eight keys:

    * ``key_traits``, ``challenges``, ``opportunities`` and ``famous_people``
      are arrays of strings;
    * ``health``, ``relationships`` and ``talents_work_finances`` are objects
      whose required ``positive``/``negative``/``advice`` keys are arrays of
      strings;
    * ``fulfilling_destiny`` is an object whose required
      ``guidelines``/``questions`` keys are arrays of strings.
    """

    def string_array() -> content.Schema:
        # ARRAY whose items are plain strings.
        return content.Schema(
            type=content.Type.ARRAY,
            items=content.Schema(type=content.Type.STRING),
        )

    def object_of_string_arrays(*keys: str) -> content.Schema:
        # OBJECT in which every listed key is a required array of strings.
        return content.Schema(
            type=content.Type.OBJECT,
            enum=[],
            required=list(keys),
            properties={key: string_array() for key in keys},
        )

    aspects = ("positive", "negative", "advice")
    properties = {
        "key_traits": string_array(),
        "challenges": string_array(),
        "opportunities": string_array(),
        "health": object_of_string_arrays(*aspects),
        "relationships": object_of_string_arrays(*aspects),
        "talents_work_finances": object_of_string_arrays(*aspects),
        "famous_people": string_array(),
        "fulfilling_destiny": object_of_string_arrays("guidelines", "questions"),
    }
    return content.Schema(
        type=content.Type.OBJECT,
        enum=[],
        # Every top-level key is required, in declaration order.
        required=list(properties),
        properties=properties,
    )


# Create the JSON model
# Emits JSON constrained to the life-path schema; none of the listed harm
# categories is blocked (threshold BLOCK_NONE for each).
json_model = genai.GenerativeModel(
    model_name="gemini-2.0-pro-exp-02-05",
    safety_settings=[
        gtypes.SafetySettingDict(
            category=harm_category,
            threshold=gtypes.HarmBlockThreshold.BLOCK_NONE,
        )
        for harm_category in (
            gtypes.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            gtypes.HarmCategory.HARM_CATEGORY_HARASSMENT,
            gtypes.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            gtypes.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        )
    ],
    generation_config=gtypes.GenerationConfig(
        max_output_tokens=8192,
        temperature=1.0,
        top_p=0.95,
        top_k=64,
        response_mime_type="application/json",
        response_schema=_life_path_schema(),
    ),
    system_instruction="You are a Markdown to JSON converter.  Your task is to parse the provided Markdown input and generate a JSON object representing the key information and structure.  Focus on extracting important entities, relationships, and overall document structure.  Prioritize accuracy and a logical JSON schema.",
)

In [None]:
# Create the summarization model
# Produces a Markdown bulleted summary in English; none of the listed harm
# categories is blocked (threshold BLOCK_NONE for each).
summ_model = genai.GenerativeModel(
    model_name="gemini-2.0-pro-exp-02-05",
    safety_settings=[
        gtypes.SafetySettingDict(
            category=harm_category,
            threshold=gtypes.HarmBlockThreshold.BLOCK_NONE,
        )
        for harm_category in (
            gtypes.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            gtypes.HarmCategory.HARM_CATEGORY_HARASSMENT,
            gtypes.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            gtypes.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        )
    ],
    generation_config=gtypes.GenerationConfig(
        max_output_tokens=8192,
        temperature=1.0,
        top_p=0.95,
        top_k=64,
        response_mime_type="text/plain",
    ),
    # The target-audience item names Millman's system: the prompt is about
    # Dan Millman's book, and the previous wording named an unrelated author.
    system_instruction="""You are a specialist in summarizing personality typing systems, particularly those similar to and including Dan Millman's "The Life You Were Born to Live".  Your task is to create a comprehensive summary of the provided text (which will be pasted below this prompt).  The summary should be in the form of a bulleted list.

**Specific Instructions:**

1.  **Target Audience:** Assume the reader has *some* familiarity with the general concept of personality typing (e.g., Myers-Briggs, Enneagram) but may be new to Millman's system or similar concepts.
2.  **Focus:** Identify and summarize the *key concepts, principles, and terminology* presented in the text.  Don't get bogged down in minor details; prioritize the core ideas.  If the text describes specific types, profiles, or categories, clearly outline their defining characteristics.
3.  **Language:**  The summary must be written in **English**.
4.  **Format:** Use Markdown for the bulleted list. Each bullet point should be concise but informative.  Use nested bullet points (indentation) to show hierarchical relationships between concepts where appropriate.  For example, if a main concept has several sub-components, list the main concept as a top-level bullet and the sub-components as indented bullets beneath it.
5.  **Comprehensiveness:** While concise, the summary should be comprehensive enough that someone reading it would gain a solid understanding of the main ideas presented in the original text. Avoid overly simplistic or vague summaries.
6.  **Objectivity:** Maintain a neutral and objective tone.  Do not express personal opinions about the validity or usefulness of the system being described.  Present the information as it is presented in the text.
7.  **Terminology:** Pay close attention to any specialized terminology used in the text.
8. **Contextualization (if applicable):** If the provided text refers to other personality systems or authors, briefly note these connections in the summary *if they are essential to understanding the main points*.

**Example Structure (Markdown):**

```markdown
- **Main Component 1:** Short description.
    - Sub-component 1.1: More detailed description.
    - Sub-component 1.2: More detailed description.
- **Main Component 2:** Short description.
    - Sub-component 2.1: More detailed description.
- **Main Component 3:** ...
```

Do not provide additional output.""",
)

In [None]:
# Create the Markdown translation model
# Translates English Markdown into Turkish; none of the listed harm
# categories is blocked (threshold BLOCK_NONE for each).
trans_md_model = genai.GenerativeModel(
    model_name="gemini-2.0-pro-exp-02-05",
    safety_settings=[
        gtypes.SafetySettingDict(
            category=harm_category,
            threshold=gtypes.HarmBlockThreshold.BLOCK_NONE,
        )
        for harm_category in (
            gtypes.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            gtypes.HarmCategory.HARM_CATEGORY_HARASSMENT,
            gtypes.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            gtypes.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        )
    ],
    generation_config=gtypes.GenerationConfig(
        max_output_tokens=8192,
        temperature=1.0,
        top_p=0.95,
        top_k=64,
        response_mime_type="text/plain",
    ),
    system_instruction="You are a highly skilled English-to-Turkish translator with expertise in Markdown. Translate the following English text into idiomatic and accurate Turkish, preserving the original Markdown formatting. Pay close attention to maintaining the tone and style of the original text.",
)

In [None]:
# Schema helper for the JSON translation model
def _life_path_schema() -> content.Schema:
    """Build the response schema describing one life path as structured data.

    The top-level object requires eight keys:

    * ``key_traits``, ``challenges``, ``opportunities`` and ``famous_people``
      are arrays of strings;
    * ``health``, ``relationships`` and ``talents_work_finances`` are objects
      whose required ``positive``/``negative``/``advice`` keys are arrays of
      strings;
    * ``fulfilling_destiny`` is an object whose required
      ``guidelines``/``questions`` keys are arrays of strings.
    """

    def string_array() -> content.Schema:
        # ARRAY whose items are plain strings.
        return content.Schema(
            type=content.Type.ARRAY,
            items=content.Schema(type=content.Type.STRING),
        )

    def object_of_string_arrays(*keys: str) -> content.Schema:
        # OBJECT in which every listed key is a required array of strings.
        return content.Schema(
            type=content.Type.OBJECT,
            enum=[],
            required=list(keys),
            properties={key: string_array() for key in keys},
        )

    aspects = ("positive", "negative", "advice")
    properties = {
        "key_traits": string_array(),
        "challenges": string_array(),
        "opportunities": string_array(),
        "health": object_of_string_arrays(*aspects),
        "relationships": object_of_string_arrays(*aspects),
        "talents_work_finances": object_of_string_arrays(*aspects),
        "famous_people": string_array(),
        "fulfilling_destiny": object_of_string_arrays("guidelines", "questions"),
    }
    return content.Schema(
        type=content.Type.OBJECT,
        enum=[],
        # Every top-level key is required, in declaration order.
        required=list(properties),
        properties=properties,
    )


# Create the JSON translation model
# Translates the structured JSON into Turkish under the same schema; none of
# the listed harm categories is blocked (threshold BLOCK_NONE for each).
trans_json_model = genai.GenerativeModel(
    model_name="gemini-2.0-pro-exp-02-05",
    safety_settings=[
        gtypes.SafetySettingDict(
            category=harm_category,
            threshold=gtypes.HarmBlockThreshold.BLOCK_NONE,
        )
        for harm_category in (
            gtypes.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            gtypes.HarmCategory.HARM_CATEGORY_HARASSMENT,
            gtypes.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            gtypes.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        )
    ],
    generation_config=gtypes.GenerationConfig(
        max_output_tokens=8192,
        temperature=1.0,
        top_p=0.95,
        top_k=64,
        response_mime_type="application/json",
        response_schema=_life_path_schema(),
    ),
    system_instruction="You are a highly skilled English-to-Turkish translator with expertise in JSON. Translate the following English text into idiomatic and accurate Turkish, preserving the original JSON formatting. Pay close attention to maintaining the tone and style of the original text.",
)

In [None]:
# Schema helper for the JSON extend model
def _life_path_schema() -> content.Schema:
    """Build the response schema describing one life path as structured data.

    The top-level object requires eight keys:

    * ``key_traits``, ``challenges``, ``opportunities`` and ``famous_people``
      are arrays of strings;
    * ``health``, ``relationships`` and ``talents_work_finances`` are objects
      whose required ``positive``/``negative``/``advice`` keys are arrays of
      strings;
    * ``fulfilling_destiny`` is an object whose required
      ``guidelines``/``questions`` keys are arrays of strings.
    """

    def string_array() -> content.Schema:
        # ARRAY whose items are plain strings.
        return content.Schema(
            type=content.Type.ARRAY,
            items=content.Schema(type=content.Type.STRING),
        )

    def object_of_string_arrays(*keys: str) -> content.Schema:
        # OBJECT in which every listed key is a required array of strings.
        return content.Schema(
            type=content.Type.OBJECT,
            enum=[],
            required=list(keys),
            properties={key: string_array() for key in keys},
        )

    aspects = ("positive", "negative", "advice")
    properties = {
        "key_traits": string_array(),
        "challenges": string_array(),
        "opportunities": string_array(),
        "health": object_of_string_arrays(*aspects),
        "relationships": object_of_string_arrays(*aspects),
        "talents_work_finances": object_of_string_arrays(*aspects),
        "famous_people": string_array(),
        "fulfilling_destiny": object_of_string_arrays("guidelines", "questions"),
    }
    return content.Schema(
        type=content.Type.OBJECT,
        enum=[],
        # Every top-level key is required, in declaration order.
        required=list(properties),
        properties=properties,
    )


# Create the JSON extend model
# Expands each string of the input JSON into longer Turkish text under the
# same schema; none of the listed harm categories is blocked (BLOCK_NONE).
json_extend_model = genai.GenerativeModel(
    model_name="gemini-2.0-pro-exp-02-05",
    safety_settings=[
        gtypes.SafetySettingDict(
            category=harm_category,
            threshold=gtypes.HarmBlockThreshold.BLOCK_NONE,
        )
        for harm_category in (
            gtypes.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            gtypes.HarmCategory.HARM_CATEGORY_HARASSMENT,
            gtypes.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            gtypes.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        )
    ],
    generation_config=gtypes.GenerationConfig(
        max_output_tokens=8192,
        temperature=1.0,
        top_p=0.95,
        top_k=64,
        response_mime_type="application/json",
        response_schema=_life_path_schema(),
    ),
    system_instruction="""You are a professional summarization and text expansion specialist. Your primary task is to take information provided in a JSON format, and elaborate upon it, creating more detailed and nuanced text in Turkish. You should focus on natural, flowing language, as if a native Turkish speaker were explaining the concepts to a friend or colleague. Avoid overly formal or technical language unless the context specifically requires it (which will be indicated in the JSON if necessary).

**Specific Instructions:**

1.  **Input:** You will receive a JSON object.  The specific structure may vary, but it will always contain keys whose values are *strings* representing sentences or short phrases that need to be expanded.  These are the core ideas you will work with.  There may be additional contextual information within the JSON, also.
2.  **Expansion & Elaboration:** For each of these core strings (sentences/phrases):
    *   **Extend:**  Expand the sentence or phrase into one or more *paragraphs*. The goal is not just to make the text longer, but to add relevant details, examples, implications, or related information.  Think about answering the "who, what, where, when, why, and how" related to the original idea.
    *   **Rewrite for Detail:**  Rephrase the original idea with more descriptive language.  Don't just repeat the same concept; provide greater clarity and depth.  Imagine you're explaining the concept to someone who has very little background knowledge.
3.  **Turkish Language:**  All output *must* be in grammatically correct and natural-sounding Turkish. Use appropriate idioms and expressions where they fit naturally.
4.  **Natural Tone (Default):**  Use a natural, conversational tone. Imagine you are explaining the concepts to a friend or colleague in a relaxed setting.""",
)

## Stage 1: Partitioning the book by its chapters and life paths

The book "The Life You Were Born to Live" is organized into chapters, and each chapter describes one or more "life paths" (identified by a pair of numbers) together with their traits. Here we map each page range of the book to the life path(s) it covers, and then split the book along these page ranges.

Partitioning matters because it lets us give the LLMs only the pages relevant to a given life path, which reduces the chance of hallucination.

In [None]:
# Maps a (from_page, to_page) range of BOOK to the life paths described on
# those pages. The partitioning step passes each range unchanged to pymupdf's
# insert_pdf and writes one "<a>_<b>.pdf" file per (a, b) life-path tuple, so
# life paths listed together under one range receive identical pages.
# NOTE(review): the page numbers are presumably pymupdf page indices rather
# than the numbers printed in the book, and consecutive ranges often share a
# boundary page -- confirm both are intended.
PAGE_RANGE_TO_LIFE_PATHS = {
    (130, 136): [(19, 10)],
    (136, 142): [(28, 10)],
    (142, 148): [(37, 10)],
    (148, 153): [(46, 10)],
    (154, 161): [(29, 11)],
    (161, 167): [(38, 11)],
    (167, 173): [(47, 11)],
    (174, 180): [(20, 2)],
    (181, 188): [(39, 12)],
    (188, 194): [(48, 12)],
    (195, 202): [(30, 3)],
    (202, 207): [(21, 3), (12, 3)],
    (209, 215): [(40, 4)],
    (215, 221): [(22, 4)],
    (221, 228): [(31, 4), (13, 4)],
    (229, 236): [(32, 5), (23, 5)],
    (236, 242): [(41, 5), (14, 5)],
    (243, 249): [(15, 6)],
    (249, 256): [(24, 6), (42, 6)],
    (256, 263): [(33, 6)],
    (264, 269): [(16, 7)],
    (269, 276): [(25, 7)],
    (276, 282): [(34, 7), (43, 7)],
    (283, 289): [(17, 8)],
    (289, 296): [(26, 8)],
    (296, 303): [(35, 8)],
    (303, 309): [(44, 8)],
    (310, 316): [(18, 9)],
    (316, 323): [(27, 9)],
    (323, 331): [(36, 9)],
    (331, 338): [(45, 9)],
}

In [None]:
# Write one PDF per life path into PDF_EN; life paths that share a page range
# each get their own copy of those pages.
with pymupdf.open(BOOK) as source_pdf:
    partitions = (
        (first_page, last_page, major, minor)
        for (first_page, last_page), paths in PAGE_RANGE_TO_LIFE_PATHS.items()
        for major, minor in paths
    )
    for first_page, last_page, major, minor in partitions:
        with pymupdf.open() as chunk_pdf:
            chunk_pdf.insert_pdf(
                source_pdf, from_page=first_page, to_page=last_page
            )
            chunk_pdf.save(f"{PDF_EN}/{major}_{minor}.pdf")

## Stage 2: Convert PDF partitions to Markdown files

Working with raw textual data is almost always better than working with binary data formats like PDF. That is why we're converting each partition to the Markdown format.

In [None]:
def pdf_to_md(pdf_path: str, md_path: str) -> None:
    """Convert one PDF partition to Markdown with Gemini.

    The PDF is uploaded, `md_model` is asked to convert it in a fresh chat
    whose history contains the uploaded file, and the reply text is written to
    `md_path` unchanged.

    Args:
        pdf_path: Path of the PDF file to upload.
        md_path: Path of the Markdown file to (over)write with the reply.

    Raises:
        Exception: Anything raised by the upload, the wait, the model call,
            reading the reply text, or the file write propagates to the caller.
    """
    uploaded = upload_to_gemini(pdf_path, mime_type="application/pdf")
    try:
        wait_for_files_active([uploaded])

        chat_session = md_model.start_chat(
            history=[{"role": "user", "parts": [uploaded]}]
        )
        response = chat_session.send_message("Convert this PDF file to Markdown.")
    finally:
        # Remove the remote copy even when processing or the model call fails;
        # otherwise every failed run leaves an uploaded file behind.
        uploaded.delete()

    # Resolve the reply text before opening the output: `response.text` can
    # raise when the reply has no usable content, and opening with "w" first
    # would leave an empty (or truncated) file behind.
    markdown = response.text

    with open(md_path, "w", encoding="UTF-8") as md_f:
        md_f.write(markdown)

In [None]:
# Convert every PDF partition to Markdown, in sorted name order. Directory
# entries whose name contains ".keepdir" are ignored.
pdf_names = [name for name in sorted(os.listdir(PDF_EN)) if ".keepdir" not in name]

for pdf_name in pdf_names:
    pdf_path = f"{PDF_EN}/{pdf_name}"
    md_path = f"{MD_EN}/{pdf_name.split('.')[0]}.md"

    print(f"[PROCESSING] {pdf_path}...")
    try:
        pdf_to_md(pdf_path, md_path)
    except Exception as err:
        # A failed partition is reported and skipped so the rest still run.
        print(f"[FAILED] {pdf_path}! Reason: {err}")
    else:
        print(f"[PROCESSED] {pdf_path}!")

    # Pause between requests (presumably to stay within API rate limits).
    time.sleep(TIMEOUT_IN_SECONDS)

## Stage 3: Extracting useful structured data using JSON objects

Structured key data points help keep the summaries coherent and concise, so in this stage we extract them from each Markdown partition as JSON.

In [None]:
def md_to_json(md_path: str, json_path: str) -> None:
    """Ask `json_model` to turn a Markdown partition into JSON text.

    The whole Markdown file is sent as one message in a fresh chat and the
    reply text is written to `json_path` as-is; it is not parsed or validated
    here.

    Args:
        md_path: Markdown file to read (UTF-8).
        json_path: File to (over)write with the model's reply.

    Raises:
        OSError: If either file cannot be opened.
        Exception: Errors from the model call or from reading the reply text
            propagate to the caller.
    """
    chat_session = json_model.start_chat(history=[])

    with open(md_path, "r", encoding="UTF-8") as f:
        markdown = f.read()

    response = chat_session.send_message(markdown)

    # Resolve the reply text before opening the output: `response.text` can
    # raise when the reply has no usable content, and opening with "w" first
    # would leave an empty (or truncated) file that later stages pick up.
    json_text = response.text

    with open(json_path, "w", encoding="UTF-8") as f:
        f.write(json_text)

In [None]:
# Build a JSON file for every Markdown partition.
# Entries whose name contains ".keepdir" are skipped, matching the
# PDF -> Markdown cell, so a non-content file is never sent to the model.
# The listing is sorted so every run processes files in the same order.
for md_path, json_path in [
    (f"{MD_EN}/{md}", f"{JSON_EN}/{md.split('.')[0]}.json")
    for md in sorted(os.listdir(MD_EN))
    if ".keepdir" not in md
]:
    print(f"[PROCESSING] {md_path}...")

    try:
        md_to_json(md_path, json_path)
    except Exception as err:
        # A failed file is reported and skipped so the rest still run.
        print(f"[FAILED] {md_path}! Reason: {err}.")
    else:
        print(f"[PROCESSED] {md_path}!")

    # Pause between requests (presumably to stay within API rate limits).
    time.sleep(TIMEOUT_IN_SECONDS)

## Stage 4: Summarization

Apparently the full text is too long -- so we summarize it.

In [None]:
def summarize(md_path: str, json_path: str, summarization_path: str) -> None:
    """Summarize one partition with `summ_model`.

    The prompt is the Markdown text followed by the JSON text wrapped in a
    ```json fence. It is sent as one message in a fresh chat and the reply
    text is written to `summarization_path` unchanged.

    Args:
        md_path: Markdown partition to read (UTF-8).
        json_path: JSON file for the same partition (UTF-8).
        summarization_path: File to (over)write with the summary.

    Raises:
        OSError: If any of the files cannot be opened.
        Exception: Errors from the model call or from reading the reply text
            propagate to the caller.
    """
    chat_session = summ_model.start_chat(history=[])

    with open(md_path, "r", encoding="UTF-8") as f:
        prompt = f.read()

    with open(json_path, "r", encoding="UTF-8") as f:
        # NOTE(review): there is no newline before the closing fence, so the
        # fence only starts its own line when the JSON file ends with "\n".
        # Left as-is to keep the prompt unchanged -- confirm whether intended.
        prompt += f"\n\n```json\n{f.read()}```"

    response = chat_session.send_message(prompt)

    # Resolve the reply text before opening the output: `response.text` can
    # raise when the reply has no usable content, and opening with "w" first
    # would leave an empty (or truncated) file that later stages pick up.
    summary = response.text

    with open(summarization_path, "w", encoding="UTF-8") as f:
        f.write(summary)

In [None]:
# Summarize every Markdown partition together with its JSON file.
# Entries whose name contains ".keepdir" are skipped, matching the
# PDF -> Markdown cell. The stem is taken with split('.')[0], the same rule
# the JSON extraction cell uses to name its outputs, so the lookup always
# targets the file that cell wrote.
for md_path, json_path, summ_path in [
    (
        f"{MD_EN}/{md}",
        f"{JSON_EN}/{md.split('.')[0]}.json",
        f"{SUMM_EN}/{md.split('.')[0]}.md",
    )
    for md in sorted(os.listdir(MD_EN))
    if ".keepdir" not in md
]:
    print(f"[PROCESSING] {md_path}...")

    try:
        summarize(md_path, json_path, summ_path)
    except Exception as err:
        # A failed file is reported and skipped so the rest still run.
        print(f"[FAILED] {md_path}! Reason: {err}.")
    else:
        print(f"[PROCESSED] {md_path}!")

    # Pause between requests (presumably to stay within API rate limits).
    time.sleep(TIMEOUT_IN_SECONDS)

## Stage 5: Translation

Since we're working with a Turkish audience, we need to translate our data into Turkish.

In [None]:
def translate_md(en_path: str, tr_path: str) -> None:
    """Translate one Markdown file with `trans_md_model`.

    The file's full text is sent as one message in a fresh chat and the reply
    text is written to `tr_path` unchanged.

    Args:
        en_path: Source Markdown file to read (UTF-8).
        tr_path: File to (over)write with the model's reply.

    Raises:
        OSError: If either file cannot be opened.
        Exception: Errors from the model call or from reading the reply text
            propagate to the caller.
    """
    chat_session = trans_md_model.start_chat(history=[])

    with open(en_path, "r", encoding="UTF-8") as f:
        en_content = f.read()

    response = chat_session.send_message(en_content)

    # Resolve the reply text before opening the output: `response.text` can
    # raise when the reply has no usable content, and opening with "w" first
    # would leave an empty (or truncated) file behind.
    tr_content = response.text

    with open(tr_path, "w", encoding="UTF-8") as f:
        f.write(tr_content)

In [None]:
def translate_json(en_path: str, tr_path: str) -> None:
    """Translate one JSON file with `trans_json_model`.

    The file's full text is sent as one message in a fresh chat and the reply
    text is written to `tr_path` as-is; it is not parsed or validated here.

    Args:
        en_path: Source JSON file to read (UTF-8).
        tr_path: File to (over)write with the model's reply.

    Raises:
        OSError: If either file cannot be opened.
        Exception: Errors from the model call or from reading the reply text
            propagate to the caller.
    """
    chat_session = trans_json_model.start_chat(history=[])

    with open(en_path, "r", encoding="UTF-8") as f:
        en_content = f.read()

    response = chat_session.send_message(en_content)

    # Resolve the reply text before opening the output: `response.text` can
    # raise when the reply has no usable content, and opening with "w" first
    # would leave an empty (or truncated) file that later stages pick up.
    tr_content = response.text

    with open(tr_path, "w", encoding="UTF-8") as f:
        f.write(tr_content)

### Stage 5.1: Translating full text Markdown

In [None]:
# Translate every full-text Markdown file from MD_EN into MD_TR, keeping the
# file name. Entries whose name contains ".keepdir" are skipped, matching the
# PDF -> Markdown cell, and the listing is sorted for a stable order.
for en_path, tr_path in [
    (f"{MD_EN}/{md}", f"{MD_TR}/{md}")
    for md in sorted(os.listdir(MD_EN))
    if ".keepdir" not in md
]:
    print(f"[PROCESSING] {en_path}...")

    try:
        translate_md(en_path, tr_path)
    except Exception as err:
        # Report the file this iteration handled. `md_path` is only a leftover
        # global from an earlier cell and would name the wrong file (or raise
        # NameError if that cell was never run).
        print(f"[FAILED] {en_path}! Reason: {err}.")
    else:
        print(f"[PROCESSED] {en_path}!")

    # Pause between requests (presumably to stay within API rate limits).
    time.sleep(TIMEOUT_IN_SECONDS)

### Stage 5.2: Translating summarized Markdown

In [None]:
# Translate every summary from SUMM_EN into SUMM_TR, keeping the file name.
# Entries whose name contains ".keepdir" are skipped, matching the
# PDF -> Markdown cell, and the listing is sorted for a stable order.
for en_path, tr_path in [
    (f"{SUMM_EN}/{md}", f"{SUMM_TR}/{md}")
    for md in sorted(os.listdir(SUMM_EN))
    if ".keepdir" not in md
]:
    print(f"[PROCESSING] {en_path}...")

    try:
        translate_md(en_path, tr_path)
    except Exception as err:
        # Report the file this iteration handled. `md_path` is only a leftover
        # global from an earlier cell and would name the wrong file (or raise
        # NameError if that cell was never run).
        print(f"[FAILED] {en_path}! Reason: {err}.")
    else:
        print(f"[PROCESSED] {en_path}!")

    # Pause between requests (presumably to stay within API rate limits).
    time.sleep(TIMEOUT_IN_SECONDS)

### Stage 5.3: Translating JSON

In [None]:
# Translate every JSON file from JSON_EN into JSON_TR, keeping the file name.
# Entries whose name contains ".keepdir" are skipped, matching the
# PDF -> Markdown cell, and the listing is sorted for a stable order.
for en_path, tr_path in [
    (f"{JSON_EN}/{json_name}", f"{JSON_TR}/{json_name}")
    for json_name in sorted(os.listdir(JSON_EN))
    if ".keepdir" not in json_name
]:
    print(f"[PROCESSING] {en_path}...")

    try:
        translate_json(en_path, tr_path)
    except Exception as err:
        # Report the file this iteration handled. `md_path` is only a leftover
        # global from an earlier cell and would name the wrong file (or raise
        # NameError if that cell was never run).
        print(f"[FAILED] {en_path}! Reason: {err}.")
    else:
        print(f"[PROCESSED] {en_path}!")

    # Pause between requests (presumably to stay within API rate limits).
    time.sleep(TIMEOUT_IN_SECONDS)

## Stage 6: Extending JSON

Unfortunately the book doesn't have enough data for us to work with. This is why we let the LLM get creative and extend our JSON fields with very similar values.

In [None]:
def extend_json(json_path: str, extended_json_path: str) -> None:
    """Ask `json_extend_model` to extend one JSON file.

    The file's text is wrapped in a ```json fence, sent as one message in a
    fresh chat, and the reply text is written to `extended_json_path` as-is;
    it is not parsed or validated here.

    Args:
        json_path: JSON file to read (UTF-8).
        extended_json_path: File to (over)write with the model's reply.

    Raises:
        OSError: If either file cannot be opened.
        Exception: Errors from the model call or from reading the reply text
            propagate to the caller.
    """
    chat_session = json_extend_model.start_chat(history=[])

    with open(json_path, "r", encoding="UTF-8") as f:
        json_data = f.read()

    response = chat_session.send_message(f"```json\n{json_data}\n```")

    # Resolve the reply text before opening the output: `response.text` can
    # raise when the reply has no usable content, and opening with "w" first
    # would leave an empty (or truncated) file behind.
    extended_json = response.text

    with open(extended_json_path, "w", encoding="UTF-8") as f:
        f.write(extended_json)

In [None]:
# Extend every JSON file from JSON_TR into JSON_TR_EXT, keeping the file name.
# Entries whose name contains ".keepdir" are skipped, matching the
# PDF -> Markdown cell, so a non-content file is never sent to the model.
for json_path, json_extend_path in [
    (f"{JSON_TR}/{json_name}", f"{JSON_TR_EXT}/{json_name}")
    for json_name in sorted(os.listdir(JSON_TR))
    if ".keepdir" not in json_name
]:
    print(f"[PROCESSING] {json_path}...")

    try:
        extend_json(json_path, json_extend_path)
    except Exception as err:
        # A failed file is reported and skipped so the rest still run.
        print(f"[FAILED] {json_path}! Reason: {err}.")
    else:
        print(f"[PROCESSED] {json_path}!")

    # Pause between requests (presumably to stay within API rate limits).
    time.sleep(TIMEOUT_IN_SECONDS)