# Data Processing Pipeline for [Human Pin Code](https://humanpincode.com/) by *Douglas Forbes*

In [None]:
import os
import time

import pymupdf
import google.generativeai as genai

import google.generativeai.types as gtypes

## Stage 0: Preparations

We need to initialize several constants, configure the Google Gemini client, and define the helper functions and models used in the later stages.

In [49]:
# Constants
# Root folder for every artifact the pipeline produces
# (presumably "tr" stands for Turkish -- confirm).
DIR_TR = "./tr"
# Per-section PDF partitions cut out of the book (Stage 1 output).
PDF_TR = f"{DIR_TR}/PDFs"
# Markdown conversions of the PDF partitions (Stage 2 output).
MD_TR = f"{DIR_TR}/MDs"
# Summaries generated from the Markdown files (Stage 3 output).
SUMM_TR = f"{DIR_TR}/Summarizations"
# Source PDF of the book that gets partitioned.
BOOK = "./forbes.pdf"

# Pause length in seconds. Used both as the polling interval while waiting for
# uploaded files and as the delay between consecutive model requests.
TIMEOUT_IN_SECONDS = 15

In [None]:
# Setup Google Gemini
# The API key is read from the GEMINI_API_KEY environment variable; if the
# variable is not set, the lookup raises KeyError right here.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

In [None]:
def upload_to_gemini(path, mime_type=None) -> gtypes.file_types.File:
    """Upload a local file to Gemini and return the resulting file handle.

    Args:
        path: Location of the file to upload.
        mime_type: Optional MIME type forwarded to the upload call.

    Returns:
        The file object returned by ``genai.upload_file``.

    See https://ai.google.dev/gemini-api/docs/prompting_with_media
    """
    uploaded = genai.upload_file(path, mime_type=mime_type)
    # Log where the upload ended up so it can be traced in the notebook output.
    print(f"Uploaded file '{uploaded.display_name}' as: {uploaded.uri}")
    return uploaded

In [None]:
def wait_for_files_active(files) -> None:
    """Block until every given Gemini file has left the "PROCESSING" state.

    Uploaded files may need server-side processing before they can be used as
    prompt inputs. Each file is re-fetched by name and its ``state`` inspected;
    while it is still "PROCESSING" the function prints a dot, sleeps for
    ``TIMEOUT_IN_SECONDS`` and checks again.

    This is a plain blocking poll with no upper bound on the total wait time;
    production code should probably use something more sophisticated.

    Args:
        files: Iterable of uploaded file objects exposing a ``name`` attribute.

    Raises:
        Exception: If a file settles in any state other than "ACTIVE".
    """
    print("Waiting for file processing...")
    for uploaded in files:
        file_name = uploaded.name
        while True:
            current = genai.get_file(file_name)
            if current.state.name != "PROCESSING":
                break
            # Still being processed: show progress and poll again later.
            print(".", end="", flush=True)
            time.sleep(TIMEOUT_IN_SECONDS)
        if current.state.name != "ACTIVE":
            raise Exception(f"File {current.name} failed to process")
    print("...all files ready")
    print()

In [None]:
# Create the Markdown model
# This model receives an uploaded PDF partition and is instructed to return
# cleaned-up Markdown only.
md_model = genai.GenerativeModel(
    model_name="gemini-2.0-pro-exp-02-05",
    # Every listed harm category gets the BLOCK_NONE threshold (presumably so
    # that book passages are not rejected by the safety filters -- confirm).
    safety_settings=[
        gtypes.SafetySettingDict(
            category=category,
            threshold=gtypes.HarmBlockThreshold.BLOCK_NONE,
        )
        for category in (
            gtypes.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            gtypes.HarmCategory.HARM_CATEGORY_HARASSMENT,
            gtypes.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            gtypes.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        )
    ],
    generation_config=gtypes.GenerationConfig(
        max_output_tokens=8192,
        temperature=1.0,
        top_p=0.95,
        top_k=64,
        response_mime_type="text/plain",
    ),
    # NOTE: this is a regular (non-raw) string, so the newline escape shown to
    # the model in the "Paragraphs" rule is written as `\\n\\n`; a bare `\n\n`
    # would turn into two real line breaks inside the backticks.
    system_instruction="""You are a specialized Markdown converter designed to process and repair corrupted Turkish text files. Your task is to take potentially corrupted Turkish text, clean it, and convert it into well-formatted Markdown.  You should perform the following steps in order:

1.  **Text Extraction and Preservation:**  Extract the complete text from the input.  Crucially, preserve the original character order and any unusual characters *even if they appear to be errors*.  Do not attempt to "correct" anything at this stage.

2.  **De-hyphenation (Turkish-Specific):** Carefully de-hyphenate the raw text, paying close attention to Turkish hyphenation rules.  This involves:
    *   Identifying hyphens at the end of lines.
    *   Determining if the hyphen represents a true word break (requiring removal and joining of the word parts) or a hyphenated word (requiring the hyphen to be retained).  Use Turkish linguistic rules and context to make this determination. *Prioritize accurately joining words that were split across lines.*  Be conservative; if unsure, it's better to leave a hyphen than to incorrectly join unrelated words.

3.  **Corruption Repair (Turkish-Specific):** This is the most complex step and requires a deep understanding of Turkish orthography and common OCR/scanning errors. Address the following types of corruption:
    *   **Typos:** Correct common Turkish typographical errors, including incorrect characters, transpositions, and omissions. Use a Turkish spellchecker or language model (internally, if possible) to assist, but *prioritize corrections that are highly likely to be accurate*.  Avoid making speculative changes.
    *   **Spacing Errors:**
        *   **Missing Spaces:** Insert spaces between words where they are missing (e.g., "kelimelerarasındaboşluk").
        *   **Extra Spaces:** Remove extraneous spaces within words (e.g., "k e l i m e") or between characters.
        *   **Incorrect Spaces around Punctuation:** Ensure correct spacing around Turkish punctuation marks (periods, commas, question marks, etc.).
    *   **Paragraph Reconstruction:**  Identify and correct incorrect paragraph breaks caused by page endings or scanning artifacts.  Use contextual clues (sentence structure, topic shifts) to determine true paragraph boundaries.  Combine fragments of sentences that were split across lines.
    *   **Character Corruption:** Correct corrupted UTF-8 or other encoding issues. The goal is to correct the text to accurate Turkish spelling.

4.  **Markdown Conversion:** Convert the cleaned and corrected Turkish text to Markdown, adhering to the following rules:
    *   **Headings:**  Identify potential headings based on context and capitalization. Use appropriate Markdown heading levels (`#`, `##`, `###`, etc.).  Be conservative; if unsure, prefer a lower heading level or plain text.
    *   **Lists:**  Identify and format bulleted or numbered lists. Look for common list indicators (e.g., numbers, bullets, dashes).
    *   **Paragraphs:** Separate paragraphs with *two* newline characters (`\\n\\n`). This is crucial for proper Markdown rendering.
    *   **Other Elements:** If you confidently identify other Markdown elements (e.g., bold, italics, blockquotes), format them appropriately. However, *prioritize accuracy over completeness*.  It's better to have plain text than incorrect Markdown.
    * **Do not add any elements not supported in Markdown**

5. **Output**
    *   **Markdown Only:** Output *only* the resulting Markdown text. Do not include any explanations, comments, or additional information.
    * **No additional changes:** Do not provide additional output.""",
)

In [None]:
# Create the summarization model
# This model receives Markdown text and is instructed to answer with a
# Turkish, Markdown-formatted bulleted summary.
# NOTE(review): the prompt below attributes the "Human Design System" to
# Douglas Forbes, whereas this notebook's title calls his system "Human Pin
# Code" -- confirm the wording is intended, since it may steer the model
# toward terminology from a different system.
summ_model = genai.GenerativeModel(
    model_name="gemini-2.0-pro-exp-02-05",
    # Same BLOCK_NONE threshold for each of the four harm categories, in the
    # same order as before.
    safety_settings=[
        gtypes.SafetySettingDict(
            category=harm_category,
            threshold=gtypes.HarmBlockThreshold.BLOCK_NONE,
        )
        for harm_category in (
            gtypes.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            gtypes.HarmCategory.HARM_CATEGORY_HARASSMENT,
            gtypes.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            gtypes.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        )
    ],
    generation_config=gtypes.GenerationConfig(
        max_output_tokens=8192,
        temperature=1.0,
        top_p=0.95,
        top_k=64,
        response_mime_type="text/plain",
    ),
    system_instruction="""You are a specialist in summarizing personality typing systems, particularly those similar to and including Douglas Forbes' "Human Design System" (often referred to as the "Human Pin Code," though this isn't the official name).  Your task is to create a comprehensive summary of the provided text (which will be pasted below this prompt).  The summary should be in the form of a bulleted list.

**Specific Instructions:**

1.  **Target Audience:** Assume the reader has *some* familiarity with the general concept of personality typing (e.g., Myers-Briggs, Enneagram) but may be new to Forbes' system or similar concepts.
2.  **Focus:** Identify and summarize the *key concepts, principles, and terminology* presented in the text.  Don't get bogged down in minor details; prioritize the core ideas.  If the text describes specific types, profiles, or categories, clearly outline their defining characteristics.
3.  **Language:**  The summary must be written in **Turkish**.
4.  **Format:** Use Markdown for the bulleted list. Each bullet point should be concise but informative.  Use nested bullet points (indentation) to show hierarchical relationships between concepts where appropriate.  For example, if a main concept has several sub-components, list the main concept as a top-level bullet and the sub-components as indented bullets beneath it.
5.  **Comprehensiveness:** While concise, the summary should be comprehensive enough that someone reading it would gain a solid understanding of the main ideas presented in the original text. Avoid overly simplistic or vague summaries.
6.  **Objectivity:** Maintain a neutral and objective tone.  Do not express personal opinions about the validity or usefulness of the system being described.  Present the information as it is presented in the text.
7.  **Terminology:** Pay close attention to any specialized terminology used in the text.  If the Turkish translation of a term isn't immediately obvious, provide the English term in parentheses after the Turkish term the *first* time it appears.  (e.g., "Enerji Tipi (Energy Type)")
8. **Contextualization (if applicable):** If the provided text refers to other personality systems or authors, briefly note these connections in the summary *if they are essential to understanding the main points*.

**Example Structure (Markdown - Turkish):**

```markdown
- **Ana Kavram 1:** Kısa açıklama.
    - Alt Kavram 1.1: Daha detaylı açıklama.
    - Alt Kavram 1.2: Daha detaylı açıklama.
- **Ana Kavram 2:** Kısa açıklama (İngilizce Terim).
    - Alt Kavram 2.1: Daha detaylı açıklama.
- **Ana Kavram 3:** ...
```

Do not provide additional output.""",
)

## Stage 1: Partition the book by its chapters and digits

The book "Human Pin Code" is organized into chapters, each containing an opening (`initial`) section and the "pin code" digits with their traits. Here, we map every chapter and digit to its page range in the book, written as a `(start, end)` pair whose `end` page is excluded, and split the PDF along these ranges.

Partitioning matters because it lets us give the LLMs only the relevant part of the book, which reduces the chance of hallucination.

In [None]:
# Maps chapter number ("place") -> section key -> (start, end) page range in
# BOOK. Section keys are "initial" (presumably the chapter's introduction) and
# the digits 1-9. The partitioning cell passes `start` as from_page and
# `end - 1` as to_page, so `end` is exclusive and a section's `end` normally
# equals the next section's `start`.
# NOTE(review): presumably these are the page indices pymupdf expects rather
# than the printed page numbers -- confirm against the PDF.
DIGITS_TO_PAGE_RANGE = {
    1: {
        "initial": (59, 61),
        1: (61, 65),
        2: (65, 67),
        3: (67, 69),
        4: (69, 71),
        5: (71, 73),
        6: (73, 76),
        7: (76, 78),
        8: (78, 80),
        9: (80, 83),
    },
    2: {
        "initial": (85, 87),
        1: (87, 89),
        2: (89, 91),
        3: (91, 93),
        4: (93, 96),
        # NOTE(review): pages 96-97 are skipped here; every other pair of
        # neighbouring sections within a chapter is contiguous -- confirm the
        # gap is intentional.
        5: (98, 101),
        6: (101, 103),
        7: (103, 105),
        8: (105, 107),
        9: (107, 109),
    },
    3: {
        "initial": (111, 112),
        1: (112, 115),
        2: (115, 117),
        3: (117, 119),
        4: (119, 122),
        5: (122, 124),
        6: (124, 126),
        7: (126, 129),
        8: (129, 131),
        9: (131, 133),
    },
    4: {
        "initial": (135, 136),
        1: (136, 138),
        2: (138, 140),
        3: (140, 142),
        4: (142, 144),
        5: (144, 146),
        6: (146, 148),
        7: (148, 150),
        8: (150, 152),
        9: (152, 155),
    },
    5: {
        "initial": (157, 159),
        1: (159, 161),
        2: (161, 163),
        3: (163, 165),
        4: (165, 167),
        5: (167, 169),
        6: (169, 171),
        7: (171, 173),
        8: (173, 175),
        9: (175, 177),
    },
    6: {
        "initial": (180, 182),
        1: (182, 184),
        2: (184, 186),
        3: (186, 188),
        4: (188, 191),
        5: (191, 193),
        6: (193, 195),
        7: (195, 198),
        8: (198, 201),
        9: (201, 203),
    },
    7: {
        "initial": (204, 205),
        1: (205, 207),
        2: (207, 209),
        3: (209, 211),
        4: (211, 213),
        5: (213, 214),
        6: (214, 216),
        7: (216, 218),
        8: (218, 220),
        9: (220, 222),
    },
    8: {
        "initial": (224, 226),
        1: (226, 228),
        2: (228, 230),
        3: (230, 232),
        4: (232, 234),
        5: (234, 236),
        6: (236, 238),
        7: (238, 240),
        8: (240, 242),
        9: (242, 244),
    },
    9: {
        "initial": (246, 247),
        1: (247, 248),
        2: (248, 249),
        3: (249, 250),
        4: (250, 251),
        5: (251, 252),
        6: (252, 253),
        7: (253, 254),
        8: (254, 255),
        9: (255, 256),
    },
}

In [None]:
# Split the book into one PDF per (chapter, section) pair, named
# "<place>_<digit>.pdf" inside PDF_TR.

# Create the output folder up front: saving into a directory that does not
# exist yet would fail on the very first partition.
os.makedirs(PDF_TR, exist_ok=True)

with pymupdf.open(BOOK) as forbes_book:
    for place, digits in DIGITS_TO_PAGE_RANGE.items():
        for digit, (start, end) in digits.items():
            # pymupdf.open() without arguments yields a new, empty document
            # that receives only this section's pages.
            with pymupdf.open() as part_pdf:
                # `to_page` is inclusive, while the `end` values in
                # DIGITS_TO_PAGE_RANGE are exclusive, hence `end - 1`.
                part_pdf.insert_pdf(forbes_book, from_page=start, to_page=end - 1)
                part_pdf.save(f"{PDF_TR}/{place}_{digit}.pdf")

## Stage 2: Convert PDF partitions to Markdown files

Working with raw textual data is almost always better than working with binary data formats like PDF. That is why we're converting each partition to the Markdown format.

In [None]:
def pdf_to_md(pdf_path: str, md_path: str) -> None:
    """Convert one PDF partition to Markdown with Gemini and save the result.

    The PDF is uploaded, the function waits until it is ready, and the
    Markdown model is asked to convert it. The uploaded file is removed again
    whether or not the conversion succeeded.

    Args:
        pdf_path: Path of the PDF partition to convert.
        md_path: Path of the Markdown file to write (UTF-8).

    Raises:
        Exception: Whatever the upload, the wait, the model call or reading
            the response text raises; nothing is written to ``md_path`` then.
    """
    uploaded = upload_to_gemini(pdf_path, mime_type="application/pdf")
    try:
        wait_for_files_active([uploaded])

        # The PDF is handed over as the first user turn of the conversation.
        chat_session = md_model.start_chat(
            history=[{"role": "user", "parts": [uploaded]}]
        )
        response = chat_session.send_message("Convert this PDF file to Markdown.")
    finally:
        # Clean up the remote copy even when waiting or generation failed;
        # previously a failure left the uploaded file behind.
        uploaded.delete()

    # Read the text before opening the output file: if the response has no
    # usable text, the access fails here instead of leaving an empty or
    # truncated Markdown file on disk.
    markdown = response.text

    with open(md_path, "w", encoding="UTF-8") as md_f:
        md_f.write(markdown)

In [None]:
# Convert every PDF partition in PDF_TR to a Markdown file of the same base
# name in MD_TR, one request at a time.

# Make sure the target folder exists before the first result is written.
os.makedirs(MD_TR, exist_ok=True)

for pdf in sorted(os.listdir(PDF_TR)):
    # Skip anything that is not a PDF (e.g. ".DS_Store", notebook checkpoint
    # folders); such entries used to be uploaded and got a bogus output name.
    if not pdf.lower().endswith(".pdf"):
        continue

    pdf_path = f"{PDF_TR}/{pdf}"
    md_path = f"{MD_TR}/{os.path.splitext(pdf)[0]}.md"

    print(f"[PROCESSING] {pdf_path}...")
    try:
        pdf_to_md(pdf_path, md_path)
    except Exception as err:
        # Best effort: report the failure and carry on with the next file.
        print(f"[FAILED] {pdf_path}! Reason: {err}")
    else:
        print(f"[PROCESSED] {pdf_path}!")

    # Pause between requests (presumably to stay within API rate limits).
    time.sleep(TIMEOUT_IN_SECONDS)

## Stage 3: Summarization

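The full Markdown text is apparently too long to work with, so we summarize it: each digit's text is combined with the `initial` section of its chapter and condensed by the summarization model into a bulleted summary in Turkish.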

In [None]:
def summarize(initial_path: str, digit_path: str, summarization_path: str) -> None:
    """Summarize a chapter's initial section together with one digit section.

    Both Markdown files are read as UTF-8, joined with a blank line between
    them (initial text first) and sent to the summarization model in a fresh
    chat. The model's text reply is written to ``summarization_path``.

    Args:
        initial_path: Markdown file of the chapter's "initial" section.
        digit_path: Markdown file of the digit section.
        summarization_path: Destination file for the summary (UTF-8).
    """
    # Open both inputs before reading either one.
    with open(initial_path, "r", encoding="UTF-8") as intro_f:
        with open(digit_path, "r", encoding="UTF-8") as body_f:
            intro_text = intro_f.read()
            body_text = body_f.read()
    prompt_text = intro_text + "\n\n" + body_text

    reply = summ_model.start_chat().send_message(prompt_text)

    with open(summarization_path, "w", encoding="UTF-8") as out_f:
        out_f.write(reply.text)

In [None]:
# Summarize every digit Markdown file in MD_TR together with the "initial"
# file of its chapter, writing "<place>_<digit>.md" into SUMM_TR.

# Make sure the target folder exists before the first summary is written.
os.makedirs(SUMM_TR, exist_ok=True)

for md in sorted(os.listdir(MD_TR)):
    # Only digit Markdown files are summarized on their own. Non-Markdown
    # entries are skipped: a name without a dot used to raise IndexError while
    # the work list was being built, aborting the whole cell.
    if not md.lower().endswith(".md") or "initial" in md:
        continue

    # "<place>_<digit>.md" -> stem "<place>_<digit>", place "<place>".
    stem = os.path.splitext(md)[0]
    place = stem.split("_")[0]

    md_path = f"{MD_TR}/{md}"
    initial_path = f"{MD_TR}/{place}_initial.md"
    summ_path = f"{SUMM_TR}/{stem}.md"

    print(f"[PROCESSING] {md_path}...")
    try:
        summarize(initial_path, md_path, summ_path)
    except Exception as err:
        # Best effort: report the failure and carry on with the next file.
        print(f"[FAILED] {md_path}! Reason: {err}.")
    else:
        print(f"[PROCESSED] {md_path}!")

    # Pause between requests (presumably to stay within API rate limits).
    time.sleep(TIMEOUT_IN_SECONDS)