In [9]:
import os
import time
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold

In [10]:
# Authenticate the SDK with the key stored in the GEMINI_API_KEY environment
# variable. A missing variable raises KeyError here rather than on first use.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

In [11]:
def upload_to_gemini(path, mime_type=None):
    """Upload the file at ``path`` to Gemini and return the uploaded file object.

    A one-line confirmation containing the file's display name and URI is
    printed once the upload call returns.

    See https://ai.google.dev/gemini-api/docs/prompting_with_media
    """
    uploaded = genai.upload_file(path, mime_type=mime_type)
    confirmation = f"Uploaded file '{uploaded.display_name}' as: {uploaded.uri}"
    print(confirmation)
    return uploaded

In [12]:
def wait_for_files_active(files, poll_interval=10, timeout=None):
    """Waits for the given files to be active.

    Some files uploaded to the Gemini API need to be processed before they can be
    used as prompt inputs. Each file is re-fetched with ``genai.get_file`` and
    its "state" field is inspected until it is no longer "PROCESSING".

    This is a simple blocking polling loop. By default it polls every 10
    seconds with no time limit; pass ``timeout`` to bound the wait.

    Args:
        files: Iterable of uploaded file objects; only ``.name`` is read.
        poll_interval: Seconds to sleep between two status checks.
        timeout: Maximum number of seconds to wait for each individual file,
            or ``None`` to wait without limit.

    Raises:
        TimeoutError: If a file is still "PROCESSING" once ``timeout`` seconds
            have elapsed for it.
        Exception: If a file ends up in any state other than "ACTIVE".
    """
    print("Waiting for file processing...")
    for name in (file.name for file in files):
        # The deadline is per file, so one slow file cannot eat into the
        # budget of the files checked after it.
        deadline = None if timeout is None else time.monotonic() + timeout
        file = genai.get_file(name)
        while file.state.name == "PROCESSING":
            if deadline is not None and time.monotonic() >= deadline:
                raise TimeoutError(
                    f"File {file.name} was still processing after {timeout} seconds"
                )
            print(".", end="", flush=True)
            time.sleep(poll_interval)
            file = genai.get_file(name)
        if file.state.name != "ACTIVE":
            raise Exception(f"File {file.name} failed to process")
    print("...all files ready")
    print()

In [13]:
# Build the shared Gemini model that performs every PDF-to-Markdown conversion.
model = genai.GenerativeModel(
    model_name="gemini-1.5-pro",
    generation_config=dict(
        temperature=1,
        top_p=0.95,
        top_k=64,
        max_output_tokens=8192,
        response_mime_type="text/plain",
    ),
    # Every listed harm category is given the BLOCK_NONE threshold.
    safety_settings={
        category: HarmBlockThreshold.BLOCK_NONE
        for category in (
            HarmCategory.HARM_CATEGORY_HARASSMENT,
            HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        )
    },
    # Adjacent literals are concatenated into one system prompt string.
    system_instruction=(
        "You are a professional text to Markdown converter.\n\n"
        "1. Extract the text from the document.\n"
        "2. De-hyphenate the raw text.\n"
        "3. Convert the de-hyphenated raw text to Markdown format.\n\n"
        "The Markdown output should have appropriate syntax for the titles, lists and other elements. "
        "There must be a double new line between the paragraphs. "
        "It should have no additional changes. "
        "Do not provide additional output."
    ),
)

In [14]:
def pdf_to_markdown(pdf_path: str, markdown_path: str) -> None:
    """Convert one PDF to Markdown with Gemini and write the result to disk.

    The PDF is uploaded, the function waits until it is ready, and a chat is
    started whose first user turn is the uploaded file. The conversion prompt
    is then sent and the reply text is written to ``markdown_path`` as UTF-8.

    The uploaded file is always deleted from the remote service, including
    when processing or the conversion request fails, so failed runs do not
    leave files behind.

    Args:
        pdf_path: Path of the PDF to convert. The file name up to its first
            "." with "_" replaced by "/" (for example "26_8.pdf" -> "26/8")
            is inserted into the prompt's headings.
        markdown_path: Path of the Markdown file to create or overwrite.

    Raises:
        Exception: Whatever the upload, processing wait, or chat request
            raises is propagated after the uploaded file has been deleted.
    """
    uploaded_pdf = upload_to_gemini(pdf_path, mime_type="application/pdf")

    try:
        wait_for_files_active([uploaded_pdf])

        chat_session = model.start_chat(
            history=[
                {
                    "role": "user",
                    "parts": [
                        uploaded_pdf,
                    ],
                },
            ]
        )

        # "dir/26_8.pdf" -> "26/8": the file name encodes the life-path number.
        life_path = pdf_path.split('/')[-1].split('.')[0].replace('_', '/')

        response = chat_session.send_message(f"Convert this document into Markdown format. Ignore the \"Spiritual Laws\" and \"Deepening Your Understanding\" headings. If there are additional unspecified headings, you should add their content under the most appropriate heading. The final output must contain all of these headings:\n\n## Understanding Life Purpose\n(content goes here)\n\n### Working {life_path} in the Positive\n(content goes here)\n\n### Working {life_path} in the Negative\n(content goes here)\n\n## Life-Path Issues\n### Health\n(content goes here)\n\n### Relationships\n(content goes here)\n\n### Talents, Work, and Finances\n(content goes here)\n\n## Some Well-Known {life_path}s\n(content goes here)\n\n## Keys to Fulfilling Your Destiny\n(content goes here)\n\n### Guidelines and Recommendations\n(content goes here)\n\n### Useful Questions\n(content goes here)")
    finally:
        # Runs on success and on failure, so the remote copy never leaks.
        uploaded_pdf.delete()

    with open(markdown_path, "w", encoding="UTF-8") as f:
        f.write(response.text)

In [18]:
# Pair every PDF in ./PDFs/ with the Markdown file to write in ./MDs/.
# os.listdir returns *all* directory entries in arbitrary order, so the
# listing is restricted to "*.pdf" names (other entries would otherwise be
# uploaded as PDFs) and sorted so that runs are reproducible.
paths = [
    (f"./PDFs/{pdf}", f"./MDs/{pdf.split('.')[0]}.md")
    for pdf in sorted(os.listdir("./PDFs/"))
    if pdf.lower().endswith(".pdf")
]

In [None]:
# Convert the PDFs one at a time. The 60-second pause sits *between*
# conversions (presumably to stay under the API rate limit; confirm), so no
# time is spent sleeping after the last document has been converted.
for index, (pdf_path, md_path) in enumerate(paths):
    if index > 0:
        time.sleep(60)

    print(pdf_path, md_path)
    pdf_to_markdown(pdf_path, md_path)

In [16]:
# Clean-up: delete every file returned by genai.list_files().
# NOTE(review): this is not limited to files uploaded by this notebook.
for stale_file in genai.list_files():
    stale_file.delete()