# Outline

1. Transcribe everything from a given folder on the first pass, as closely as possible, in JSON format: `list[Pages] -> str`
2. Get an LLM to classify it into topics based on the text: `str -> list[Topics]`
3. Transcribe on a topic-level instead: `list[Topics] (images) -> list[Topics] (transcribed)`

# Gemini initialisation and Prep of Notes

In [45]:
from google import genai
import os
from pathlib import Path
from typing import Optional
import PIL.Image

GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") 

class GeminiFiles:
    def __init__(self, response_schema, model_name="gemini-2.0-flash"):
        self.client = genai.Client(api_key=GEMINI_API_KEY)
        self.model_name = model_name
        self.uploaded_files = {}
        self.response_schema = response_schema

    def prompt_with_image(self, prompt: str, image_paths: Path | list[Path]):
        if isinstance(image_paths, Path):
            image_paths = [image_paths]
        
        print(image_paths)

        image_paths = [PIL.Image.open(image_path) for image_path in image_paths]
        contents = [prompt] + image_paths

        response = self.client.models.generate_content(
            model=self.model_name,
            contents=contents,
            config={
                "response_mime_type": "application/json",
                "response_schema": self.response_schema
            }
        )
        return response.parsed

    def query(self, prompt: str):
        model_info = [prompt]

        response = self.client.models.generate_content(
            model=self.model_name,
            contents=model_info,
            config={
                "response_mime_type": "application/json",
                "response_schema": self.response_schema
            }
        )
        return response.parsed



In [2]:
from pathlib import Path

folder_path = "/Users/oscarjuliusadserballe/Google Drive/My Drive/Handwritten Notes"
classes = Path(folder_path)

# Create a dictionary to store folders and their files
folder_files = {}

for root, dirs, files in classes.walk():
    relative_path = Path(root).relative_to(classes)
    
    if str(relative_path) == '.':
        continue
        
    image_files = [f for f in files if f.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp'))]
    
    if image_files:
        folder_files[relative_path] = image_files

# for folder, files in folder_files.items():
#     print(f"\nFolder: {folder}")
#     for file in files:
#         print(f"  - {file}")
#         print(f"folder_path: {folder_path / folder / file}")

In [3]:
# Display the sub-folders (relative paths) for which image files were collected.
folder_files.keys()

dict_keys([PosixPath('Business Strategy and SDGs'), PosixPath('15.455x'), PosixPath('KKV'), PosixPath('Jurisprudence'), PosixPath('Isaiah Berlin'), PosixPath('Measurement'), PosixPath('18.01'), PosixPath('14.750x'), PosixPath('CS'), PosixPath('Opting out of the EU'), PosixPath('Case Studies and Theory Development in the Social Sciences'), PosixPath('6.86x'), PosixPath('15.435x'), PosixPath('6.041'), PosixPath('Epistemology in the Ancient World'), PosixPath('14.740x'), PosixPath('International Economics Notes'), PosixPath('Concepts of Security'), PosixPath('IPE'), PosixPath('18.03'), PosixPath('MGMT2670'), PosixPath('CPE'), PosixPath('China under Mao'), PosixPath('Principles of Sustainable Finance'), PosixPath('HW 6'), PosixPath('18.6501x'), PosixPath('Financial Accounting'), PosixPath('Mathematical Thinking and Logic')])

In [4]:
# Sub-folder (relative to folder_path) that the cells below use for a trial run.
test_folder = Path('15.455x')

## 1. Transcribe raw

In [5]:
from transcription_prompts import FIRST_TRANSCRIPTION_PROMPT

In [6]:
from pydantic import BaseModel

# Response schema for the first transcription pass: a single text field per page.
class Page(BaseModel):
    content: str  # text returned by the model for one page image

# Client bound to the Page schema; uses the default model of GeminiFiles.
gemini_pages = GeminiFiles(response_schema=Page)

In [7]:
from tqdm import tqdm
# First pass: one request per page image of the test folder, keyed by file name.
pages_transcribed = {}
source_dir = folder_path / test_folder

for page in tqdm(folder_files[test_folder]):
    image_path = source_dir / page
    pages_transcribed[page] = gemini_pages.prompt_with_image(FIRST_TRANSCRIPTION_PROMPT, image_path)

I0000 00:00:1740174797.417964 6081329 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
100%|██████████| 26/26 [03:42<00:00,  8.58s/it]


In [10]:
import json


def _page_to_dict(raw):
    """Normalise one transcription result to a plain dict.

    Accepts a JSON string (decoded with ``json.loads``), an object exposing
    ``model_dump()`` such as a pydantic model (dumped to a dict), or anything
    else (returned unchanged, so an already-converted dict passes through).
    """
    if isinstance(raw, (str, bytes, bytearray)):
        return json.loads(raw)
    if hasattr(raw, "model_dump"):
        return raw.model_dump()
    return raw


# Safe to re-run: entries that are already dicts are left as they are instead
# of making json.loads raise TypeError.
for page in pages_transcribed:
    pages_transcribed[page] = _page_to_dict(pages_transcribed[page])

pages_transcribed

{'15.455x(22).jpg': {'content': "Fixed Income Modelling\nItô calc. in new context.\n↳ insight: r's dynamic and stochastic, there's a term structure and they too change dynamically and stochastically.\nWhich interest rates to use?\n\nA first model for bonds: elapeds only on short rate $r_t$ given by $y_t$, and $dy_t = adt + bdBr$\n$V = V(t, T, r_t)$\nCan we construct PDE for bonds?\nMain idea: Use portfolio of bonds (two) to elin risk founded on premise that all bonds'\n\nSingle movements bc. of changes in r. Than, finally,\ndepository\n↓\nPDE that is ind. of maturity or anything specific.\nWe can elim to a bond itself. Setting Zero coupon bond as risk by boundary condition (anchoring PDF).\nshorting $q_2$\nUsing Itô: $dV_i = \\left[\\frac{\\partial V_i}{\\partial t} + \\frac{b^2}{2} \\frac{\\partial^2 V}{\\partial r^2}\\right] dt + \\frac{\\partial V_i}{\\partial r} dy$\nPortfolio: $\\pi = q_1 V_1 + q_2 V_2$ and Zero risk w.\n$\\frac{q_1}{q_2} = -\\frac{\\partial V_2 / \\partial r}{\\p

## 2. Categorize into Topics

In [38]:
from config_categorization import CATEGORIZE_PROMPT

# Response schema for the categorisation step; the request asks for a list of these.
class Topic(BaseModel):
    name: str  # transcribe_topic derives the output file name from this
    description: str
    pages: list[str]  # page file names; transcribe_topic joins them onto folder_path / test_folder

gemini_topics = GeminiFiles(model_name="gemini-exp-1206", response_schema=list[Topic])

# One "Page: <file name>" section per transcribed page, in dict order.
# Built with join (no repeated string copying); the inner single quotes keep
# the f-string valid on Python versions older than 3.12 as well.
content_to_categorize = "".join(
    f"Page: {page}\n\n\n\n{transcription['content']}\n\n"
    for page, transcription in pages_transcribed.items()
)
prompt = CATEGORIZE_PROMPT + "\n\nHere are the notes you need to categorize:\n\n" + content_to_categorize

topics = gemini_topics.query(prompt)

In [39]:
topics

[Topic(name='Ito Calculus', description="Introduction to Ito Calculus, including the Ito process, Ito's Lemma, and its application to stochastic differential equations. Focus on Brownian motion and its properties, with extensions to multivariate cases.", pages=['15.455x(12).jpg', '15.455x(13).jpg', '15.455x(8).jpg', '15.455x(20).jpg', '15.455x.jpg', '15.455x(1).jpg']),
 Topic(name='Black-Scholes Model', description='Derivation and analysis of the Black-Scholes model, including dynamic hedging, risk-neutral valuation, and extensions to the model. Discussion of the Greeks and their implications for risk management.', pages=['15.455x(14).jpg', '15.455x(2).jpg', '15.455x(15).jpg', '15.455x(19).jpg', '15.455x(16).jpg', '15.455x(17).jpg', '15.455x(18).jpg']),
 Topic(name='PDE Solutions and Applications', description='Methods for solving partial differential equations (PDEs) in finance, particularly focusing on the heat equation and its solutions. Includes techniques like change of variables,

In [49]:
from transcription_prompts import SECOND_TRANSCRIPTION_PROMPT
from datetime import datetime

# Response schema for the topic-level (second-pass) transcription.
# What each field should contain is decided by SECOND_TRANSCRIPTION_PROMPT,
# which is imported from another module and not visible here.
class TopicTranscription(BaseModel):
    title: str
    keywords: list[str]
    summary: str
    content: str  # presumably the transcribed notes themselves — confirm against the prompt
    problem_space: str
    examples: list[str]
    reflection: str
    questions: list[str]


def _topic_file_stem(name: str) -> str:
    """Turn a topic name into a lowercase, underscore-separated file stem."""
    kept = "".join(c for c in name if c.isalnum() or c in (' ', '-', '_')).strip()
    stem = kept.replace(' ', '_').lower()
    # A name with no usable characters would otherwise produce a file called ".json".
    return stem or "untitled_topic"


def transcribe_topic(
        topic: Topic,
        output_dir: Path
    ):
    """Transcribe all pages of *topic* in one request and save the result as JSON.

    The page images are looked up under the module-level ``folder_path`` /
    ``test_folder``. The JSON file is written to
    ``output_dir / "<name derived from topic.name>.json"`` and holds the dumped
    transcription plus metadata (source folder, source pages, creation time).

    Args:
        topic: Topic whose ``pages`` (file names) are sent to the model.
        output_dir: Directory for the JSON file; created if missing.

    Returns:
        The parsed ``TopicTranscription`` returned by the model.

    Raises:
        ValueError: If the model response could not be parsed (parsed result
            is ``None``).
    """
    gemini = GeminiFiles(response_schema=TopicTranscription, model_name="gemini-exp-1206")

    pages = [folder_path / test_folder / page for page in topic.pages]

    output = gemini.prompt_with_image(
        prompt=SECOND_TRANSCRIPTION_PROMPT,
        image_paths=pages
    )
    if output is None:
        raise ValueError(f"No parsable transcription returned for topic {topic.name!r}")

    # Build the full payload before touching the file system, so that a failure
    # here cannot leave an empty or truncated JSON file behind.
    output_data = {
        "topic_transcription": output.model_dump(),
        "metadata": {
            "source_folder": str(test_folder),
            "source_pages": topic.pages,
            "created_at": datetime.now().isoformat()
        }
    }

    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f"{_topic_file_stem(topic.name)}.json"
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    return output

# Trial run: transcribe only the first topic and write its JSON into ./test.
transcribe_topic(topics[0], output_dir=Path("test"))

[PosixPath('/Users/oscarjuliusadserballe/Google Drive/My Drive/Handwritten Notes/15.455x/15.455x(12).jpg'), PosixPath('/Users/oscarjuliusadserballe/Google Drive/My Drive/Handwritten Notes/15.455x/15.455x(13).jpg'), PosixPath('/Users/oscarjuliusadserballe/Google Drive/My Drive/Handwritten Notes/15.455x/15.455x(8).jpg'), PosixPath('/Users/oscarjuliusadserballe/Google Drive/My Drive/Handwritten Notes/15.455x/15.455x(20).jpg'), PosixPath('/Users/oscarjuliusadserballe/Google Drive/My Drive/Handwritten Notes/15.455x/15.455x.jpg'), PosixPath('/Users/oscarjuliusadserballe/Google Drive/My Drive/Handwritten Notes/15.455x/15.455x(1).jpg')]


TopicTranscription(title='Ito Calculus', keywords=['Ito Process', 'Stochastic Calculus', 'Brownian Motion', 'Chain Rule', 'Taylor Expansion', 'Differential Equations', 'Bachelier Model', 'Geometric Brownian Motion', 'Martingale'], summary="These notes discuss Ito calculus, focusing on its application to continuous-time finance. It covers Ito's lemma, stochastic differential equations, and their relation to deterministic calculus, with examples like the Bachelier and Geometric Brownian Motion models.", content='# Ito Calculus\n\nIto process as a stochastic function where\n\n$dX_t = a dt + b dB_t$\n\n$X_t$ path-wise depends on $t$, so it\'s the same as normal brownian motion\n\n$d(F(X))$? Chain rule, $dF(t, X) = \\frac{\\partial F}{\\partial t} dt + \\frac{\\partial F}{\\partial X} dx$\n\nBut $X$ is nowhere differentiable as $B_t$ is nowhere differentiable!\n\nIto\'s lemma tries to introduce differentiability for probabilistic functions, using Taylor\'s theorem.\n\n$F\'(x) = \\lim_{\\Del