In [3]:
from sentence_transformers import SentenceTransformer
import torch
import lancedb
from lancedb.rerankers import LinearCombinationReranker, RRFReranker
from openai import OpenAI
from collections import defaultdict
import re
import pandas as pd
from pathlib import Path
import json
from datetime import datetime

from google import genai
from google.genai import types
from dotenv import load_dotenv
import os
load_dotenv('env_var')

True

In [4]:
# OpenAI-compatible client pointed at a server on localhost:1234; used by call_llm.
client = OpenAI(
    base_url="http://localhost:1234/v1",
    api_key="lm-studio",
)
def call_llm(query, temperature=0.3, seed=42, model="gemma-3-4b-it@Q8_0"):
    """Send ``query`` as a single user message through the module-level ``client``.

    Args:
        query: Text placed in the ``content`` of the one user message.
        temperature: Sampling temperature forwarded to the chat completion call.
        seed: Seed forwarded to the chat completion call.
        model: Model identifier forwarded to the chat completion call.

    Returns:
        The ``message.content`` of the first returned choice.
    """
    user_turn = {"role": "user", "content": query}
    request = dict(
        model=model,
        messages=[user_turn],
        temperature=temperature,
        seed=seed,
    )
    response = client.chat.completions.create(**request)
    first_choice = response.choices[0]
    return first_choice.message.content

# Gemini model name read by call_llm_flash on every call.
model = "gemini-2.0-flash"
# Usage log: call_llm_flash appends one dict of token counts per request.
total_tokens = list()

def call_llm_flash(query, temperature=0.1, seed=42, max_tokens=2000 ):
    """Send ``query`` to the Gemini model named by the module-level ``model``.

    A new ``genai.Client`` is created on each call using the
    ``GEMINI_API_KEY`` environment variable. After the request, one record
    with the prompt / completion / total token counts and a timestamp is
    appended to the module-level ``total_tokens`` list.

    Args:
        query: Prompt text, sent as the only element of ``contents``.
        temperature: Sampling temperature for the generation config.
        seed: Seed for the generation config.
        max_tokens: Passed as ``max_output_tokens`` in the generation config.

    Returns:
        ``response.text`` from the generate-content call.

    Raises:
        KeyError: if ``GEMINI_API_KEY`` is not set in the environment.
    """
    client = genai.Client(api_key=os.environ['GEMINI_API_KEY'])
    response = client.models.generate_content(
        model=model,
        contents=[query],
        config=types.GenerateContentConfig(
            max_output_tokens=max_tokens,
            temperature=temperature,
            seed=seed
        )
    )
    # usage_metadata may be absent on a response; log None counts in that case
    # instead of raising and throwing away a reply that was already generated.
    usage = response.usage_metadata
    total_tokens.append({'prompt_tokens':getattr(usage, 'prompt_token_count', None),
                         'completion_tokens':getattr(usage, 'candidates_token_count', None),
                         'total_tokens':getattr(usage, 'total_token_count', None),
                         'timestamp':datetime.now().strftime("%Y_%m_%d_%H_%M_%S")})

    return response.text

In [5]:
from burr.core import action, State, ApplicationBuilder, ApplicationContext, Action
from burr.core.parallelism import MapStates, RunnableGraph

In [6]:
%load_ext burr.integrations.notebook
%burr_ui

In [7]:
# Prompt template for the segmentation/summary request; `{report}` is filled via str.format.
# The labels requested here ("DOCUMENT_TITLE:", "SECTION #:", "SUMMARY:") are the ones the
# regexes in llm_segment_and_summarize look for, so the prose instructions and the response
# template must spell them identically.
parsing_prompt = """I have a document that's written in markdown. I want to split it into the main sections so that I can organize the different sections, and have a summary for the whole document that I can reference later on. I need you to provide the text that I can split the document on. That text should be the title or beginning of each major section.
If a section is the title of the document, prefix the string with "DOCUMENT_TITLE:" so that I know it's not a section title. If it is a major section, prefix it with "SECTION #:" where the section number is a number that increments up from 1 to number the sections. If the document has an overall summary or conclusion for the entire document, provide that as an additional section.
If there is no summary, then the summary you write should be a paragraph or two at most and capture the main points about the document.

Structure your response with the following sections and formatting:

DOCUMENT_TITLE: [document title]
SECTION #:
SECTION #:
...
SUMMARY:

Here is my document:
{report}"""

In [8]:
# Embedding model used by encode_text; loaded onto the 'mps' device.
encoder = SentenceTransformer(
    'nomic-ai/nomic-embed-text-v1.5',
    device='mps',
    trust_remote_code=True,
)

!!!!!!!!!!!!megablocks not available, using torch.matmul instead
<All keys matched successfully>


In [9]:
def parse_report_to_sections(report, section_markers):
    """Split ``report`` into section strings at the lines that contain each marker title.

    Markers are searched for strictly in order: only the title of the next
    unconsumed marker is looked for, using a case-insensitive substring match
    against each line. The line that matches starts a new section.

    Args:
        report: Full document text; split on ``'\\n'``.
        section_markers: Ordered sequence of ``(section_id, section_title)``
            pairs. Only index 1 (the title) is used here.

    Returns:
        A list of section strings (lines re-joined with ``'\\n'``). If the
        first marker is not found on the very first line, the text before it
        is returned as its own leading section. If a marker is never found,
        the remaining text stays in the current section and later markers are
        never reached. With no markers at all, the whole report is returned
        as a single section.
    """
    if not section_markers:
        # Nothing to split on: the whole document is one section.
        return [report]

    parsed_document = []
    current_section = []
    marker_idx = 0
    # Lower-cased title of the next marker to look for; None once every
    # marker has been consumed so that no further splits can happen.
    needle = section_markers[marker_idx][1].lower()

    for line_number, line in enumerate(report.split('\n')):
        if needle is not None and needle in line.lower():
            # A match on the very first line opens the first section; there
            # is no preceding text to flush in that case.
            if line_number != 0:
                parsed_document.append('\n'.join(current_section))
            current_section = [line]
            marker_idx += 1
            if marker_idx < len(section_markers):
                needle = section_markers[marker_idx][1].lower()
            else:
                needle = None
        else:
            current_section.append(line)

    parsed_document.append('\n'.join(current_section))
    return parsed_document

In [10]:
def parse_report(report, section_markers):
    """Pair each section marker with its section text and number the pairs from 1.

    The report is split with ``parse_report_to_sections`` and the resulting
    texts are zipped positionally with ``section_markers``; pairing stops at
    the shorter of the two sequences.

    Args:
        report: Full document text.
        section_markers: Ordered ``(section_id, section_title)`` pairs.

    Returns:
        A list of dicts with keys ``section_idx``, ``section_id``,
        ``section_title`` and ``section_text``.
    """
    section_texts = parse_report_to_sections(report, section_markers)
    paired = zip(section_markers, section_texts)
    return [
        {
            'section_idx': position,
            'section_id': marker[0],
            'section_title': marker[1],
            'section_text': text,
        }
        for position, (marker, text) in enumerate(paired, start=1)
    ]

In [19]:
@action(reads=[], writes=["report_data", "file_name", "file_path"])
def load_report_from_file(state: State, file_path: Path) -> State:
    """Load a JSON report from disk and record where it came from.

    Args:
        state: Current application state (nothing is read from it).
        file_path: Location of the JSON file; a plain string is accepted too.

    Returns:
        A new state with ``report_data`` (the decoded JSON), ``file_path``
        (the path as a string) and ``file_name`` (the final path component).

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # Coerce so that `.name` works even when the caller passes a string.
    file_path = Path(file_path)
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(file_path, 'r', encoding='utf-8') as f:
        report_data = json.load(f)

    return state.update(report_data=report_data,
                        file_path=str(file_path),
                        file_name=file_path.name)

@action(reads=["report_data"], writes=["report","section_markers", "parsing_prompt","summary"])
def llm_segment_and_summarize(state: State) -> State:
    """Ask the LLM for the document title, section titles and a summary, then parse the reply.

    The report text is taken from ``state['report_data']['report']``,
    inserted into ``parsing_prompt`` and sent through ``call_llm_flash``. The
    reply is scanned for a ``DOCUMENT_TITLE:`` line, ``SECTION <n>:`` lines
    and a trailing ``SUMMARY:`` block.

    Returns:
        A new state with:
          * ``report`` - the raw report text,
          * ``section_markers`` - ``[('title', <title>), (<n>, <section title>), ...]``
            where ``<n>`` is the section number as a string,
          * ``parsing_prompt`` - the fully formatted prompt that was sent,
          * ``summary`` - everything after ``SUMMARY:``, trimmed of
            surrounding whitespace and backticks.

    Raises:
        ValueError: if the LLM returns no text, or the reply has no
            document-title line or no summary block.
    """
    report = state['report_data']['report']
    formatted_parsing_prompt = parsing_prompt.format(report=report)
    sections_and_summary = call_llm_flash(formatted_parsing_prompt)
    if not sections_and_summary:
        raise ValueError("LLM returned no text; cannot extract section markers")

    # Accept "DOCUMENT TITLE:" as well as "DOCUMENT_TITLE:" since models do not
    # always reproduce the underscore. `\S` guarantees a non-blank title: a
    # blank one would later match every line of the report.
    title_match = re.search(r"DOCUMENT[ _]TITLE:[ \t]*(\S.*)", sections_and_summary)
    # DOTALL so the summary may span several lines up to the end of the reply.
    summary_match = re.search(r"SUMMARY:\s*(\S.*)", sections_and_summary, flags=re.DOTALL)
    if title_match is None or summary_match is None:
        missing = [label for label, found in (("DOCUMENT_TITLE", title_match),
                                              ("SUMMARY", summary_match)) if found is None]
        raise ValueError(
            "LLM response is missing expected field(s) " + ", ".join(missing)
            + "; response started with: " + repr(sections_and_summary[:200])
        )

    # Titles are used for substring matching against report lines, so stray
    # trailing whitespace (or '\r') would make every match fail.
    document_title = title_match.group(1).strip()
    document_summary = summary_match.group(1).strip().strip('`').strip()
    # No trailing newline required, so a section on the last line still counts.
    section_numbers = [
        (number, title.strip())
        for number, title in re.findall(r"SECTION (\d+):[ \t]*(\S.*)", sections_and_summary)
    ]
    section_markers = [('title', document_title)] + section_numbers
    return state.update(report=report, section_markers=section_markers, parsing_prompt=formatted_parsing_prompt, summary=document_summary)

@action(reads=["report","report_data","section_markers","summary","file_path","file_name"], writes=["parsed_document"])
def parse_report_to_segments(state: State) -> State:
    """Split the report into labelled sections and bundle them with the report metadata.

    ``file_path`` and ``file_name`` are listed in ``reads`` because the body
    reads both of them from state.

    Returns:
        A new state whose ``parsed_document`` is a dict with keys
        ``sections`` (output of ``parse_report``), ``report``, ``summary``,
        ``topic``, ``focus``, ``relevant_citations``, ``source_file`` and
        ``source_file_name``.

    Raises:
        KeyError: if ``report_data`` lacks ``report``, ``topic``, ``focus``
            or ``relevant_citations``.
    """
    report = state['report']
    segments = state['section_markers']
    report_data = state['report_data']
    parsed_document = parse_report(report, segments)
    parsed_data = {
        "sections":parsed_document,
        "report":report_data['report'],
        "summary":state['summary'],
        "topic":report_data['topic'],
        "focus":report_data['focus'],
        "relevant_citations":report_data['relevant_citations'],
        "source_file":state['file_path'],
        "source_file_name":state['file_name']
    }
    return state.update(parsed_document=parsed_data)

@action(reads=["parsed_document"], writes=["parsed_document"])
def encode_text(state: State) -> State:
    """Attach embedding vectors to every section, the full report and the summary.

    Each text is passed to the module-level ``encoder`` and the result is
    converted with ``.tolist()`` so it can be serialised as JSON. New dicts
    are built instead of editing the ones held by the incoming state, so the
    previous state's ``parsed_document`` is left untouched.

    Returns:
        A new state whose ``parsed_document`` has the same keys as before,
        plus ``section_vector`` on every entry of ``sections`` and top-level
        ``report_vector`` and ``summary_vector``.
    """
    source_document = state['parsed_document']

    encoded_sections = []
    for section in source_document['sections']:
        # Copy the section; 'section_vector' lands after the existing keys.
        encoded_sections.append({
            **section,
            'section_vector': encoder.encode(section['section_text']).tolist(),
        })
        # Empty the MPS cache after each encode (presumably to limit memory growth).
        torch.mps.empty_cache()

    report_vector = encoder.encode(source_document['report']).tolist()
    torch.mps.empty_cache()
    summary_vector = encoder.encode(source_document['summary']).tolist()
    torch.mps.empty_cache()

    # 'sections' keeps its original position; the two new keys are appended last.
    parsed_document = {
        **source_document,
        'sections': encoded_sections,
        'report_vector': report_vector,
        'summary_vector': summary_vector,
    }
    return state.update(parsed_document=parsed_document)

In [38]:
# Research batch to process; used as the sub-folder name under both the
# source (research_data) and output (processed_research) directories.
project_name = "south_america_research_topics"

In [39]:
# Input: every *.json report under research_data/<project_name>.
source_folder = Path('wonky_data/data/research_data')
project_folder = source_folder / project_name
reports = [report_path for report_path in project_folder.glob('*.json')]

# Output: processed_research/<project_name>, created if it does not exist yet.
save_folder = Path('wonky_data/data/processed_research') / project_name
save_folder.mkdir(parents=True, exist_ok=True)

# Display the discovered report files as the cell output.
reports

[PosixPath('wonky_data/data/research_data/south_america_research_topics/the_geopolitics_of_environmental_issues_deforestation_climate_20250415151425.json'),
 PosixPath('wonky_data/data/research_data/south_america_research_topics/the_influence_of_external_actors_and_shifting_20250415151555.json'),
 PosixPath('wonky_data/data/research_data/south_america_research_topics/the_evolving_dynamics_of_regional_integration_and_20250415151143.json'),
 PosixPath('wonky_data/data/research_data/south_america_research_topics/the_rise_of_populism_and_political_polarization_20250415151311.json'),
 PosixPath('wonky_data/data/research_data/south_america_research_topics/the_impact_of_resource_nationalism_and_commodity_20250415151009.json')]

In [40]:
# Run the load -> segment/summarise -> parse -> encode pipeline once per report
# and write the resulting parsed_document to save_folder under the same file name.
for report_file in reports:
    # Each report gets its own application and its own tracker project.
    tracker_project = f"research_batch-parsing-{report_file.stem[:25]}"

    builder = ApplicationBuilder()
    builder = builder.with_actions(
        load_report_from_file,
        llm_segment_and_summarize,
        parse_report_to_segments,
        encode_text,
    )
    builder = builder.with_transitions(
        ("load_report_from_file", "llm_segment_and_summarize"),
        ("llm_segment_and_summarize", "parse_report_to_segments"),
        ("parse_report_to_segments", "encode_text"),
    )
    builder = builder.with_entrypoint("load_report_from_file")
    builder = builder.with_tracker("local", project=tracker_project)
    parsing_app = builder.build()

    # Stop once the final action has run; only the resulting state is used below.
    parsing_action, parsing_result, parsing_state = parsing_app.run(
        halt_after=["encode_text"],
        inputs={"file_path": report_file},
    )

    output_path = save_folder / report_file.name
    with open(output_path, 'w') as f:
        json.dump(parsing_state['parsed_document'], f)