In [1]:
from sentence_transformers import SentenceTransformer
import torch
import lancedb
from openai import OpenAI
import re
import pandas as pd
import numpy as np
import json
import time
import pypandoc
from google import genai
from google.genai import types
from dotenv import load_dotenv
from tqdm.notebook import tqdm
import os

load_dotenv('env_var')

  from tqdm.autonotebook import tqdm, trange


True

In [73]:
import hashlib
def hash_string_to_digits(input_string, num_digits=6):
    """Hashes a string to an 8-digit integer using SHA-256."""
    hashed_value = hashlib.sha256(input_string.encode('utf-8')).hexdigest()
    return int(hashed_value, 16) % (10**num_digits)

In [74]:
def format_evidence(evidence_list):
    """Formats the evidence list into a Markdown string."""
    markdown = ""
    if len(evidence_list)==0:# or not isinstance(evidence_list, list):
        return "No evidence provided.\n"

    for i, item in enumerate(evidence_list):
        markdown += f"  - **{item.get('Description', 'N/A')}**\n"
        markdown += f"    - **Key Data/Details:** {item.get('Key Data/Details', 'N/A')}\n"
        markdown += f"    - **Methodology Note:** {item.get('Methodology Note', 'N/A')}\n"
        markdown += f"    - **Source Note:** {item.get('Source Note', 'N/A')}\n"
    return markdown


def format_list(items, title):
    """Formats a simple list of strings into a Markdown list."""
    markdown = f"#### *{title}*\n"
    if len(items) == 0: # or not isinstance(items, list):
        return markdown + "- N/A\n"
    for item in items:
        markdown += f"- {item}\n"
    return markdown


def generate_markdown(data):
    """
    Generates a Markdown string from a list of insight objects.

    Args:
        data (list): A list of dictionaries, where each dictionary
                     represents an insight.

    Returns:
        str: A string containing the formatted Markdown document.
    """
    if not isinstance(data, list):
        return "Error: JSON data must be a list of insight objects."

    full_markdown = ""
    for i, insight in enumerate(data):
        # --- Source Information ---
        full_markdown += "## Source Information\n"
        full_markdown += f"- **Original Source Title:** {insight.get('Original Source Title', 'N/A')}\n"
        full_markdown += f"- **Author(s) / Organization:** {insight.get('Author(s) / Organization', 'N/A')}\n"
        full_markdown += f"- **Location in Source:** {insight.get('Location in Source', 'N/A')}\n\n"

        # --- Comprehensive Explanation ---
        full_markdown += "## Comprehensive Explanation\n"
        full_markdown += f"{insight.get('Comprehensive Explanation of the Insight', 'N/A')}\n\n"

        # --- Evidence Section ---
        full_markdown += "## Evidence & Reasoning\n\n"
        full_markdown += "### Evidence FOR this Insight\n"
        full_markdown += format_evidence(insight.get('Evidence FOR this Insight'))
        full_markdown += "\n"

        full_markdown += """### Author's Reasoning FOR this Insight (The "Why")\n"""
        full_markdown += insight.get('''Author's Reasoning FOR this Insight (The "Why")''', 'N/A') + '\n\n'

        full_markdown += """### Evidence AGAINST or Contradicting this Insight\n"""
        full_markdown += f"{insight.get('Evidence AGAINST or Contradicting this Insight', 'N/A')}\n\n"

        full_markdown += "### Author's Reasoning AGAINST this Insight (or for the Nuance)\n"
        full_markdown += insight.get('Author\'s Reasoning AGAINST this Insight (or for the Nuance)', 'N/A') + "\n\n"
        full_markdown += "### Author's position on this insight:\n"
        full_markdown += insight.get('Position Taken', 'N/A') + "\n\n"
        # --- Strength of Insight ---
        strength = insight.get('Strength of This Specific Insight', {})
        full_markdown += "## Strength of This Specific Insight\n"
        full_markdown += f"- **Assessment:** {strength.get('Assessment', 'N/A')}\n"
        full_markdown += f"- **Confidence Level:** {strength.get('Confidence Level', 'N/A')}\n"
        full_markdown += f"- **Common Sensibility:** {strength.get('Common Sensibility', 'N/A')}\n\n"


        # --- Actionable Recommendations ---
        recommendations = insight.get('Actionable Recommendations or Implications', {})
        full_markdown += "## Actionable Recommendations or Implications\n"
        full_markdown += f"- **If this insight is true, it implies that we should:** {recommendations.get('If this insight is true, it implies that we should', 'N/A')}\n"
        full_markdown += f"- **This insight could be used in our project to:** {recommendations.get('This insight could be used in our project to', 'N/A')}\n\n"
        full_markdown += f"- **If this insight is not true, it implies that we should:** {recommendations.get('If this insight is not true, it implies that we should', 'N/A')}\n\n"

        # --- Indexing ---
        indexing = insight.get('Indexing for Future Reference', {})
        full_markdown += "## Indexing for Future Reference\n"
        full_markdown += format_list(indexing.get('General Topics', []), "General Topics")
        full_markdown += format_list(indexing.get('Specific Topics', []), "Specific Topics")
        full_markdown += format_list(indexing.get('General Keywords', []), "General Keywords")
        full_markdown += format_list(indexing.get('Specific Keywords', []), "Specific Keywords")
        full_markdown += "\n"

        # --- Unanswered Questions ---
        full_markdown += format_list(insight.get('Unanswered Questions', []), "Unanswered Questions")

                # --- Main Title for the Insight ---
        insight_hash = hash_string_to_digits(full_markdown, 6)
        full_markdown = f"## Insight: [INST{insight_hash}]\n\n### *{insight.get('Statement of the Insight', 'No Title Provided')}*\n\n" + full_markdown

        # --- Separator for next insight ---
        if i < len(data) - 1:
            full_markdown += "\n---\n\n"

    return full_markdown

In [147]:
def generate_insight_grounding_markdown(data, ids=None):
    """
    Generates a Markdown string from a list of insight objects.

    Args:
        data (list): A list of dictionaries, where each dictionary
                     represents an insight.

    Returns:
        str: A string containing the formatted Markdown document.
    """
    if not isinstance(data, list):
        return "Error: JSON data must be a list of insight objects."

    full_markdown = ""
    for i, insight in enumerate(data):
        # --- Main Title for the Insight ---
        # full_markdown += f"## Insight: \n\n### *{insight.get('Statement of the Insight', 'No Title Provided')}*\n\n"

        # --- Source Information ---
        full_markdown += "## Source Information\n"
        full_markdown += f"- **Original Source Title:** {insight.get('Original Source Title', 'N/A')}\n"
        full_markdown += f"- **Author(s) / Organization:** {insight.get('Author(s) / Organization', 'N/A')}\n"
        full_markdown += f"- **Location in Source:** {insight.get('Location in Source', 'N/A')}\n\n"

        # --- Comprehensive Explanation ---
        full_markdown += "## Comprehensive Explanation\n"
        full_markdown += f"{insight.get('Comprehensive Explanation of the Insight', 'N/A')}\n\n"

        # --- Evidence Section ---
        full_markdown += "## Evidence & Reasoning\n\n"
        full_markdown += "### Evidence FOR this Insight\n"
        full_markdown += format_evidence(insight.get('Evidence FOR this Insight'))
        full_markdown += "\n"

        full_markdown += """### Author's Reasoning FOR this Insight (The "Why")\n"""
        full_markdown += insight.get('''Author's Reasoning FOR this Insight (The "Why")''', 'N/A') + '\n\n'

        full_markdown += """### Evidence AGAINST or Contradicting this Insight\n"""
        full_markdown += f"{insight.get('Evidence AGAINST or Contradicting this Insight', 'N/A')}\n\n"

        full_markdown += "### Author's Reasoning AGAINST this Insight (or for the Nuance)\n"
        full_markdown += insight.get('Author\'s Reasoning AGAINST this Insight (or for the Nuance)', 'N/A') + "\n\n"
        full_markdown += "### Author's position on this insight:\n"
        full_markdown += insight.get('Position Taken', 'N/A') + "\n\n"
        # --- Strength of Insight ---
        strength = insight.get('Strength of This Specific Insight', {})
        full_markdown += "## Strength of This Specific Insight\n"
        full_markdown += f"- **Assessment:** {strength.get('Assessment', 'N/A')}\n"
        full_markdown += f"- **Confidence Level:** {strength.get('Confidence Level', 'N/A')}\n"
        full_markdown += f"- **Common Sensibility:** {strength.get('Common Sensibility', 'N/A')}\n\n"

        # --- Actionable Recommendations ---
        recommendations = insight.get('Actionable Recommendations or Implications', {})
        full_markdown += "## Actionable Recommendations or Implications\n"
        full_markdown += f"- **If this insight is true, it implies that we should:** {recommendations.get('If this insight is true, it implies that we should', 'N/A')}\n"
        full_markdown += f"- **This insight could be used in our project to:** {recommendations.get('This insight could be used in our project to', 'N/A')}\n\n"
        full_markdown += f"- **If this insight is not true, it implies that we should:** {recommendations.get('If this insight is not true, it implies that we should', 'N/A')}\n\n"

        # --- Unanswered Questions ---
        full_markdown += format_list(insight.get('Unanswered Questions', []), "Unanswered Questions")

        # --- Main Title for the Insight ---
        if ids is None:
            insight_hash = hash_string_to_digits(full_markdown, 6)
            insight_citation = f"INST{insight_hash}"
        else:
            insight_citation = ids[i]
        full_markdown = f"## Insight: [{insight_citation}]\n\n### *{insight.get('Statement of the Insight', 'No Title Provided')}*\n\n" + full_markdown


        # --- Separator for next insight ---
        if i < len(data) - 1:
            full_markdown += "\n---\n\n"

    return full_markdown

In [3]:
from pathlib import Path
import sqlite3
from datetime import datetime
from uuid import uuid4
from copy import deepcopy

project_folder = Path('insight_research')
project_folder.mkdir(parents=True, exist_ok=True)
research_json_folder = project_folder.joinpath('json_data')
research_json_folder.mkdir(parents=True, exist_ok=True)
database_location = project_folder.joinpath('research.sqlite')

conn = sqlite3.connect(database_location)
cursor = conn.cursor()

In [5]:
index = lancedb.connect('../wonky_data/indexes/')
table = index.open_table('sections_hybrid')
encoder = SentenceTransformer('nomic-ai/nomic-embed-text-v1.5', device='mps',trust_remote_code=True)

<All keys matched successfully>


In [7]:
client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")
def call_llm(query, temperature=0.35, seed=42, model="gemma-3-12b-it-qat"):
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": query}
        ],
        temperature=temperature,
        seed=seed,
    )
    return completion.choices[0].message.content

model = "gemini-2.0-flash"
total_tokens = list()

def call_llm_flash(query, temperature=0.1, seed=42, max_tokens=7500 ):
    client = genai.Client(api_key=os.environ['GEMINI_API_KEY'])
    retries = 3
    time_delay = 15
    for i in range(retries):
        try:
            response = client.models.generate_content(
                model=model,
                contents=[query],
                config=types.GenerateContentConfig(
                    max_output_tokens=max_tokens,
                    temperature=temperature,
                    seed=seed
                )
            )
            break
        except Exception as e:
            print(e)
            print(f"Retries left: {retries - i}")
            time.sleep(time_delay)
            continue



    total_tokens.append({'prompt_tokens':response.usage_metadata.prompt_token_count,
                         'completion_tokens':response.usage_metadata.candidates_token_count,
                         'total_tokens':response.usage_metadata.total_token_count,
                         'timestamp':datetime.now().strftime("%Y_%m_%d_%H_%M_%S")})

    return response.text

def convert_df_sections_to_list(sections):
    section_list = sections.explode().to_list()
    section_list = convert_all_sections(section_list)
    return section_list

def convert_sections_to_dict(section):
    converted_sections = list()
    parts = section.replace("\'",'"').split('", ')
    for _part in parts:
        _part = _part + '"}'
        # print(_part)
        _part = re.findall(r"""^{?(.*?): "(.*?)}$""",_part, flags=re.DOTALL | re.MULTILINE)

        formatted_parts = {int(_part[0][0].strip('"} ')): _part[0][1].strip('"} \n')}
        converted_sections.append(formatted_parts)
    return converted_sections

def convert_all_sections(sections):
    extracted_sections = list()
    for _section in sections:
        section = convert_sections_to_dict(_section)
        extracted_sections.extend(section)
    return extracted_sections

In [8]:
def parse_data_for_chroma(data):
    _metadata = list()
    _vectors = list()
    _documents = list()
    for _row in data:
        _metadata.append({k:v for k, v in _row.items() if k not in ['extraction_text','vector']})
        _vectors.append(_row['vector'])
        _documents.append(_row['extraction_text'])
    return _metadata, _vectors, _documents

# Set variables

In [9]:
subject_matter = "Rural Broadband in the United States"
focus = "Barriers and opportunities to improve access"
depth_to_search = 50
max_documents = 5
research_id = hash_string_to_digits(f"""{subject_matter} : {focus}""")
with open(research_json_folder.joinpath('insights_181400.json'),'r') as f:
    data = json.load(f)

insight_index = lancedb.connect(project_folder.joinpath('insight_index'))
if 'insights' in insight_index.table_names():
    insight_index.drop_table('insights')
insight_table = insight_index.create_table('insights', data)

In [148]:
query = "Rural broadband expansion initiatives."
query_vec = encoder.encode(query)

In [149]:
results = insight_table.search(query_vec).distance_type('cosine').limit(25).to_pandas()
results['insight_id'] = results['extraction_text'].apply(lambda x: f"INST{hash_string_to_digits(x, 6)}")
results

Unnamed: 0,id,type,typeId,number,active,topics,date,title,summary,doc_id,filename,source_file,insights,extraction_details,extraction_text,vector,_distance,insight_id
0,R42524,CRS Report,REPORTS,R42524,True,"['Appropriations', 'Health Policy']",2013-06-25,Rural Broadband: The Roles of the Rural Utilit...,Since the initial deployment of broadband in t...,421932,20130625_R42524_6fda8e3e88ced531bc5562afe7ca2a...,R42524.json,{'insight': 'The Rural Utilities Service (RUS)...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *The Rural Utilities Servi...,"[-0.3524781, 1.1827747, -3.1551015, -0.6930941...",0.286355,INST819678
1,R46501,CRS Report,R,R46501,True,"['Economic Policy', 'Internet and Telecommunic...",2020-08-28,Rural Digital Opportunity Fund: Requirements a...,,R46501_1_2020-08-28,2020-08-28_R46501_69aa2b86f4262de2d971c425729a...,R46501.json,{'insight': 'The Rural Digital Opportunity Fun...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *The Rural Digital Opportu...,"[-0.03362766, 1.605579, -3.2129455, -0.4775158...",0.288256,INST90992
2,R46108,CRS Report,REPORTS,R46108,True,"['Economic Policy', 'Science and Technology Po...",2019-12-09,Demand for Broadband in Rural Areas: Implicati...,"As of 2019, over 20 million Americans—predomin...",610595,20191209_R46108_9fe4bb7f03d1a36e4fb10835ca8db9...,R46108.json,{'insight': 'Rural broadband markets are hyper...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *Rural broadband markets a...,"[0.5168763, 1.5408688, -3.3518171, -0.80001146...",0.290129,INST416620
3,R47017,CRS Report,R,R47017,True,"['Agricultural Policy', 'Internet and Telecomm...",2022-12-14,USDA’s ReConnect Program: Expanding Rural Broa...,,R47017_3_2022-12-14,2022-12-14_R47017_a40122c2bc84a3e34f3fa4bbb1fe...,R47017.json,{'insight': 'It is more expensive to build and...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *It is more expensive to b...,"[0.7560909, 1.8678172, -3.4455166, -0.21886699...",0.291445,INST478037
4,R46108,CRS Report,REPORTS,R46108,True,"['Economic Policy', 'Science and Technology Po...",2019-12-09,Demand for Broadband in Rural Areas: Implicati...,"As of 2019, over 20 million Americans—predomin...",610595,20191209_R46108_9fe4bb7f03d1a36e4fb10835ca8db9...,R46108.json,{'insight': 'Federal spending on broadband exp...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *Federal spending on broad...,"[0.40469792, 2.04957, -3.3293974, -0.6780929, ...",0.295334,INST322913
5,R46108,CRS Report,REPORTS,R46108,True,"['Economic Policy', 'Science and Technology Po...",2019-12-09,Demand for Broadband in Rural Areas: Implicati...,"As of 2019, over 20 million Americans—predomin...",610595,20191209_R46108_9fe4bb7f03d1a36e4fb10835ca8db9...,R46108.json,{'insight': 'Rural households' broadband adopt...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *Rural households' broadba...,"[0.16907349, 1.8395344, -3.3484895, -0.3824893...",0.300136,INST206607
6,R47017,CRS Report,R,R47017,True,"['Agricultural Policy', 'Internet and Telecomm...",2022-12-14,USDA’s ReConnect Program: Expanding Rural Broa...,,R47017_3_2022-12-14,2022-12-14_R47017_a40122c2bc84a3e34f3fa4bbb1fe...,R47017.json,{'insight': 'Policy options for Congress to in...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *Policy options for Congre...,"[0.07626522, 1.8770603, -3.110022, -0.69244874...",0.30141,INST578700
7,R47017,CRS Report,R,R47017,True,"['Agricultural Policy', 'Internet and Telecomm...",2022-12-14,USDA’s ReConnect Program: Expanding Rural Broa...,,R47017_3_2022-12-14,2022-12-14_R47017_a40122c2bc84a3e34f3fa4bbb1fe...,R47017.json,{'insight': 'Rural residents have lower broadb...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *Rural residents have lowe...,"[0.37247777, 1.2701703, -3.340323, -0.39932805...",0.304082,INST920073
8,R42524,CRS Report,REPORTS,R42524,True,"['Appropriations', 'Health Policy']",2013-06-25,Rural Broadband: The Roles of the Rural Utilit...,Since the initial deployment of broadband in t...,421932,20130625_R42524_6fda8e3e88ced531bc5562afe7ca2a...,R42524.json,{'insight': 'The Rural Health Care Program pro...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *The Rural Health Care Pro...,"[-0.11230092, 1.1732627, -3.273556, -0.8880532...",0.30567,INST164724
9,R42524,CRS Report,REPORTS,R42524,True,"['Appropriations', 'Health Policy']",2013-06-25,Rural Broadband: The Roles of the Rural Utilit...,Since the initial deployment of broadband in t...,421932,20130625_R42524_6fda8e3e88ced531bc5562afe7ca2a...,R42524.json,"{'insight': 'A ""rural-rural divide"" exists, wi...",{'Actionable Recommendations or Implications':...,"## Insight: \n\n### *A ""rural-rural divide"" ex...","[0.017555246, 1.2374147, -2.9802232, -0.579398...",0.306199,INST792622


In [152]:
insights = list()
insight_texts = list()
for index, row in results.iterrows():
    insights.append(row['insights']['insight'])
    insight_texts.append(generate_insight_grounding_markdown([row['extraction_details']], [row['insight_id']]))
insight_text = '\n\n----\n\n'.join(insight_texts)
len(insight_text.split(' '))

14293

In [153]:
insight_text

'## Insight: [INST819678]\n\n### *The Rural Utilities Service (RUS) offers various programs (loans, grants, etc.) to support broadband and telecommunications infrastructure in rural areas, some specifically for broadband and others evolving from traditional telephone support.*\n\n## Source Information\n- **Original Source Title:** Rural Broadband: The Roles of the Rural Utilities Service and the Universal Service Fund\n- **Author(s) / Organization:** June 25, 2013 (R42524)\n- **Location in Source:** [\'R42524__1___1\' \'R42524__6___1\']\n\n## Comprehensive Explanation\nThe RUS has a portfolio of programs that offer financial assistance such as loans, loan guarantees, grants, and combinations thereof to support telecommunications and broadband infrastructure. Some programs are designed specifically for broadband deployment, while others have evolved from supporting traditional telephone services to now supporting broadband-capable services. The document states, "The RUS has a portfolio 

In [154]:
with open(project_folder.joinpath('relevant_insights.md'),'w') as f:
    f.write(insight_text)

In [155]:
with open(project_folder.joinpath('relevant_insight_single.md'),'w') as f:
    f.write(insight_texts[0])

In [156]:
def create_listlike_policy_analysis_prompt(question: str, insights: str) -> str:
    prompt = f"""
You are an expert junior public policy analyst. Your task is to answer a specific policy question based *only* on the provided 'insights'. You must follow these rules meticulously:

**THE CARDINAL RULE: CITE EVERY CLAIM**
This is the most important rule. You must add a citation to every single piece of information you write. Every sentence, every clause, and every data point you pull from an insight document must be followed by its citation key in brackets (e.g., [INST123456]). If multiple insights support a single claim, list them in the same brackets, separated by a comma (e.g., [INST123456, INST987654]). Do not cite a claim at the end of a sentence if the beginning of the sentence makes a different claim; cite each claim individually.

**YOUR STEP-BY-STEP WORKFLOW**
1.  **Understand the Question:** Read the user's policy question to understand what you need to answer.
2.  **Analyze Insights:** Carefully read the provided insights to find all relevant information.
3.  **Map Information:** For each piece of information, determine where it belongs in the template (Supporting Evidence, Contradictory Evidence, Context, etc.).
4.  **Condense and Rephrase:** Do not copy-paste from the insights. Rewrite the information concisely in your own words without changing the original meaning.
5.  **Fill the Template:** Populate the template below with the rephrased information, ensuring every single claim is cited.
6.  **Write the Summary Last:** After completing all other sections, write a brief executive summary that directly answers the question, again citing every claim.
7.  **Output Format:** Your final output must be in Markdown format, following the structure of the template exactly.

---
**INPUTS**

**Policy Question:** "{question}"

**Provided Insights:**
\"\"\"
{insights}
\"\"\"

---
**OUTPUT TEMPLATE (Use this exact structure for your response)**

## Public Policy Analysis Response

**Question Being Addressed:** {question}

---

### 1. Executive Summary

* [Provide a 1-3 sentence direct answer to the question, summarizing your most critical findings. Remember to cite every claim.]

---

### 2. Detailed Findings

#### A. Supporting Evidence / Reasons For

* **Finding 1:** [State the first piece of supporting evidence and add its citation.]
* **Finding 2:** [State the next piece of supporting evidence and add its citation.]
* ...

#### B. Contradictory Evidence / Reasons Against

* **Finding 1:** [State the first piece of contradictory evidence and add its citation.]
* ...

#### C. Context and Nuance

* **Point 1:** [Explain relevant context, author's reasoning, or other nuances and add its citation.]
* ...

#### D. Identified Gaps & Unanswered Questions

* **Gap 1:** [State a key piece of information that is missing from the insight and add its citation.]
* ...

---

### 3. Strength of Evidence Assessment

* **Evidence Quality:** [State the insight's qualitative assessment of the evidence (e.g., 'strong', 'weak') and add citation.]
* **Evidence Support:** [Describe how the evidence supports the insight (e.g., 'directly supportive', 'provides examples') and add citation.]
* **Confidence Level:** [State the specific confidence level indicated (e.g., 'High', 'Medium', 'Low') and add citation.]
"""
    return prompt

In [157]:
def create_senior_analyst_briefing_prompt(question: str, insights: str) -> str:
    prompt = f"""
You are an expert senior policy analyst. Your task is to synthesize the provided 'insights' into a concise and comprehensive briefing memo that addresses a central policy question. Your audience consists of other experts and decision-makers who require a rapid, high-level understanding of the issue, its evidence base, and its strategic implications.

**Core Directives:**
1.  **Synthesize and Group Thematically:** Do not simply list findings. Synthesize related pieces of information into thematic, analytical points. Group findings, points, and identified gaps by their common topic or subject matter to create a cohesive narrative.
2.  **Maintain Rigorous Citation:** Every factual claim, data point, or piece of evidence must be meticulously cited. Use bracketed citations (e.g., [INST123456]). If multiple sources support a point, cite them all (e.g., [INST123456, INST987654]).
3.  **Adopt an Analytical Tone:** The memo should be objective, concise, and focused on the "so what." The language should be professional and direct.
4.  **Structure is Key:** Adhere strictly to the briefing memo template provided below. The structure is designed to facilitate quick comprehension by a senior audience.

---
**INPUTS**

**Core Topic/Question:** "{question}"

**Provided Insights for Synthesis:**
\"\"\"
{insights}
\"\"\"

---
**OUTPUT TEMPLATE: BRIEFING MEMO**

**SUBJECT:** Briefing on: {question}

### 1. Executive Summary & Key Judgments

* **Top-Line Synthesis:** [Provide a 2-3 sentence summary that synthesizes the most critical information and directly addresses the core topic. Every claim must be cited.]
* **Key Judgments:** [Use 2-4 bullet points to state the most significant analytical conclusions drawn from the evidence. These are not just facts, but interpretations of the facts. Every judgment must be supported by a citation.]

---

### 2. Thematic Analysis of Findings

*[This section replaces a simple list of evidence. Group related findings from all insights into themes.]*

* **Theme 1:** [Name the first analytical theme (e.g., "Programmatic Evolution," "Evidence of Inefficiency").]
    * [Synthesize all supporting and contradictory evidence for this theme from one or more insights, ensuring every claim is cited.]
* **Theme 2:** [Name the second analytical theme (e.g., "Contradictory Outcomes," "Geographic Disparities").]
    * [Synthesize all supporting and contradictory evidence for this theme, citing every claim.]
* ...

---

### 3. Assessment of Evidence Base

*[This section summarizes the overall quality of the evidence from the provided insights.]*

* **The Good (Reliable & Clear):** [Summarize the strengths of the evidence base. What is well-documented, directly supportive, or based on high-quality sources? [CITE RELEVANT SOURCES].]
* **The Bad (Contradictory & Weak):** [Summarize the weaknesses of the evidence base. Are there direct contradictions between sources or poorly supported claims? [CITE RELEVANT SOURCES].]
* **The Questionable (Gaps & Ambiguities):** [Summarize the most critical gaps, under-explained points, or unanswered questions that prevent a complete analysis. Group related gaps by topic. [CITE SOURCES THAT REVEAL THE GAPS].]

---

### 4. Strategic Implications & Considerations

* **For Current Policy:** [Based on the analysis, what are the direct implications for current policy or operations? [CITE THE SUPPORTING EVIDENCE].]
* **For Future Analysis:** [What are the key considerations or questions that need to be addressed in future analytical work? [CITE THE SUPPORTING EVIDENCE].]
"""
#     prompt = f"""
# You are an expert senior policy analyst. Your task is to synthesize the provided 'insights' into a concise and comprehensive briefing memo that addresses a central policy question. Your audience consists of other experts and decision-makers who require a rapid, high-level understanding of the issue, its evidence base, and its strategic implications.
#
# **Core Directives:**
# 1.  **Synthesize, Don't List:** Do not simply list findings from the insights. Synthesize them into thematic, analytical points. Connect related pieces of information, even if they come from different insights.
# 2.  **Maintain Rigorous Citation:** Every factual claim, data point, or piece of evidence must be meticulously cited. Use bracketed citations (e.g., [INST123456]). If multiple sources support a point, cite them all (e.g., [INST123456, INST987654]).
# 3.  **Adopt an Analytical Tone:** The memo should be objective, concise, and focused on the "so what." The language should be professional and direct.
# 4.  **Structure is Key:** Adhere strictly to the briefing memo template provided below. The structure is designed to facilitate quick comprehension by a senior audience.
#
# ---
# **INPUTS**
#
# **Core Topic/Question:** "{question}"
#
# **Provided Insights for Synthesis:**
# \"\"\"
# {insights}
# \"\"\"
#
# ---
# **OUTPUT TEMPLATE: BRIEFING MEMO**
#
# **SUBJECT:** Briefing on: {question}
#
# ### 1. Executive Summary & Key Judgments
#
# * **Top-Line Synthesis:** [Provide a 2-3 sentence summary that synthesizes the most critical information and directly addresses the core topic. Every claim must be cited.]
# * **Key Judgments:** [Use 2-4 bullet points to state the most significant analytical conclusions drawn from the evidence. These are not just facts, but interpretations of the facts. Every judgment must be supported by a citation.]
#
# ---
#
# ### 2. Thematic Analysis of Findings
#
# *[This section replaces a simple list of evidence. Group related findings into themes.]*
#
# * **Theme 1:** [Name the first analytical theme (e.g., "Programmatic Evolution," "Evidence of Inefficiency").]
#     * [Synthesize the supporting evidence for this theme from one or more insights, ensuring every claim is cited.]
# * **Theme 2:** [Name the second analytical theme (e.g., "Contradictory Outcomes," "Data and Measurement Gaps").]
#     * [Synthesize the supporting evidence for this theme, citing every claim.]
# * ...
#
# ---
#
# ### 3. Assessment of Evidence Base
#
# * **Overall Quality:** [Provide a holistic assessment of the quality of the evidence presented across all insights. Comment on its strengths (e.g., 'based on primary government data') and weaknesses (e.g., 'lacks recent information') [CITE ALL RELEVANT SOURCES].]
# * **Key Gaps Identified:** [Summarize the most critical unanswered questions or gaps in the provided information that prevent a complete analysis [CITE SOURCES THAT REVEAL THE GAPS].]
#
# ---
#
# ### 4. Strategic Implications & Considerations
#
# * **For Current Policy:** [Based on the analysis, what are the direct implications for current policy or operations? [CITE THE SUPPORTING EVIDENCE].]
# * **For Future Analysis:** [What are the key considerations or questions that need to be addressed in future analytical work? [CITE THE SUPPORTING EVIDENCE].]
#
# ---
#
# ### 5. Source Reference Log
#
# * [List all Insight IDs used to generate this report.]
#
# """
    return prompt

In [159]:
policy_analysis_prompt = create_senior_analyst_briefing_prompt(query, insight_text)
len(policy_analysis_prompt.split(' '))

14779

In [160]:
policy_analysis = call_llm_flash(policy_analysis_prompt, temperature=0.1)

In [161]:
with open(project_folder.joinpath('insight_analysis.md'),'w') as f:
    f.write(policy_analysis)

In [162]:
policy_analysis_local = call_llm(policy_analysis_prompt, temperature=0.1, model='google/gemma-3-4b')

In [163]:
with open(project_folder.joinpath('insight_analysis_local.md'),'w') as f:
    f.write(policy_analysis_local)

### Insight citation mapping

In [257]:
footnote_analysis = policy_analysis

In [258]:
analysis_citations = list()
for _citation in re.findall(r"(INST\d+)(?:\]|, )|\[(INST\d+)(?:\]|, )", footnote_analysis):
    if _citation[0]:
        if _citation[0] not in analysis_citations:
            analysis_citations.append(_citation[0])
    elif _citation[1]:
        if _citation[1] not in analysis_citations:
            analysis_citations.append(_citation[1])
    else:
        continue

In [259]:
for _citation in analysis_citations:
    if _citation not in results['insight_id'].to_list():
        print(_citation)
footnote_lookup = {i:_citation for i,_citation in enumerate(analysis_citations, start=1)}
len(analysis_citations)

21

In [262]:
footnote_text = list()
for i, _citation in sorted(footnote_lookup.items(), key=lambda x: x[0]):
    footnote_analysis = re.sub(fr"({footnote_lookup[i]})", f'[^{i}]', footnote_analysis)
    footnote_text.append(f'[^{i}]: {_citation}')
footnote_text = '\n'.join(footnote_text)
footnote_analysis += f"\n\n{footnote_text}"
footnote_analysis = re.sub(r'(\[+)', r'[', footnote_analysis)
footnote_analysis = re.sub(r'(\]+)', r']', footnote_analysis)
with open(project_folder.joinpath('insight_analysis_w_footnotes.md'),'w') as f:
    f.write(footnote_analysis)

In [294]:
footnote_analysis_latex = pypandoc.convert_text(footnote_analysis, to='latex', format='markdown')
footnote_analysis_latex = footnote_analysis_latex.replace('subsection','subsection*')
latex_header = r"""\documentclass{article}
\usepackage{graphicx} % Required for inserting images
\usepackage[para]{footmisc}

\title{Policy insights}
\author{James Littiebrant}
\date{June 2025}

\begin{document}

\maketitle"""
latex_footer = r"""
\end{document}"""
footnote_analysis_latex = latex_header + '\n\n' + footnote_analysis_latex + '\n\n' + latex_footer
with open(project_folder.joinpath("insight_analysis_latex.tex"),"w") as f:
    f.write(footnote_analysis_latex)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# Source Traceback

In [408]:
source_trace_back = policy_analysis

In [409]:
results

Unnamed: 0,id,type,typeId,number,active,topics,date,title,summary,doc_id,filename,source_file,insights,extraction_details,extraction_text,vector,_distance,insight_id
0,R42524,CRS Report,REPORTS,R42524,True,"['Appropriations', 'Health Policy']",2013-06-25,Rural Broadband: The Roles of the Rural Utilit...,Since the initial deployment of broadband in t...,421932,20130625_R42524_6fda8e3e88ced531bc5562afe7ca2a...,R42524.json,{'insight': 'The Rural Utilities Service (RUS)...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *The Rural Utilities Servi...,"[-0.3524781, 1.1827747, -3.1551015, -0.6930941...",0.286355,INST819678
1,R46501,CRS Report,R,R46501,True,"['Economic Policy', 'Internet and Telecommunic...",2020-08-28,Rural Digital Opportunity Fund: Requirements a...,,R46501_1_2020-08-28,2020-08-28_R46501_69aa2b86f4262de2d971c425729a...,R46501.json,{'insight': 'The Rural Digital Opportunity Fun...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *The Rural Digital Opportu...,"[-0.03362766, 1.605579, -3.2129455, -0.4775158...",0.288256,INST90992
2,R46108,CRS Report,REPORTS,R46108,True,"['Economic Policy', 'Science and Technology Po...",2019-12-09,Demand for Broadband in Rural Areas: Implicati...,"As of 2019, over 20 million Americans—predomin...",610595,20191209_R46108_9fe4bb7f03d1a36e4fb10835ca8db9...,R46108.json,{'insight': 'Rural broadband markets are hyper...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *Rural broadband markets a...,"[0.5168763, 1.5408688, -3.3518171, -0.80001146...",0.290129,INST416620
3,R47017,CRS Report,R,R47017,True,"['Agricultural Policy', 'Internet and Telecomm...",2022-12-14,USDA’s ReConnect Program: Expanding Rural Broa...,,R47017_3_2022-12-14,2022-12-14_R47017_a40122c2bc84a3e34f3fa4bbb1fe...,R47017.json,{'insight': 'It is more expensive to build and...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *It is more expensive to b...,"[0.7560909, 1.8678172, -3.4455166, -0.21886699...",0.291445,INST478037
4,R46108,CRS Report,REPORTS,R46108,True,"['Economic Policy', 'Science and Technology Po...",2019-12-09,Demand for Broadband in Rural Areas: Implicati...,"As of 2019, over 20 million Americans—predomin...",610595,20191209_R46108_9fe4bb7f03d1a36e4fb10835ca8db9...,R46108.json,{'insight': 'Federal spending on broadband exp...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *Federal spending on broad...,"[0.40469792, 2.04957, -3.3293974, -0.6780929, ...",0.295334,INST322913
5,R46108,CRS Report,REPORTS,R46108,True,"['Economic Policy', 'Science and Technology Po...",2019-12-09,Demand for Broadband in Rural Areas: Implicati...,"As of 2019, over 20 million Americans—predomin...",610595,20191209_R46108_9fe4bb7f03d1a36e4fb10835ca8db9...,R46108.json,{'insight': 'Rural households' broadband adopt...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *Rural households' broadba...,"[0.16907349, 1.8395344, -3.3484895, -0.3824893...",0.300136,INST206607
6,R47017,CRS Report,R,R47017,True,"['Agricultural Policy', 'Internet and Telecomm...",2022-12-14,USDA’s ReConnect Program: Expanding Rural Broa...,,R47017_3_2022-12-14,2022-12-14_R47017_a40122c2bc84a3e34f3fa4bbb1fe...,R47017.json,{'insight': 'Policy options for Congress to in...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *Policy options for Congre...,"[0.07626522, 1.8770603, -3.110022, -0.69244874...",0.30141,INST578700
7,R47017,CRS Report,R,R47017,True,"['Agricultural Policy', 'Internet and Telecomm...",2022-12-14,USDA’s ReConnect Program: Expanding Rural Broa...,,R47017_3_2022-12-14,2022-12-14_R47017_a40122c2bc84a3e34f3fa4bbb1fe...,R47017.json,{'insight': 'Rural residents have lower broadb...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *Rural residents have lowe...,"[0.37247777, 1.2701703, -3.340323, -0.39932805...",0.304082,INST920073
8,R42524,CRS Report,REPORTS,R42524,True,"['Appropriations', 'Health Policy']",2013-06-25,Rural Broadband: The Roles of the Rural Utilit...,Since the initial deployment of broadband in t...,421932,20130625_R42524_6fda8e3e88ced531bc5562afe7ca2a...,R42524.json,{'insight': 'The Rural Health Care Program pro...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *The Rural Health Care Pro...,"[-0.11230092, 1.1732627, -3.273556, -0.8880532...",0.30567,INST164724
9,R42524,CRS Report,REPORTS,R42524,True,"['Appropriations', 'Health Policy']",2013-06-25,Rural Broadband: The Roles of the Rural Utilit...,Since the initial deployment of broadband in t...,421932,20130625_R42524_6fda8e3e88ced531bc5562afe7ca2a...,R42524.json,"{'insight': 'A ""rural-rural divide"" exists, wi...",{'Actionable Recommendations or Implications':...,"## Insight: \n\n### *A ""rural-rural divide"" ex...","[0.017555246, 1.2374147, -2.9802232, -0.579398...",0.306199,INST792622


In [410]:
source_document_lookup = dict()
for index, row in results.iterrows():
    source_document_lookup[row['insight_id']] = row['id']

for _insight_id, _id in source_document_lookup.items():
    source_trace_back = source_trace_back.replace(_insight_id, _id)

In [411]:
source_citations = list()
for _citation in re.findall(r"(\w+\d+)(?:\]|, )|\[(\w+\d+)(?:\]|, )", source_trace_back):
    if _citation[0]:
        if _citation[0] not in source_citations:
            source_citations.append(_citation[0])
    elif _citation[1]:
        if _citation[1] not in source_citations:
            source_citations.append(_citation[1])
    else:
        continue

for _citation in source_citations:
    if _citation not in results['insight_id'].to_list():
        print(_citation)
source_footnote_lookup = {i:_citation for i,_citation in enumerate(source_citations, start=1)}

R47017
R46108
R42524
R46501


In [412]:
source_footnote_text = list()
for i, _citation in sorted(source_footnote_lookup.items(), key=lambda x: x[0]):
    source_trace_back = re.sub(fr"({source_footnote_lookup[i]})", f'[^{i}]', source_trace_back)
    source_footnote_text.append(f'[^{i}]: {_citation}')
source_footnote_text = '\n'.join(source_footnote_text)
source_trace_back += f"\n\n{source_footnote_text}"
source_trace_back = re.sub(r'(\[+)', r'[', source_trace_back)
source_trace_back = re.sub(r'(\]+)', r']', source_trace_back)
with open(project_folder.joinpath('insight_analysis_w_source_footnotes.md'),'w') as f:
    f.write(source_trace_back)

In [413]:
source_footnote_analysis_latex = pypandoc.convert_text(source_trace_back, to='latex', format='markdown')
source_footnote_analysis_latex = source_footnote_analysis_latex.replace('subsection','subsection*')
latex_header = r"""\documentclass{article}
\usepackage{graphicx} % Required for inserting images
\usepackage[para]{footmisc}

\title{Policy insights}
\author{James Littiebrant}
\date{June 2025}

\begin{document}

\maketitle"""
latex_footer = r"""
\end{document}"""
source_footnote_analysis_latex = latex_header + '\n\n' + source_footnote_analysis_latex + '\n\n' + latex_footer
with open(project_folder.joinpath("insight_analysis_source_latex.tex"),"w") as f:
    f.write(source_footnote_analysis_latex)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# Source Passage Traceback

In [399]:
source_passage_trace_back = policy_analysis

In [400]:
results

Unnamed: 0,id,type,typeId,number,active,topics,date,title,summary,doc_id,filename,source_file,insights,extraction_details,extraction_text,vector,_distance,insight_id
0,R42524,CRS Report,REPORTS,R42524,True,"['Appropriations', 'Health Policy']",2013-06-25,Rural Broadband: The Roles of the Rural Utilit...,Since the initial deployment of broadband in t...,421932,20130625_R42524_6fda8e3e88ced531bc5562afe7ca2a...,R42524.json,{'insight': 'The Rural Utilities Service (RUS)...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *The Rural Utilities Servi...,"[-0.3524781, 1.1827747, -3.1551015, -0.6930941...",0.286355,INST819678
1,R46501,CRS Report,R,R46501,True,"['Economic Policy', 'Internet and Telecommunic...",2020-08-28,Rural Digital Opportunity Fund: Requirements a...,,R46501_1_2020-08-28,2020-08-28_R46501_69aa2b86f4262de2d971c425729a...,R46501.json,{'insight': 'The Rural Digital Opportunity Fun...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *The Rural Digital Opportu...,"[-0.03362766, 1.605579, -3.2129455, -0.4775158...",0.288256,INST90992
2,R46108,CRS Report,REPORTS,R46108,True,"['Economic Policy', 'Science and Technology Po...",2019-12-09,Demand for Broadband in Rural Areas: Implicati...,"As of 2019, over 20 million Americans—predomin...",610595,20191209_R46108_9fe4bb7f03d1a36e4fb10835ca8db9...,R46108.json,{'insight': 'Rural broadband markets are hyper...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *Rural broadband markets a...,"[0.5168763, 1.5408688, -3.3518171, -0.80001146...",0.290129,INST416620
3,R47017,CRS Report,R,R47017,True,"['Agricultural Policy', 'Internet and Telecomm...",2022-12-14,USDA’s ReConnect Program: Expanding Rural Broa...,,R47017_3_2022-12-14,2022-12-14_R47017_a40122c2bc84a3e34f3fa4bbb1fe...,R47017.json,{'insight': 'It is more expensive to build and...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *It is more expensive to b...,"[0.7560909, 1.8678172, -3.4455166, -0.21886699...",0.291445,INST478037
4,R46108,CRS Report,REPORTS,R46108,True,"['Economic Policy', 'Science and Technology Po...",2019-12-09,Demand for Broadband in Rural Areas: Implicati...,"As of 2019, over 20 million Americans—predomin...",610595,20191209_R46108_9fe4bb7f03d1a36e4fb10835ca8db9...,R46108.json,{'insight': 'Federal spending on broadband exp...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *Federal spending on broad...,"[0.40469792, 2.04957, -3.3293974, -0.6780929, ...",0.295334,INST322913
5,R46108,CRS Report,REPORTS,R46108,True,"['Economic Policy', 'Science and Technology Po...",2019-12-09,Demand for Broadband in Rural Areas: Implicati...,"As of 2019, over 20 million Americans—predomin...",610595,20191209_R46108_9fe4bb7f03d1a36e4fb10835ca8db9...,R46108.json,{'insight': 'Rural households' broadband adopt...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *Rural households' broadba...,"[0.16907349, 1.8395344, -3.3484895, -0.3824893...",0.300136,INST206607
6,R47017,CRS Report,R,R47017,True,"['Agricultural Policy', 'Internet and Telecomm...",2022-12-14,USDA’s ReConnect Program: Expanding Rural Broa...,,R47017_3_2022-12-14,2022-12-14_R47017_a40122c2bc84a3e34f3fa4bbb1fe...,R47017.json,{'insight': 'Policy options for Congress to in...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *Policy options for Congre...,"[0.07626522, 1.8770603, -3.110022, -0.69244874...",0.30141,INST578700
7,R47017,CRS Report,R,R47017,True,"['Agricultural Policy', 'Internet and Telecomm...",2022-12-14,USDA’s ReConnect Program: Expanding Rural Broa...,,R47017_3_2022-12-14,2022-12-14_R47017_a40122c2bc84a3e34f3fa4bbb1fe...,R47017.json,{'insight': 'Rural residents have lower broadb...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *Rural residents have lowe...,"[0.37247777, 1.2701703, -3.340323, -0.39932805...",0.304082,INST920073
8,R42524,CRS Report,REPORTS,R42524,True,"['Appropriations', 'Health Policy']",2013-06-25,Rural Broadband: The Roles of the Rural Utilit...,Since the initial deployment of broadband in t...,421932,20130625_R42524_6fda8e3e88ced531bc5562afe7ca2a...,R42524.json,{'insight': 'The Rural Health Care Program pro...,{'Actionable Recommendations or Implications':...,## Insight: \n\n### *The Rural Health Care Pro...,"[-0.11230092, 1.1732627, -3.273556, -0.8880532...",0.30567,INST164724
9,R42524,CRS Report,REPORTS,R42524,True,"['Appropriations', 'Health Policy']",2013-06-25,Rural Broadband: The Roles of the Rural Utilit...,Since the initial deployment of broadband in t...,421932,20130625_R42524_6fda8e3e88ced531bc5562afe7ca2a...,R42524.json,"{'insight': 'A ""rural-rural divide"" exists, wi...",{'Actionable Recommendations or Implications':...,"## Insight: \n\n### *A ""rural-rural divide"" ex...","[0.017555246, 1.2374147, -2.9802232, -0.579398...",0.306199,INST792622


In [401]:
def format_subsection_citations(citation):
    return f"{citation[0]}({citation[1]})({citation[2]})"

source_document_lookup = dict()
for index, row in results.iterrows():
    source_section = re.findall(r'([a-zA-Z\d]+?)__(\d+)___(\d+)', '|'.join(row['insights']['location']))
    source_section = [format_subsection_citations(x) for x in source_section]
    source_section = ', '.join(source_section)
    source_document_lookup[row['insight_id']] = source_section

for _insight_id, _id in source_document_lookup.items():
    source_passage_trace_back = source_passage_trace_back.replace(_insight_id, _id)

In [402]:
source_passage_citations = list()
_citation_track = list()
for _citation in source_document_lookup.values():
    if _citation in _citation_track:
        continue
    _re_citation = _citation.replace('(','\(').replace(')','\)')
    if re.search(_re_citation, source_passage_trace_back):
        _citation_span = re.search(_re_citation, source_passage_trace_back).span(0)[0]
        source_passage_citations.append((_citation_span, _citation))
        _citation_track.append(_citation)
source_passage_citations = sorted(source_passage_citations, key=lambda x: x[0])
source_passage_citations = [x[1] for x in source_passage_citations]
source_passage_footnote_lookup = {i:_citation for i,_citation in enumerate(source_passage_citations, start=1)}

In [403]:
source_passage_footnote_lookup

{1: 'R47017(1)(1)',
 2: 'R46108(1)(1)',
 3: 'R47017(3)(1)',
 4: 'R46108(16)(1)',
 5: 'R46108(9)(1)',
 6: 'R42524(6)(1)',
 7: 'R46108(25)(1)',
 8: 'R42524(14)(1)',
 9: 'R47017(4)(1)',
 10: 'R46108(7)(1)',
 11: 'R42524(26)(1)',
 12: 'R46108(6)(1)',
 13: 'R42524(1)(1)',
 14: 'R46501(1)(1)',
 15: 'R47017(22)(1)',
 16: 'R42524(22)(1)',
 17: 'R46108(8)(1)'}

In [405]:
source_footnote_text = list()
for i, _citation in sorted(source_passage_footnote_lookup.items(), key=lambda x: x[0]):
    _re_citation = _citation.replace('(','\(').replace(')','\)')
    source_passage_trace_back = re.sub(fr"({_re_citation})", f'[^{i}]', source_passage_trace_back)
    source_footnote_text.append(f'[^{i}]: {_citation}')
source_footnote_text = '\n'.join(source_footnote_text)

source_passage_trace_back += f"\n\n{source_footnote_text}"
source_passage_trace_back = re.sub(r'(\[+)', r'[', source_passage_trace_back)
source_passage_trace_back = re.sub(r'(\]+)', r']', source_passage_trace_back)
with open(project_folder.joinpath('insight_analysis_w_source_passage_footnotes.md'),'w') as f:
    f.write(source_passage_trace_back)

In [407]:
source_passage_footnote_analysis_latex = pypandoc.convert_text(source_passage_trace_back, to='latex', format='markdown')
source_passage_footnote_analysis_latex = source_passage_footnote_analysis_latex.replace('subsection','subsection*')
latex_header = r"""\documentclass{article}
\usepackage{graphicx} % Required for inserting images
\usepackage[para]{footmisc}

\title{Policy insights}
\author{James Littiebrant}
\date{June 2025}

\begin{document}

\maketitle"""
latex_footer = r"""
\end{document}"""
source_passage_footnote_analysis_latex = latex_header + '\n\n' + source_passage_footnote_analysis_latex + '\n\n' + latex_footer
with open(project_folder.joinpath("insight_analysis_source_passage_latex.tex"),"w") as f:
    f.write(source_passage_footnote_analysis_latex)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [417]:
__ = pypandoc.convert_text(source_passage_footnote_analysis_latex, to='docx', format='latex', outputfile=project_folder.joinpath('word_doc_test.docx'))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [416]:
pypandoc.convert_text?