In [None]:
from sentence_transformers import SentenceTransformer
import torch
import lancedb
from openai import OpenAI
import re
import pandas as pd
import numpy as np
import json
import time
from fuzzy_json import loads as fuzzy_loads

from google import genai
from google.genai import types
from dotenv import load_dotenv
from tqdm.notebook import tqdm
import os
# Load environment variables from the dotenv file named 'env_var' (path relative to the working directory).
# Presumably this supplies GEMINI_API_KEY, which is read from os.environ further down — confirm.
load_dotenv('env_var')

In [None]:
from pathlib import Path
import sqlite3
from datetime import datetime
from uuid import uuid4
from copy import deepcopy

from text_lookup import get_text_by_chunk_id, get_chunk_text_by_indexes_expansion, get_text_by_indexes_expansion, CitationFormatter, make_on_the_fly_citations,get_text_by_indexes_sections

# --- Output locations for this research project (directories are created if missing) ---
project_folder = Path('project_research')
project_folder.mkdir(parents=True, exist_ok=True)

research_json_folder = project_folder / 'json_data'
research_json_folder.mkdir(parents=True, exist_ok=True)

database_location = project_folder / 'research.sqlite'

# --- SQLite connections: the source document store and this project's own database ---
document_database = Path('../wonky_data/databases/documents.sqlite')
document_conn = sqlite3.connect(document_database)

conn = sqlite3.connect(database_location)
cursor = conn.cursor()

# --- Prompt templates, keyed by name, loaded from the JSON file next to the notebook ---
with open('insight_prompts.json', 'r') as f:
    prompts = json.loads(f.read())

In [None]:
# Open the LanceDB store and the 'sections' table that the vector search below runs against.
index = lancedb.connect('../wonky_data/crs_reports/')
table = index.open_table('sections')

# Choose the embedding device at runtime instead of hard-coding Apple's 'mps' backend,
# so the notebook also runs on CUDA or CPU-only machines. 'mps' is still preferred when
# available, which keeps the behavior unchanged on the original setup.
if torch.backends.mps.is_available():
    encoder_device = 'mps'
elif torch.cuda.is_available():
    encoder_device = 'cuda'
else:
    encoder_device = 'cpu'

encoder = SentenceTransformer('nomic-ai/nomic-embed-text-v1.5', device=encoder_device, trust_remote_code=True)

In [None]:
import json

def _parse_single_policy_to_markdown(policy: dict, index: int) -> str:
    """
    Parses a single policy dictionary and converts it into a Markdown string.

    Args:
        policy: A Python dictionary for a single policy.
        index: The policy number for titling (e.g., 1, 2, 3).

    Returns:
        A string containing the formatted Markdown for a single policy.
    """
    policy_lines = []

    policy_lines.append("---")
    policy_type = policy.get('policyType', 'N/A')
    policy_lines.append(f"## Policy {policy.get('policyName', 'Unnamed Policy')} (`{policy_type}`)")

    # Policy Details
    policy_lines.append("\n### Policy Details")
    policy_lines.append(f"**Primary Objective:** {policy.get('primaryObjective', 'N/A')}")
    policy_lines.append(f"**Mechanism of Action:** {policy.get('mechanismOfAction', 'N/A')}")
    policy_lines.append(f"\n**Summary:**\n{policy.get('policyDetails', 'No details provided.')}")

    # Stakeholders
    stakeholders = policy.get('keyStakeholders', {})
    if stakeholders:
        policy_lines.append("\n#### Key Stakeholders")
        policy_lines.append(f"- **Beneficiaries:** {stakeholders.get('beneficiaries', 'N/A')}")
        policy_lines.append(f"- **Regulated Parties:** {stakeholders.get('regulatedParties', 'N/A')}")
        policy_lines.append(f"- **Implementing Agency:** {stakeholders.get('implementingAgency', 'N/A')}")

    # Arguments
    policy_lines.append("\n### Analysis & Arguments")

    arguments_in_favor = policy.get('argumentsInFavor', [])
    policy_lines.append("\n**Arguments In Favor:**")
    if arguments_in_favor:
        for arg in arguments_in_favor:
            policy_lines.append(f"- {arg}")
    else:
        policy_lines.append("- None mentioned.")

    arguments_against = policy.get('argumentsAgainst', [])
    policy_lines.append("\n**Arguments Against / Challenges:**")
    if arguments_against:
        for arg in arguments_against:
            policy_lines.append(f"- {arg}")
    else:
        policy_lines.append("- None mentioned.")

    policy_lines.append(f"\n**Author's Apparent Stance:** {policy.get('authorsApparentStance', 'N/A')}")

    # Evidence and Sources
    policy_lines.append("\n### Evidence & Sources")
    policy_lines.append("\n> **Specific Evidence:**")
    policy_lines.append(f"> {policy.get('specificEvidence', 'No specific evidence quoted.')}")

    source_locations = policy.get('sourceLocations', [])
    policy_lines.append("\n**Source Locations:**")
    if isinstance(source_locations, list):
        for loc in source_locations:
            policy_lines.append(f"- `{loc}`")
    elif isinstance(source_locations, str):
        policy_lines.append(f"- `{source_locations}`")
    else:
        policy_lines.append("- None mentioned.")

    policy_lines.append("\n")

    return "\n".join(policy_lines)

def parse_crs_json_to_markdown(data: dict) -> str:
    """
    Convert a CRS analysis dictionary into a readable Markdown document.

    The document starts with a report header (when ``reportInfo`` is present and
    non-empty) followed by one policy card per entry in ``policies``.

    Args:
        data: A Python dictionary conforming to the CRS analysis JSON schema.
            ``reportInfo.crsAnalysts`` may be a list of names or a single string.

    Returns:
        A string containing the formatted Markdown document. When no policies
        are present the document ends with a "No policies" heading.
    """
    markdown_lines = []

    # --- Part 1: Report Information ---
    report_info = data.get("reportInfo", {})
    if report_info:
        # Accept a single string as well as a list so a name is never joined
        # character by character; a missing/empty value becomes "N/A".
        analysts = report_info.get('crsAnalysts') or ['N/A']
        if isinstance(analysts, str):
            analysts = [analysts]
        analysts_text = ", ".join(str(name) for name in analysts)

        markdown_lines.append(f"# {report_info.get('reportTitle', 'Untitled Report')}")
        markdown_lines.append("---")
        markdown_lines.append(f"**Report Number:** {report_info.get('reportNumber', 'N/A')}")
        markdown_lines.append(f"**Publication Date:** {report_info.get('publicationDate', 'N/A')}")
        markdown_lines.append(f"**Analysts:** {analysts_text}")
        markdown_lines.append("\n> ### Overall Subject")
        markdown_lines.append(f"> {report_info.get('overallSubject', 'No summary provided.')}\n")

    # --- Part 2: Policies ---
    policies = data.get("policies", [])
    if not policies:
        markdown_lines.append("## No policies were identified in this report.")
        return "\n".join(markdown_lines)

    for i, policy in enumerate(policies, 1):
        markdown_lines.append(_parse_single_policy_to_markdown(policy, i))

    return "\n".join(markdown_lines)

In [None]:
# This cell previously defined format_evidence and format_list twice; the earlier
# pair (old "Key Data/Details" schema, italic list titles) was silently shadowed.
# Only the effective definitions are kept.

def format_evidence(evidence_list):
    """Format a list of evidence dicts as a nested Markdown bullet list.

    Each item is read with the camelCase schema keys ``description``,
    ``details``, ``methodology`` and ``source``; missing keys render as "N/A".

    Args:
        evidence_list: A list of dicts. Anything else (None, empty list,
            non-list) is treated as "no evidence".

    Returns:
        The Markdown text, or "No evidence provided.\\n" when there is nothing
        to format.
    """
    if not evidence_list or not isinstance(evidence_list, list):
        return "No evidence provided.\n"

    lines = []
    for item in evidence_list:
        lines.append(f"  - **{item.get('description', 'N/A')}**\n")
        lines.append(f"    - **Details:** {item.get('details', 'N/A')}\n")
        lines.append(f"    - **Methodology:** {item.get('methodology', 'N/A')}\n")
        lines.append(f"    - **Source:** {item.get('source', 'N/A')}\n")
    return "".join(lines)


def format_list(items, title):
    """Format a simple list of strings as a Markdown list under a level-4 heading.

    Args:
        items: A list of values to bullet. None, an empty list or a non-list
            renders a single "- N/A" bullet.
        title: Heading text placed after "#### ".

    Returns:
        The heading line followed by one bullet line per item.
    """
    heading = f"#### {title}\n"
    if not items or not isinstance(items, list):
        return heading + "- N/A\n"
    return heading + "".join(f"- {item}\n" for item in items)


def generate_markdown(data):
    """Render a list of insight dictionaries as Markdown "insight cards".

    Every field is interpolated rather than concatenated, so non-string values
    (None, or NaN from ``DataFrame.to_dict(orient='records')``) are printed
    instead of raising ``TypeError``. The nested ``strength`` and
    ``implications`` sections are treated as empty when they are not dicts.

    Args:
        data: A list of insight dicts (keys such as ``statement``,
            ``sourceTitle``, ``author``, ``location``, ``explanation``,
            ``evidenceFor``, ``reasoningFor``, ``evidenceAgainst``,
            ``reasoningAgainst``, ``position``, ``strength``, ``implications``,
            ``questions``).

    Returns:
        The Markdown for all insights, separated by horizontal rules, or an
        error string when ``data`` is not a list.
    """
    if not isinstance(data, list):
        return "Error: JSON data must be a list of insight objects."

    def _section(value):
        # Nested sections should be dicts; None/NaN/other types count as empty.
        return value if isinstance(value, dict) else {}

    def _locations_text(value):
        # A single string is shown as-is (not joined character by character);
        # any other iterable is comma-joined; non-iterables are stringified.
        if isinstance(value, str):
            return value
        try:
            return ', '.join(str(loc) for loc in value)
        except TypeError:
            return str(value)

    cards = []
    for insight in data:
        strength = _section(insight.get('strength', {}))
        implications = _section(insight.get('implications', {}))

        parts = [
            # --- Main title for the insight ---
            f"## Insight: \n\n### *{insight.get('statement', 'No Title Provided')}*\n\n",
            # --- Source information ---
            "## 1. Source Information\n",
            f"- **Title:** {insight.get('sourceTitle', 'N/A')}\n",
            f"- **Author:** {insight.get('author', 'N/A')}\n",
            f"- **Location:** {_locations_text(insight.get('location', ['N/A']))}\n\n",
            # --- Comprehensive explanation ---
            "## 2. Comprehensive Explanation\n",
            f"{insight.get('explanation', 'N/A')}\n\n",
            # --- Evidence and reasoning ---
            "## 3. Evidence & Reasoning\n\n",
            "### Evidence For\n",
            format_evidence(insight.get('evidenceFor')),
            "\n",
            "### Reasoning For\n",
            f"{insight.get('reasoningFor', 'N/A')}\n\n",
            "### Evidence Against\n",
            f"{insight.get('evidenceAgainst', 'N/A')}\n\n",
            "### Reasoning Against\n",
            f"{insight.get('reasoningAgainst', 'N/A')}\n\n",
            "### Author's Position\n",
            f"{insight.get('position', 'N/A')}\n\n",
            # --- Strength of insight ---
            "## 4. Strength of Insight\n",
            f"- **Assessment:** {strength.get('assessment', 'N/A')}\n",
            f"- **Confidence:** {strength.get('confidence', 'N/A')}\n",
            f"- **Plausibility:** {strength.get('plausibility', 'N/A')}\n\n",
            # --- Actionable implications ---
            "## 5. Actionable Implications\n",
            f"- **If True:** {implications.get('ifTrue', 'N/A')}\n",
            f"- **Potential Use:** {implications.get('use', 'N/A')}\n",
            f"- **If False:** {implications.get('ifFalse', 'N/A')}\n\n",
            # --- Unanswered questions (the 'indexing' section is intentionally not rendered) ---
            "## 6. Unanswered Questions\n",
            format_list(insight.get('questions', []), ""),
        ]
        cards.append("".join(parts))

    # A horizontal rule goes between cards, never after the last one.
    return "\n---\n\n".join(cards)


In [None]:
# OpenAI-compatible client pointed at a server on localhost port 1234.
client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")


def call_llm(query, temperature=0.35, seed=42, model="gemma-3-12b-it-qat"):
    """Send `query` as a single user message via the module-level `client`.

    Args:
        query: Prompt text, sent as the only message with role "user".
        temperature: Sampling temperature forwarded to the API.
        seed: Seed forwarded to the API.
        model: Model identifier forwarded to the API.

    Returns:
        The message content of the first choice in the completion.
    """
    user_turn = {"role": "user", "content": query}
    response = client.chat.completions.create(
        model=model,
        messages=[user_turn],
        temperature=temperature,
        seed=seed,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

# Gemini model used by call_llm_flash, and a running log of token usage per call.
model = "gemini-2.0-flash"
total_tokens = list()

def call_llm_flash(query, temperature=0.1, seed=42, max_tokens=8193):
    """Send `query` to the Gemini model named by the module-level `model` and return the reply text.

    The request is attempted up to three times, waiting 15 seconds between
    attempts. Token counts of the successful call are appended to the
    module-level `total_tokens` list together with a timestamp.

    Args:
        query: Prompt text, sent as the only content item.
        temperature: Sampling temperature for the generation config.
        seed: Seed for the generation config.
        max_tokens: Value passed as `max_output_tokens`.

    Returns:
        The `text` attribute of the response.

    Raises:
        KeyError: If the GEMINI_API_KEY environment variable is not set.
        RuntimeError: If every attempt failed; the last underlying exception
            is attached as the cause.
    """
    client = genai.Client(api_key=os.environ['GEMINI_API_KEY'])
    retries = 3
    time_delay = 15

    response = None
    last_error = None
    for attempt in range(1, retries + 1):
        try:
            response = client.models.generate_content(
                model=model,
                contents=[query],
                config=types.GenerateContentConfig(
                    max_output_tokens=max_tokens,
                    temperature=temperature,
                    seed=seed
                )
            )
            break
        except Exception as e:
            last_error = e
            print(e)
            print(f"Retries left: {retries - attempt}")
            # No point sleeping after the final attempt.
            if attempt < retries:
                time.sleep(time_delay)

    if response is None:
        # Previously this fell through and failed with an unrelated
        # UnboundLocalError on `response`; surface the real cause instead.
        raise RuntimeError(f"Gemini request failed after {retries} attempts") from last_error

    total_tokens.append({'prompt_tokens':response.usage_metadata.prompt_token_count,
                         'completion_tokens':response.usage_metadata.candidates_token_count,
                         'total_tokens':response.usage_metadata.total_token_count,
                         'timestamp':datetime.now().strftime("%Y_%m_%d_%H_%M_%S")})

    return response.text

In [None]:
def format_enriched_report(extracted_article, full_document_df, report_sections, overview_type):
    """Combine a generated overview with the metadata of its source document.

    Args:
        extracted_article: Overview text; pages are delimited by '---'.
        full_document_df: DataFrame whose first row supplies the base metadata.
        report_sections: Iterable of dicts, each with a 'citation' key.
        overview_type: Label stored under 'overview_type'.

    Returns:
        The first row of `full_document_df` as a dict, extended with the
        overview text, its type, its pages, the sorted citations that occur in
        the text, and the section dicts those citations belong to.
    """
    pages = extracted_article.split('---')
    enriched = full_document_df.iloc[0].to_dict()

    # Keep only the sections whose citation string occurs in the overview text.
    cited_sections = [section for section in report_sections
                      if section['citation'] in extracted_article]

    enriched.update(
        overview=extracted_article,
        overview_type=overview_type,
        overview_pages=pages,
        overview_citations=sorted(section['citation'] for section in cited_sections),
        overview_cite_sources=cited_sections,
    )
    return enriched

In [None]:
def xml_passage_text(_article, _article_meta):
    """Build the prompt-ready text of an article: a title/report-id header plus formatted passages.

    Args:
        _article: Table-like object with 'content' and 'citation' columns
            (each must support ``.to_list()``).
        _article_meta: Mapping providing 'title' and 'id' for the header.

    Returns:
        The header followed by the output of
        ``CitationFormatter.formatter_xml_tag_article`` for the passages.
    """
    formatter = CitationFormatter()
    contents = _article['content'].to_list()
    citations = _article['citation'].to_list()
    passage_text = formatter.formatter_xml_tag_article(contents, citations)
    # The indentation inside the template below is part of the emitted text.
    passage_text = f"""# Title: {_article_meta['title']}
    # Report ID: {_article_meta['id']}

    {passage_text}"""
    return passage_text

In [None]:
# Free-text research question that seeds the retrieval step.
query = "Agricultural subsidies on rural economics."
# Embed the question with the SentenceTransformer encoder created earlier in the notebook.
# NOTE(review): the nomic-embed-text-v1.5 model card describes task prefixes (e.g. "search_query: ");
# confirm how the 'sections' table was embedded before deciding whether one is needed here.
query_vec = encoder.encode(query)
# Vector search over the LanceDB table, keeping the top 5 hits as a DataFrame.
search_results = table.search(query_vec).limit(5).to_pandas()
# Title/id pairs of the hits as a list of dicts.
article_title_list = search_results[['title','id']].to_dict(orient='records')

In [None]:
class PolicyDocument:
    """Mutable container for everything gathered about one search hit.

    Instances start empty and are filled in step by step by later cells.

    Args:
        search_result: The search hit this document was built from.
        search_position: Position/label of the hit within the search results.
    """

    def __init__(self, search_result, search_position):
        self.search_position = search_position
        self.search_result = search_result
        # Not assigned by any of the visible cells.
        self.chunks = list()
        # Replaced by the passages returned from the section lookup.
        self.passages = list()
        # Not assigned by any of the visible cells.
        self.chunk_text = list()
        # Replaced by the formatted passage text (a string).
        self.passage_text = list()
        # First a list of {'insight', 'location'} dicts, later a DataFrame.
        self.insights = list()
        # Replaced by a DataFrame of extracted policies.
        self.policies = list()
        # Citation -> chunk id lookup; a dict from the start because it is
        # queried with .get() when mapping citations to chunks.
        self.citation_mapping = dict()
        # Prompts and intermediate artefacts keyed by name.
        self.variables = dict()

In [None]:
## Get the text with the paragraph level citations
policy_documents = list()
# The loop variable is `search_position`, not `index`: `index` is the name of the
# LanceDB connection opened earlier, and reusing it here would overwrite that connection.
for search_position, row in search_results.iterrows():
    _passages = get_text_by_indexes_sections(row['id'],
                                             start_index=row['start_index'],
                                             end_index=row['end_index'],
                                             conn=document_conn)
    _passage_text = xml_passage_text(_passages, row)

    policy_document = PolicyDocument(row, search_position)
    policy_document.passages = _passages
    policy_document.passage_text = _passage_text
    policy_document.variables['insight_extraction_prompt'] = prompts['insight_extraction'].format(document=_passage_text)

    # Lookup from paragraph-level citation to the chunk id it belongs to.
    citation_mapping = dict(zip(_passages['citation'].values, _passages['chunk_id'].values))
    policy_document.citation_mapping = citation_mapping

    policy_documents.append(policy_document)

#### Identify the locations of the different citations

In [None]:
# Ask the model for insights per document, then pull out each insight statement
# together with the bracketed citations listed under "Location in Document".
for policy_document in tqdm(policy_documents):
    insights = call_llm_flash(policy_document.variables['insight_extraction_prompt'], temperature=0.1)
    insights_citations = re.findall(
        r'\*\*\d+\. Insight:\*\* (.*?)\n\* \*\*Location in Document:\*\*(.+?)(?:\n\n|$)',
        insights,
        flags=re.MULTILINE|re.DOTALL,
    )
    # Each match is a (statement, location block) pair; citations are the [..] spans in the block.
    insight_mapping = [
        {'insight': statement, 'location': re.findall(r'\[(.*?)\]', location_block)}
        for statement, location_block in insights_citations
    ]
    policy_document.insights = insight_mapping

In [None]:
# Number of insights sent to the model per request.
batch_size = 4

for policy_document in policy_documents:
    _insights = policy_document.insights

    # Step through the insights in strides of `batch_size`. Unlike the previous
    # `len // batch_size + 1` loop this never yields a trailing empty batch when
    # the count is an exact multiple of the batch size, and yields no batch at
    # all (hence no LLM call) for a document without insights.
    insight_batches = [
        '\n'.join('* ' + x['insight'] for x in _insights[start:start + batch_size])
        for start in range(0, len(_insights), batch_size)
    ]

    article_insights = list()
    for _insight_batch in tqdm(insight_batches):
        batch_insight_prompt = prompts['insight_template_instructions_all_insights'].format(insights=_insight_batch,
                                                                           document=policy_document.passage_text)
        # Rough word counts of the filled prompt vs. the bare template.
        print(len(batch_insight_prompt.split(' ')), len(prompts['insight_template_instructions_all_insights'].split(' ')))
        batch_insight_report = call_llm_flash(batch_insight_prompt, temperature=0.1)
        article_insights.append(batch_insight_report)
    policy_document.variables['raw_insight_text'] = article_insights

In [None]:
# Parse the JSON payload out of each raw batch response and collect all insights
# of a document into one DataFrame. Parsing stays best-effort: a batch that
# cannot be parsed is reported and skipped.
for policy_document in policy_documents:
    _insights = list()
    for _batch in policy_document.variables['raw_insight_text']:
        fenced_json = re.search(r'```json(.+?)```', _batch or '', flags=re.DOTALL)
        if fenced_json is None:
            # Say what actually went wrong instead of surfacing
            # "'NoneType' object has no attribute 'group'".
            print("No ```json fenced block found in batch response; skipping batch.")
            continue
        try:
            parsed = fuzzy_loads(fenced_json.group(1))
            if isinstance(parsed, dict):
                # A single object would otherwise be extended as its keys.
                parsed = [parsed]
            _insights.extend(parsed)
        except Exception as e:
            print(e)
            continue
    policy_document.insights = pd.DataFrame(_insights)

In [None]:
# Inspect the insights of the last document processed (uses the loop variable left over from the cell above).
policy_document.insights

In [None]:
def chunk_mapper(citations, _mapping):
    """Translate citation strings into the distinct chunk ids they map to.

    Surrounding bracket/slash characters (``<[]()>/\\``) are stripped from each
    citation before it is looked up in `_mapping`.

    Args:
        citations: Iterable of citation strings. A single string is treated as
            one citation; None/NaN or another non-iterable yields an empty list.
        _mapping: Dict from citation to chunk id.

    Returns:
        The chunk ids in first-seen order, without duplicates. Citations that
        are not strings or are missing from `_mapping` are dropped (previously
        they produced a ``None`` entry and the order was arbitrary).
    """
    if isinstance(citations, str):
        citations = [citations]
    else:
        try:
            citations = list(citations)
        except TypeError:
            # None, NaN from a missing DataFrame cell, etc.
            return []

    looked_up = (
        _mapping.get(_cite.strip('<[]()>/\\'))
        for _cite in citations
        if isinstance(_cite, str)
    )
    # dict.fromkeys de-duplicates while keeping insertion order.
    return list(dict.fromkeys(chunk for chunk in looked_up if chunk is not None))

# Attach the chunk ids behind each insight's citations, then give every
# (insight, chunk) pair its own row.
for policy_document in tqdm(policy_documents):
    insights_frame = policy_document.insights
    citation_lookup = policy_document.citation_mapping
    insights_frame['mapped_chunks'] = insights_frame['location'].apply(chunk_mapper, args=(citation_lookup,))
    policy_document.insights = insights_frame.explode('mapped_chunks')

## Policy Extraction Prompt

In [None]:
# Prompt template for policy extraction. Literal JSON braces are doubled ({{ }}) because the
# template is filled with str.format(document_text=...) in a later cell.
# NOTE(review): rule 5 asks for output with no markdown formatting, while the cell that parses
# the response searches for a ```json fenced block — confirm which form the model returns.
# NOTE(review): rule 2 mentions title/date/authors, but the reportInfo schema below lists only
# reportNumber and overallSubject.
policy_extraction_prompt = """
Your goal is to identify all distinct policies discussed in the provided text and format the output as a single, clean JSON object according to the rules and schema below.

**Rules for Extraction:**

1.  **Analyze Full Text:** Read the entire report text provided at the end of this prompt.
2.  **Extract Report Info:** Identify the report's general information (title, number, date, authors, overall subject) and populate the `reportInfo` object.
3.  **Identify All Policies:** Scan the report to identify every distinct policy. A policy can be an existing law, a proposed bill, or a "policy option" presented by the author. Create a JSON object for each one in the `policies` list.
4.  **Populate Policy Fields with High Detail:** For each policy, adhere to the following field-specific rules:
    * **`primaryObjective`**: Extract the specific, stated goal of the policy. What problem is it explicitly designed to solve? Look for phrases like "in order to," "the purpose of this is," or "this policy aims to address."
    * **`mechanismOfAction`**: Describe the concrete, functional steps of how the policy works. Do not describe the goal, but the *actions*. For example: "It authorizes $50M in block grants to states," "It imposes a 2% tariff on imported steel," or "It directs the EPA to develop new emissions standards."
    * **`policyDetails`**: Provide a comprehensive, detailed summary of the policy. This field should synthesize the objective, mechanism, history, and any other key details mentioned in the report into a thorough paragraph. This should be the most detailed field for the policy.
    * **`argumentsInFavor`**: Identify every distinct argument *for* the policy mentioned in the text. Each unique argument should be a separate string in this list.
    * **`argumentsAgainst`**: Identify every distinct argument, challenge, or drawback *against* the policy mentioned in the text. Each unique point should be a separate string in this list.
    * **`authorsApparentStance`**: Infer the author's overall leaning on this specific policy based on framing, tone, and the balance of arguments.
    * **`specificEvidence`**: Provide direct quotes or specific data points (e.g., statistics, figures) from the report that serve as the primary evidence for your analysis of this policy.
5.  **Return Clean JSON:** Your final output must be **only the JSON object**, with no introductory text, explanations, or markdown formatting.

**JSON Schema for Output:**

```json
{{
  "reportInfo": {{
    "reportNumber": "string (e.g., R40123)",
    "overallSubject": "string (A 1-2 sentence summary of the report's main issue)"
  }},
  "policies": [
    {{
      "policyName": "string (Official name, bill number, or descriptive name)",
      "policyType": "string (Enum: 'Existing Law / Regulation', 'Proposed Legislation (Bill)', 'Policy Option / Alternative', 'Executive Action', 'Other')",
      "primaryObjective": "string (The specific, stated goal of the policy)",
      "mechanismOfAction": "string (The concrete, functional steps of how the policy works)",
      "policyDetails": "string (A comprehensive, detailed summary of the policy)",
      "keyStakeholders": {{
        "beneficiaries": "string (Who stands to benefit?)",
        "regulatedParties": "string (Who must comply or is most impacted?)",
        "implementingAgency": "string (Which government agency is in charge?)"
      }},
      "argumentsInFavor": ["string (List of distinct arguments for the policy)"],
      "argumentsAgainst": ["string (List of distinct arguments/challenges against the policy)"],
      "authorsApparentStance": "string (Enum: 'Generally Favorable', 'Generally Unfavorable', 'Strictly Neutral / Indiscernible')",
      "sourceLocations": ["string (A specific citation from the report, e.g., R40123_ref123)"],
      "specificEvidence": "string (Direct quotes or specific data points from the report that serve as evidence for the analysis)"
    }}
  ]
}}
```

{document_text}
"""

In [None]:
# For every document, gather the chunks its insights point to and build the
# policy-extraction prompt from them.
for policy_document in policy_documents:
    # Insights whose citations could not be mapped carry None/NaN after the
    # explode step; those are not chunk ids and must not be looked up.
    mapped_chunk_ids = policy_document.insights['mapped_chunks'].dropna().unique()
    chunk_frames = [get_text_by_chunk_id(_chunk_id, document_conn) for _chunk_id in mapped_chunk_ids]
    if not chunk_frames:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(
            f"No mapped chunks for the document at search position {policy_document.search_position}; "
            "cannot build the policy extraction prompt."
        )
    chunk_text = pd.concat(chunk_frames)
    policy_document.variables['insight_chunks'] = chunk_text
    policy_text = CitationFormatter().formatter_xml_tag_article(chunk_text['passage_text'].to_list(),
                                                    chunk_text['chunk_id'].to_list()
                                                    )
    # The indentation inside the template below is part of the emitted text.
    policy_text = f"""# Title: {chunk_text.iloc[0]['title']}
    # Report ID: {chunk_text.iloc[0]['id']}

    {policy_text}"""
    policy_document.variables['formatted_insight_chunks'] = policy_text
    policy_document.variables['policy_extraction_prompt'] = policy_extraction_prompt.format(document_text=policy_text)

In [None]:
extracted_policies = list()
for policy_document in tqdm(policy_documents):
    _extracted_policies = call_llm_flash(policy_document.variables['policy_extraction_prompt'], temperature=0.1, max_tokens=10000)

    # The prompt asks for bare JSON without markdown, so a reply that follows it
    # has no ```json fence. Use the fenced payload when present and otherwise
    # parse the whole reply, instead of failing on `.group` of None.
    fenced_json = re.search(r'```json(.+?)```', _extracted_policies, flags=re.DOTALL)
    raw_json = fenced_json.group(1) if fenced_json else _extracted_policies
    extracted_policies_json = fuzzy_loads(raw_json.strip())

    # reportInfo may be absent from an otherwise usable reply.
    report_info = extracted_policies_json.get('reportInfo') or {}

    # One row per (policy, source location) pair.
    policy_document.policies = pd.DataFrame(extracted_policies_json['policies']).explode('sourceLocations')
    policy_document.policies['report_number'] = report_info.get('reportNumber')
    policy_document.policies['overallSubject'] = report_info.get('overallSubject')

In [None]:
# List the attribute names of the last PolicyDocument processed (loop variable left over from the cell above).
policy_document.__dict__.keys()

In [None]:
# NOTE(review): `insights_related_to_policy` is first assigned inside the policy loop of a later
# cell, so this cell raises NameError on a fresh kernel (Restart & Run All).
insights_related_to_policy

In [None]:
# NOTE(review): `_policies` is first assigned in the loop of the next cell, so this cell raises
# NameError on a fresh kernel (Restart & Run All).
_policies

In [None]:
# For each extracted policy row, assemble three texts: the policy card, the
# cards of the insights that share its source chunk, and the source passages.
for policy_document in tqdm(policy_documents):
    _insights = policy_document.insights
    _policies = policy_document.policies
    policy_insight_sources = list()
    # The row label is unused and deliberately not called `index`: that name
    # holds the LanceDB connection, which the old loop variable overwrote.
    for _row_label, row in _policies.iterrows():
        policy = row.to_dict()
        policy_card = _parse_single_policy_to_markdown(policy, index=0).strip()

        # NOTE(review): assumes the sourceLocations returned by the model have the same
        # type/format as the mapped chunk ids — verify, otherwise nothing matches.
        insights_related_to_policy = _insights[_insights['mapped_chunks'] == row['sourceLocations']]
        insight_card = generate_markdown(insights_related_to_policy.to_dict(orient='records'))

        source_cards = list()
        for chunk_id in insights_related_to_policy['mapped_chunks'].unique():
            _chunk_text = get_text_by_chunk_id(chunk_id, document_conn)
            report_id = _chunk_text.iloc[0]['id']
            # Re-read the sections spanned by the chunk so the passages carry
            # paragraph-level citations.
            source_card_text = get_text_by_indexes_sections(report_id,
                                                        int(_chunk_text['start_index'].min()),
                                                        int(_chunk_text['end_index'].max()),
                                                        document_conn)
            source_card_text = CitationFormatter().formatter_xml_tag_article(source_card_text['content'].to_list(),
                                                    source_card_text['citation'].to_list()
                                                    )
            source_cards.append(source_card_text)
        policy_insight_sources.append({
            'policy_card': policy_card,
            'insight_card': insight_card,
            'source_cards': '\n-----\n'.join(source_cards),
            'policy': policy
        })
    print(len(policy_insight_sources))
    policy_document.variables['policy_insight_sources'] = policy_insight_sources

In [None]:
strategic_analysis_guide_template = """Only respond with the sections identifed in the phases. Do not write tables, do not introduce your response, do not conclude your response.

**1. Source Materials**

This analysis is to be performed using only the following embedded source materials.

### **Provided Policy Card**
{policy_card_text}

### **Provided Insight Cards**
{insight_cards_text}

### **Source Documents**
{source_card_text}

**2. Core Task & Output**

* **Task:** Using the provided Policy Card and Insight Cards, analyze the policy's long-term resilience and effectiveness under conditions of high uncertainty. This involves identifying key external driving forces, developing a set of plausible future scenarios, and formulating robust strategies that can succeed across multiple potential futures.
* **Guiding Principle on Limited Information:** This analysis must be based *only* on the provided source materials. If information is insufficient to complete a section, you must explicitly state the limitation in that section and detail the gap in Phase 5. Do not make assumptions or use outside knowledge. The goal is to assess what is knowable from the provided text.
* **Guiding Principle on Insight Relevance:** Not every provided Insight Card may be relevant to the specific strategic question. Use your judgment to select and apply only the insights that directly inform the identification of drivers, uncertainties, and potential impacts. It is not necessary to use every insight.
* **Output Requirements:** The final report or presentation must contain:
    * A clear definition of the policy from the provided Policy Card and the time horizon for the analysis.
    * An identification of the key drivers of change and critical uncertainties, with full source citation.
    * A set of 2-4 detailed, plausible, and distinct future scenario narratives.
    * An analysis of the policy's performance, risks, and opportunities within each scenario.
    * A set of recommended strategies, distinguishing between robust actions (valuable in all futures) and contingent actions (dependent on a specific future).
    * A list of "signposts" or leading indicators to monitor which future is unfolding.
    * A "Follow-Up & Further Research" section detailing information gaps.

**3. Analytical Workflow & Rules**

#### **Phase 1: Scoping and Identifying Drivers**

* **1.1. Define the Analytical Scope and Time Horizon:**
    * **Source Policy:** Reference the provided Policy Card.
    * **Source Insights:** Reference the provided Insight Cards.
    * **Time Horizon:** Define the time frame for the analysis (e.g., 5, 10, or 20 years). The time horizon should be long enough for significant changes to occur.
* **1.2. Identify Key Drivers of Change:**
    * **Rule:** Brainstorm a comprehensive list of external forces by extracting information directly from the provided source materials.
    * **Guideline for using the Insight Cards:**
        * Review the "Comprehensive Explanation," "Evidence FOR this Insight," and "Author's Reasoning" sections to identify established trends and forces.
        * Review the "Actionable Recommendations" and "Unanswered Questions" sections to identify potential future pressures or developments.
    * **Guideline for using the Policy Card:**
        * Review the "Summary" and "Arguments Against / Challenges" sections to identify external pressures, dependencies, and potential obstacles.
    * **Citation Requirement:** Every key driver, trend, or insight used must be cited in-line. The citation must be the specific alphanumeric location code found in the source materials (e.g., `RL32624_7__1` or `RL32624_111_122`). This ensures full traceability.
* **1.3. Prioritize Critical Uncertainties:**
    * **Rule:** From your list of drivers, select the **top two** most critical uncertainties to form the axes of your scenario framework. A driver is a "critical uncertainty" if it is both highly important and highly uncertain.
    * **Guideline for Assessing Importance:** A driver is **important** if it is mentioned across multiple Insight Cards, or if the "Arguments Against / Challenges" section of the Policy Card or the "Actionable Recommendations" of an Insight Card suggest it has a major impact on the policy's success.
    * **Guideline for Assessing Uncertainty:** A driver is **uncertain** if:
        * It is framed as an "Unanswered Question" in an Insight Card.
        * The "Strength of This Specific Insight" section indicates a low or medium confidence level.
        * The Policy Card describes a key feature as "Unclear" or notes significant "disagreements" among stakeholders.
        * An Insight Card explicitly identifies a dependency on an external event with an unknown outcome (e.g., "depend... on outcomes from ongoing... negotiations").

#### **Phase 2: Scenario Development**

* **2.1. Define Scenario Axes:**
    * **Rule:** For each of the two Critical Uncertainties selected in Phase 1, define two plausible, extreme outcomes. These will form the axes for your scenarios.
    * **Format:** List each uncertainty and its two opposing outcomes.

    **Standard Axis Definition Format:**

    * **Axis 1 - [Name of Critical Uncertainty 1]:**
        * Outcome A: [Description of first extreme outcome]
        * Outcome B: [Description of second extreme outcome]

    * **Axis 2 - [Name of Critical Uncertainty 2]:**
        * Outcome X: [Description of first extreme outcome]
        * Outcome Y: [Description of second extreme outcome]

    * **Guideline:** The four scenarios you develop will be based on the four possible combinations of these outcomes (A+X, B+X, A+Y, B+Y).
* **2.2. Develop Scenario Narratives:**
    * **Requirement:** For each of the four combinations, write a detailed and compelling narrative describing that future world. Give each scenario a memorable name that captures its essence.
    * **Guideline for Grounding Narratives:** To build each narrative, synthesize the "Comprehensive Explanation" and "Evidence" sections from the Insight Cards. Imagine how those facts and descriptions would change or be emphasized in a world defined by that scenario's specific outcomes. This ensures the scenarios are extensions of the provided evidence.

#### **Phase 3: Impact Analysis and Strategy Formulation**

* **3.1. Stress-Test the Policy:**
    * **Rule:** For each scenario narrative, analyze the performance of the policy outlined in the provided Policy Card.
    * **Guideline for Structuring the Stress Test:** For each scenario, explicitly evaluate how the "Arguments In Favor" and "Arguments Against / Challenges" from the Policy Card would be amplified or diminished. Use the "Actionable Recommendations or Implications" from the Insight Cards as a checklist of potential impacts to consider. This directly links the impact analysis to the pre-identified arguments and implications.
* **3.2. Identify Robust and Contingent Strategies:**
    * **Requirement:** Based on the stress test, develop a set of strategic options.
    * **Categorize Strategies:**
        * **Robust Strategies:** High-priority actions that are beneficial across most or all scenarios.
        * **Contingent Strategies:** Actions held in reserve, to be implemented only if evidence shows a specific scenario is emerging.
* **3.3. Define Signposts:**
    * For each scenario, identify a list of 3-5 "signposts" or leading indicators. These are early warning signals that suggest a particular scenario is becoming more likely.

#### **Phase 4: Reporting and Monitoring**

* **4.1. Structure the Strategic Report:**
    * **Executive Summary:** Briefly describe the four scenarios and highlight the most critical strategic recommendations.
    * **Introduction:** Define the source policy, time horizon, and the critical uncertainties used to build the scenarios.
    * **Scenario Narratives:** Present the detailed story for each of the four futures.
    * **Implications:** For each scenario, discuss the results of the stress test.
    * **Strategic Options:** Detail the recommended robust and contingent strategies.
    * **Monitoring Plan:** List the signposts to be tracked.

#### **Phase 5: Follow-Up and Further Research**

* **5.1. Document Information Gaps:**
    * **Rule:** Explicitly list any areas where the analysis was limited due to missing information in the source documents. You must state what could not be completed and why.
    * **Example:** "The economic impact of Scenario 2 could not be fully assessed because the Policy Card lacks specific data on potential funding mechanisms (`RL32624_111_122`)."
* **5.2. Formulate Key Questions for Follow-Up:**
    * **Rule:** Transform each documented information gap into a specific, actionable research question. This formalizes the "Unanswered Questions" identified in the source materials.
    * **Example:** "What are the three most likely funding mechanisms for 'Green Payments', and what is the estimated cost of each?"
* **5.3. Recommend Next Steps:**
    * **Rule:** For each key question, recommend a concrete next step to obtain the missing information.
    * **Example:** "Recommend a targeted review of budget proposals from the last 5 years to identify precedents for funding similar environmental programs."
"""

In [None]:
# Run the strategic-analysis prompt once per entry in each document's
# variables['policy_insight_sources'] and store the collected results back on the
# document under variables['strategy_analysis'].
for policy_document in tqdm(policy_documents):
    strategy_analyses = []
    for _policy in tqdm(policy_document.variables['policy_insight_sources']):
        # Values substituted into the guide template's named placeholders.
        template_fields = {
            'policy_card_text': _policy['policy_card'],
            'insight_cards_text': _policy['insight_card'],
            'source_card_text': _policy['source_cards'],
        }
        strategic_analysis_prompt = strategic_analysis_guide_template.format(**template_fields)
        # Space-delimited piece count of the prompt, printed as a rough size check.
        print(len(strategic_analysis_prompt.split(' ')))
        strategic_analysis = call_llm_flash(strategic_analysis_prompt, temperature=0.2)
        # Work on a deep copy so adding 'analysis' does not touch the source entry.
        strategic_analysis_policy = deepcopy(_policy)
        strategic_analysis_policy['analysis'] = strategic_analysis
        strategy_analyses.append(strategic_analysis_policy)
    policy_document.variables['strategy_analysis'] = strategy_analyses

In [None]:
# NOTE(review): `json_friendly` is not referenced by the cells that follow in this
# section — confirm whether it is still needed.
json_friendly = []
# List the attribute names stored on the most recently bound `policy_document`.
vars(policy_document).keys()

In [None]:
# Build a JSON-ready copy of every policy document: the attributes named below and
# variables['insight_chunks'] are replaced by their `.to_json(orient='records')`
# string (presumably pandas DataFrames — TODO confirm). The deep copy keeps the
# live `policy_document` objects untouched.
frame_fields = ('search_result', 'policies', 'insights', 'passages')
json_policy_documents = []
for policy_document in tqdm(policy_documents):
    policy_document_dict = deepcopy(policy_document.__dict__)
    for frame_field in frame_fields:
        policy_document_dict[frame_field] = policy_document_dict[frame_field].to_json(orient='records')
    document_variables = policy_document_dict['variables']
    document_variables['insight_chunks'] = document_variables['insight_chunks'].to_json(orient='records')
    json_policy_documents.append(policy_document_dict)

In [None]:
# Inspection only: show the keys held in `variables` for the `policy_document`
# left bound by the preceding loop (its last iteration).
policy_document.variables.keys()

In [None]:
# Persist the JSON-ready policy documents under the research JSON folder, naming the
# file after the query with spaces turned into underscores.
# NOTE(review): the file name carries no `.json` suffix — confirm that is intended.
strategic_analysis_path = research_json_folder.joinpath(
    f"strategic_analysis_{query.replace(' ','_')}"
)
with open(strategic_analysis_path, 'w') as f:
    json.dump(json_policy_documents, f)

In [None]:
# Inspection only: keys of the first strategy-analysis record stored on the
# currently bound `policy_document`.
policy_document.variables['strategy_analysis'][0].keys()

In [None]:
# Print the 'analysis' text of the first strategy-analysis record so its line
# breaks are shown as written rather than as an escaped repr.
print(policy_document.variables['strategy_analysis'][0]['analysis'])

In [None]:
# Inspection only: keys of one strategy-analysis record.
# Index into the stored analyses directly rather than reading the `strategic_policy`
# loop variable, which is only bound by the aggregation loop further down the
# notebook and is therefore undefined here on a fresh Restart & Run All.
policy_document.variables['strategy_analysis'][-1].keys()

In [None]:
# Flatten every document's strategy analyses into one list of markdown blocks, each
# made of a short policy header followed by the analysis text.
all_strategies = []
for policy_document in policy_documents:
    for strategic_policy in policy_document.variables['strategy_analysis']:
        policy_fields = strategic_policy['policy']
        strategy_lines = [
            f"**Policy Name:** {policy_fields['policyName']}",
            f"**Policy Type:** {policy_fields['policyType']}",
            f"**Policy Objective:** {policy_fields['primaryObjective']}",
            f"**Policy Details:** {policy_fields['policyDetails']}",
            "",
            "**Policy Strategic Analysis:**",
            f"{strategic_policy['analysis']}",
        ]
        # Strip surrounding whitespace from the assembled block before storing it.
        all_strategies.append('\n'.join(strategy_lines).strip())

In [None]:
# Rough size check: number of space-delimited pieces in the joined strategies text.
len('\n\n----\n\n'.join(all_strategies).split(' '))

In [None]:
# Print every strategy block, separated by a '----' rule, for visual review.
print('\n----\n'.join(all_strategies))

In [None]:
# Write all strategy blocks to a markdown file in the working directory, named after
# the query (spaces replaced by underscores) and separated by '-----' rules.
all_strategies_path = f"test_{query.replace(' ','_')}_all_strategies.md"
with open(all_strategies_path, 'w') as f:
    f.write('\n\n-----\n\n'.join(all_strategies))

In [None]:
# Display the current value of `query` (bound outside this section of the notebook).
query