In [None]:
from sentence_transformers import SentenceTransformer
import torch
import lancedb
from openai import OpenAI
import re
import pandas as pd
import numpy as np
import json
import time

from google import genai
from google.genai import types
from dotenv import load_dotenv
from tqdm.notebook import tqdm
import os
load_dotenv('env_var')

In [None]:
def format_evidence(evidence_list):
    """Format a list of evidence entries as a nested Markdown bullet list.

    Args:
        evidence_list: A list of dicts that may carry the keys
            'Description', 'Key Data/Details', 'Methodology Note' and
            'Source Note'. A missing key is rendered as 'N/A'.

    Returns:
        str: One top-level bullet per entry followed by three indented
        detail bullets. When ``evidence_list`` is empty or is not a list,
        the single line 'No evidence provided.' is returned instead.
    """
    if not evidence_list or not isinstance(evidence_list, list):
        return "No evidence provided.\n"

    lines = []
    for item in evidence_list:
        if not isinstance(item, dict):
            # A non-dict entry (e.g. a bare string) has no fields to look up;
            # render it as a plain bullet instead of raising AttributeError.
            lines.append(f"  - {item}\n")
            continue
        lines.append(f"  - **{item.get('Description', 'N/A')}**\n")
        lines.append(f"    - **Key Data/Details:** {item.get('Key Data/Details', 'N/A')}\n")
        lines.append(f"    - **Methodology Note:** {item.get('Methodology Note', 'N/A')}\n")
        lines.append(f"    - **Source Note:** {item.get('Source Note', 'N/A')}\n")
    return "".join(lines)


def format_list(items, title):
    """Render ``items`` as a Markdown bullet list under an italic level-4 heading.

    Args:
        items: The values to list; each one becomes a ``- value`` line.
        title: Text placed in the ``#### *title*`` heading line.

    Returns:
        str: The heading followed by one bullet per item. When ``items`` is
        empty/falsy or is not a list, a single ``- N/A`` bullet follows the
        heading instead.
    """
    heading = f"#### *{title}*\n"
    if items and isinstance(items, list):
        return heading + "".join(f"- {entry}\n" for entry in items)
    return heading + "- N/A\n"


def _as_dict(value):
    """Return ``value`` when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def generate_markdown(data):
    """
    Generates a Markdown string from a list of insight objects.

    Every field is looked up with ``dict.get`` and falls back to 'N/A' when
    the key is absent. Values are interpolated with f-strings, so a
    non-string value (``None``, a list, ...) is rendered through ``str()``
    rather than raising ``TypeError``. Nested sections that are present but
    are not dicts are treated as empty, so their fields render as 'N/A'.

    Args:
        data (list): A list of dictionaries, where each dictionary
                     represents an insight.

    Returns:
        str: A string containing the formatted Markdown document, with a
             horizontal rule between consecutive insights. If ``data`` is
             not a list, an error message string is returned instead.
    """
    if not isinstance(data, list):
        return "Error: JSON data must be a list of insight objects."

    full_markdown = ""
    for i, insight in enumerate(data):
        # --- Main Title for the Insight ---
        full_markdown += f"## Insight: \n\n### *{insight.get('Statement of the Insight', 'No Title Provided')}*\n\n"

        # --- Source Information ---
        full_markdown += "## Source Information\n"
        full_markdown += f"- **Original Source Title:** {insight.get('Original Source Title', 'N/A')}\n"
        full_markdown += f"- **Author(s) / Organization:** {insight.get('Author(s) / Organization', 'N/A')}\n"
        full_markdown += f"- **Location in Source:** {insight.get('Location in Source', 'N/A')}\n\n"

        # --- Comprehensive Explanation ---
        full_markdown += "## Comprehensive Explanation\n"
        full_markdown += f"{insight.get('Comprehensive Explanation of the Insight', 'N/A')}\n\n"

        # --- Evidence Section ---
        full_markdown += "## Evidence & Reasoning\n\n"
        full_markdown += "### Evidence FOR this Insight\n"
        full_markdown += format_evidence(insight.get('Evidence FOR this Insight'))
        full_markdown += "\n"

        # The keys below contain quote characters, so the values are fetched
        # into locals first and then interpolated; interpolation (rather than
        # `+` concatenation) keeps non-string values from raising TypeError.
        reasoning_for = insight.get('''Author's Reasoning FOR this Insight (The "Why")''', 'N/A')
        full_markdown += """### Author's Reasoning FOR this Insight (The "Why")\n"""
        full_markdown += f"{reasoning_for}\n\n"

        full_markdown += """### Evidence AGAINST or Contradicting this Insight\n"""
        full_markdown += f"{insight.get('Evidence AGAINST or Contradicting this Insight', 'N/A')}\n\n"

        reasoning_against = insight.get('Author\'s Reasoning AGAINST this Insight (or for the Nuance)', 'N/A')
        full_markdown += "### Author's Reasoning AGAINST this Insight (or for the Nuance)\n"
        full_markdown += f"{reasoning_against}\n\n"

        full_markdown += "### Author's position on this insight:\n"
        full_markdown += f"{insight.get('Position Taken', 'N/A')}\n\n"

        # --- Strength of Insight ---
        strength = _as_dict(insight.get('Strength of This Specific Insight', {}))
        full_markdown += "## Strength of This Specific Insight\n"
        full_markdown += f"- **Assessment:** {strength.get('Assessment', 'N/A')}\n"
        full_markdown += f"- **Confidence Level:** {strength.get('Confidence Level', 'N/A')}\n\n"
        full_markdown += f"- **Common Sensibility:** {strength.get('Common Sensibility', 'N/A')}\n\n"

        # --- Actionable Recommendations ---
        recommendations = _as_dict(insight.get('Actionable Recommendations or Implications', {}))
        full_markdown += "## Actionable Recommendations or Implications\n"
        full_markdown += f"- **If this insight is true, it implies that we should:** {recommendations.get('If this insight is true, it implies that we should', 'N/A')}\n"
        full_markdown += f"- **This insight could be used in our project to:** {recommendations.get('This insight could be used in our project to', 'N/A')}\n\n"
        full_markdown += f"- **If this insight is not true, it implies that we should:** {recommendations.get('If this insight is not true, it implies that we should', 'N/A')}\n\n"

        # --- Indexing ---
        indexing = _as_dict(insight.get('Indexing for Future Reference', {}))
        full_markdown += "## Indexing for Future Reference\n"
        full_markdown += format_list(indexing.get('General Topics', []), "General Topics")
        full_markdown += format_list(indexing.get('Specific Topics', []), "Specific Topics")
        full_markdown += format_list(indexing.get('General Keywords', []), "General Keywords")
        full_markdown += format_list(indexing.get('Specific Keywords', []), "Specific Keywords")
        full_markdown += "\n"

        # --- Unanswered Questions ---
        full_markdown += format_list(insight.get('Unanswered Questions', []), "Unanswered Questions")

        # --- Separator for next insight ---
        if i < len(data) - 1:
            full_markdown += "\n---\n\n"

    return full_markdown

In [None]:
from pathlib import Path
import sqlite3
from datetime import datetime
from uuid import uuid4
from copy import deepcopy

# On-disk workspace for this notebook: a project folder, a `json_data`
# sub-folder inside it, and a SQLite database file next to that sub-folder.
project_folder = Path('insight_research')
research_json_folder = project_folder / 'json_data'
database_location = project_folder / 'research.sqlite'

# Create both folders up front; folders that already exist are left as they are.
project_folder.mkdir(parents=True, exist_ok=True)
research_json_folder.mkdir(parents=True, exist_ok=True)

# Module-level connection and cursor for the database file.
conn = sqlite3.connect(database_location)
cursor = conn.cursor()

In [None]:
# Open the LanceDB database directory and the `sections_hybrid` table in it.
index = lancedb.connect('../wonky_data/indexes/')
table = index.open_table('sections_hybrid')

# Pick the device for the embedding model: Apple's MPS backend when it is
# available, then CUDA, then CPU, so the cell also runs on machines without MPS.
if torch.backends.mps.is_available():
    encoder_device = 'mps'
elif torch.cuda.is_available():
    encoder_device = 'cuda'
else:
    encoder_device = 'cpu'

encoder = SentenceTransformer('nomic-ai/nomic-embed-text-v1.5', device=encoder_device, trust_remote_code=True)

In [None]:
# Create the FTS index on the `text`, `source_file` and `id` columns of `table`.
# replace=True is passed, so re-running this cell presumably rebuilds the index
# instead of failing on an existing one — TODO confirm against the LanceDB docs.
table.create_fts_index(['text','source_file','id'], replace=True)

In [None]:
import hashlib
def hash_string_to_digits(input_string, num_digits=6):
    """Map a string to a non-negative integer below ``10**num_digits``.

    The string is UTF-8 encoded, hashed with SHA-256, and the digest (read as
    a base-16 integer) is reduced modulo ``10**num_digits``. The result can
    have fewer than ``num_digits`` digits because leading zeros are not kept.

    Args:
        input_string (str): Text to hash.
        num_digits (int): Upper bound on the number of decimal digits
            (default 6).

    Returns:
        int: A value in the range ``[0, 10**num_digits)``.
    """
    digest = hashlib.sha256(input_string.encode('utf-8'))
    modulus = 10 ** num_digits
    return int(digest.hexdigest(), 16) % modulus

In [None]:
def _clean_section_text(text):
    """Apply the section clean-up substitutions, in a fixed order, to one text."""
    text = re.sub('\n+', '\n', text)                    # collapse runs of newlines
    text = re.sub(' +', ' ', text)                      # collapse runs of spaces
    text = re.sub(r'(\[.*?\])', '', text)               # drop bracketed spans
    text = text.replace('\\', '')                       # drop backslashes
    text = re.sub(r'(\[.*?\])', '', text)               # drop bracketed spans again, after backslash removal
    text = re.sub(r'(\n?-{10,})', '', text)             # drop runs of 10+ dashes (and a preceding newline)
    text = re.sub(r'(\n.*?#_Toc.*?\n)', '', text)       # drop lines containing "#_Toc"
    text = re.sub(r'(- \n)', '', text)                  # drop "- " immediately followed by a newline
    text = re.sub(r'\n{2,}', '\n', text)                # collapse newline runs left by the removals
    return text.strip()


def format_documents(documents):
    """Group section records by report and render them as one text block.

    Sections sharing an ``id`` are grouped, ordered by ``section_start``,
    cleaned, and joined with a ``-----`` line. Each report is prefixed with
    a ``**id:** title`` header taken from its first section (after sorting),
    and reports are joined with a ``=======`` line.

    Reports appear in the order their ``id`` is first seen in ``documents``,
    so the same input always yields the same output text.

    Args:
        documents: Iterable of mappings with the keys ``id``, ``title``,
            ``section_start`` and ``text``.

    Returns:
        str: The formatted text; an empty string when ``documents`` is empty.
    """
    # Single pass grouping; a dict keeps first-seen order of the report ids.
    sections_by_report = {}
    for section in documents:
        sections_by_report.setdefault(section['id'], []).append(section)

    formatted_texts = []
    for report_sections in sections_by_report.values():
        report_sections = sorted(report_sections, key=lambda section: section['section_start'])
        report_text = '\n-----\n'.join(
            _clean_section_text(section['text']) for section in report_sections
        )
        report_header = f"""**{report_sections[0]['id']}:** {report_sections[0]['title']}"""
        formatted_texts.append(f"""{report_header}\n{report_text}""".strip())
    return '\n=======\n'.join(formatted_texts)

# Module-level OpenAI-style client pointed at a server on localhost:1234.
client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")


def call_llm(query, temperature=0.35, seed=42, model="gemma-3-12b-it-qat"):
    """Send ``query`` as a single user message and return the reply text.

    Args:
        query: Content of the one user message in the request.
        temperature: Sampling temperature forwarded to the API.
        seed: Seed forwarded to the API.
        model: Model name forwarded to the API.

    Returns:
        The ``message.content`` of the first choice in the completion.
    """
    user_message = {"role": "user", "content": query}
    response = client.chat.completions.create(
        model=model,
        messages=[user_message],
        temperature=temperature,
        seed=seed,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

# Model name used by call_llm_flash, and a running log of token usage per call.
model = "gemini-2.0-flash"
total_tokens = list()


def call_llm_flash(query, temperature=0.1, seed=42, max_tokens=7500):
    """Send ``query`` to the model named by the module-level ``model`` and return the text.

    The request is attempted up to three times with a 15 second pause between
    attempts. On success, the prompt/completion/total token counts from the
    response's ``usage_metadata`` are appended to ``total_tokens`` together
    with a timestamp.

    Args:
        query: Prompt content sent as the only item in ``contents``.
        temperature: Sampling temperature for the generation config.
        seed: Seed for the generation config.
        max_tokens: Value passed as ``max_output_tokens``.

    Returns:
        The ``text`` attribute of the response.

    Raises:
        KeyError: If the ``GEMINI_API_KEY`` environment variable is not set.
        Exception: The error raised by the final attempt when every attempt
            fails (instead of an unrelated UnboundLocalError further down).
    """
    client = genai.Client(api_key=os.environ['GEMINI_API_KEY'])
    retries = 3
    time_delay = 15
    for attempt in range(retries):
        try:
            response = client.models.generate_content(
                model=model,
                contents=[query],
                config=types.GenerateContentConfig(
                    max_output_tokens=max_tokens,
                    temperature=temperature,
                    seed=seed
                )
            )
            break
        except Exception as e:
            # Any failure is treated as retryable; the last one is re-raised
            # so the caller sees the real cause.
            retries_left = retries - attempt - 1
            print(e)
            print(f"Retries left: {retries_left}")
            if retries_left == 0:
                raise
            time.sleep(time_delay)

    total_tokens.append({'prompt_tokens':response.usage_metadata.prompt_token_count,
                         'completion_tokens':response.usage_metadata.candidates_token_count,
                         'total_tokens':response.usage_metadata.total_token_count,
                         'timestamp':datetime.now().strftime("%Y_%m_%d_%H_%M_%S")})

    return response.text

def convert_df_sections_to_list(sections):
    """Flatten ``sections`` and convert every entry with ``convert_all_sections``.

    ``sections`` must provide ``.explode()`` whose result provides
    ``.to_list()`` — presumably a pandas Series of lists; verify against the
    caller.

    Returns:
        Whatever ``convert_all_sections`` returns for the flattened entries.
    """
    flattened = sections.explode().to_list()
    return convert_all_sections(flattened)

def convert_sections_to_dict(section):
    """Parse a ``{key: 'text', key: 'text'}``-style string into single-entry dicts.

    Steps, as implemented:
      1. Every single quote in ``section`` is replaced by a double quote
         (this also affects apostrophes inside the text values).
      2. The result is split on the three-character sequence ``", ``.
      3. ``"}`` is appended to each piece, and the first regex match supplies
         the key (converted with ``int``) and the value (with surrounding
         quotes, braces, spaces and newlines stripped).

    Args:
        section (str): The text to parse. The input appears to be the string
            form of a dict with integer keys and quoted text values — confirm
            against the data source.

    Returns:
        list[dict]: One ``{int_key: text}`` dict per piece, in input order.

    Raises:
        IndexError: If a piece does not match the expected pattern.
        ValueError: If a captured key is not an integer literal.
    """
    entry_pattern = re.compile(r"""^{?(.*?): "(.*?)}$""", flags=re.DOTALL | re.MULTILINE)
    converted_sections = []
    for chunk in section.replace("\'", '"').split('", '):
        # Only the first match of each piece is used.
        key_text, value_text = entry_pattern.findall(chunk + '"}')[0]
        entry = {int(key_text.strip('"} ')): value_text.strip('"} \n')}
        converted_sections.append(entry)
    return converted_sections

def convert_all_sections(sections):
    """Convert each raw section with ``convert_sections_to_dict`` and flatten.

    Args:
        sections: Iterable of raw section values, each passed to
            ``convert_sections_to_dict``.

    Returns:
        list: The converted entries of every section, concatenated in order.
    """
    return [
        entry
        for raw_section in sections
        for entry in convert_sections_to_dict(raw_section)
    ]

# Prompts

In [None]:
# Prompt template asking the model to list the key insights of a document,
# each with the bracketed section citations where it is found.
# The text contains one `{document}` placeholder for the source text; it is
# presumably filled with str.format by a caller not shown in this cell.
insight_extraction_prompt = """**ROLE & GOAL**

You are an expert research assistant. Your goal is to read the provided text and act as an "insight spotter."

Your task is to identify every major claim, argument, or finding within the text. For each one you identify, you must also list the specific citations that the author uses in the immediate section of that claim to support it.

Your output should be a simple, scannable list that a policy analyst can use as a roadmap for their own deeper investigation. The insights should not be overly granular. Only break things down into more elemental insights if warranted by the complexity of the insight. The section citations are only there for reference, they do not indicate any specific semantic structure to the insights.

* **Location in Document:** Be as specific as possible. This is crucial for allowing a senior analyst to quickly find the part of the document you are analyzing. The citation of this is in brackets in this format: [ABCD####__###___###]

! Important: Only use citations that are found in brackets similar to the one I supplied. Do not use references to laws, papers, or other documents. Only use the document's citations when identifying the Location in Source !

---

**CONTEXT: SOURCE DOCUMENT**

{document}

---

**TASK & OUTPUT FORMAT**

Based *only* on the source document provided above, generate a numbered list of the key insights. Under each insight, create a bulleted sub-list titled "Location in Document" that includes all relevant sections to that insight. Different insights can have overlapping sections.

Present your findings in the following strict format:

---
**1. Insight:** [Concise statement of the insight, claim, or argument.]
* **Location in Document:**
    * [First Relevant Section Citation]
    ...

**2. Insight:** [Concise statement of the next insight, claim, or argument.]
* **Location in Document:**
    * [First Relevant Section Citation]
    ...

**(Repeat this structure for every distinct insight you find in the document.)**
"""

In [None]:
# Prompt template for deconstructing ONE insight from a source document.
# It contains two placeholders: `{insight}` (the insight to analyse) and
# `{document}` (the source text, appended at the end).
insight_template_instructions = """
# How to Use the Insight Deconstruction Template: A Detailed Guide

### **Overall Purpose**
This template is your primary tool for breaking down dense research documents (like academic papers or long reports) into small, manageable, and analyzable pieces. A single report can contain dozens of important claims or "insights." Your job is to capture the details about this insight: "{insight}"; so we can evaluate its strengths and weaknesses on its own merits. **Only fill out this template for the desired insight.** If there is not enough information provided for a field, then just reply "N/A".

---

### **Part 1: Source and Context**
*This section is for basic documentation. It is critical for ensuring we can always trace an insight back to its original source.*

* **Original Source Title:** Copy and paste the full title of the article or report.
* **Author(s) / Organization:** List the names of the individual authors or the name of the publishing organization (e.g., "RAND Corporation").
* **Location in Source:** Be as specific as possible. This is crucial for allowing a senior analyst to quickly find the part of the document you are analyzing. The citation of this is in brackets in this format: [ABCD####__###___###]

! Important: Only use citations that are found in brackets similar to the one I supplied. Do not use references to laws, papers, or other documents. Only use the document's citations when identifying the Location in Source !

---

### **Part 2: The Core Insight**
*This section defines the specific unit of analysis for this report. It is the most important section to get right.*

* **Statement of the Insight:** Your goal is to summarize the core claim in a single, powerful sentence. Think of it as a newspaper headline for the insight. It should be a clear, declarative statement.
    * *Example:* "The author claims that state-funded job training programs reduce youth unemployment by an average of 15% within two years."

* **Comprehensive Explanation of the Insight:** This is where you provide the necessary nuance, quotes, and description that doesn't fit in the single-sentence statement. In a paragraph, explain the context around the insight. What makes it important? What is the author's angle? Does it challenge a conventional view? What are the specific quotes from the text that are highly relevant.
    * *Example:* "X found that Y is significant because it directly contradicts previous studies that found no effect. The argument being that the positive result is due to the inclusion of 'soft skills' training alongside technical skills, a component missing from earlier programs that were evaluated..."

---

### **Part 3: Deconstruction of the Argument**
*In this section, you are a neutral reporter. Your job is to objectively extract what the author presented, without yet adding your own critique or analysis. You are simply reporting the facts of their argument.*

* **Evidence FOR this Insight:** This is where you list the building blocks of the author's argument. Be exhaustive and precise.

    * **For each Evidence Point:**
        * **Description:** Briefly label the type of evidence. Examples: "Statistical analysis of survey data," "Direct quote from a stakeholder interview," "Case study results from the City X pilot program," "Finding from a literature review."
        * **Key Data/Details:** Be specific. **Do not** write "the program led to an increase." **Do** write "the program led to a 25% increase in participant wages (from an average of $15/hr to $18.75/hr)." Use direct quotes or the exact numbers presented by the author.
        * **Methodology Note:** Briefly explain how this piece of evidence was created. This is crucial for your later analysis of its quality. Examples: "From a randomized controlled trial with 500 participants," "From an analysis of publicly available census data," "From a series of 25 semi-structured interviews."
        * **Source Note:** This is very important for understanding the foundation of the author's argument. Identify the specific source the author references to support this evidence point. The source text will often contain citations like "(Smith, 2022)" or mention "a report from the RAND Corporation." Note that citation here exactly as the author presents it. Example: "The author cites Smith (2022) for this data point."

* **Author's Reasoning FOR this Insight (The "Why"):** Explain the author's logic. How do they connect the dots between the evidence points you listed and the main insight? This is where you summarize *their* argument.

* **Evidence AGAINST or Contradicting this Insight:** Did the author present any information that complicates or challenges their own insight? A good researcher often will. Note that here. If the author *didn't* mention any counter-evidence but you know it exists or see a clear gap, you can note that here as well.

* **Author's Reasoning AGAINST this Insight (or for the Nuance):** If the author did present counter-evidence, how did they explain it? Did they dismiss it? Did they say it only applies in certain contexts? This reveals a lot about the author's perspective.

* **Position Taken:** Author's position on the insight in by arguing for the insight, or casting it as a positive insight, or presenting the insight from a neutral position/stance, or arguing against or casting the insight in a negative light. Provide this as a classification: Pro, Neutral, Con.

---

### **Part 4: Analyst's Assessment of This Insight**
*Now, you shift from a neutral reporter to a critical analyst. This is where you provide your own judgment on the quality and usefulness of this specific insight.*

* **Strength of This Specific Insight:**
    * **Assessment:** Based on everything you documented in Part 3, make a judgment call. Is the evidence strong and directly supportive? Is the reasoning sound? Or is the evidence weak, anecdotal, or logically flawed?
    * **Confidence Level:** Summarize your assessment with a simple rating: High, Medium, or Low.

* **Actionable Recommendations or Implications:** This is the "so what?" section.
    * **If this insight is true, it implies that we should:** Think about the practical consequence. What action does this information suggest?
    * **This insight could be used in our project to:** How does this piece of information fit into our final report? Is it a main point? A supporting detail? A counter-argument we need to address? Be specific.

---

### **Part 5: Connections & Next Steps**
*This section makes your work discoverable and helps us identify what to do next.*

* **Indexing for Future Reference:** This is crucial for building our knowledge base.
    * **General Topics:** High-level categories. Examples: "Environmental Policy," "Education."
    * **Specific Topics:** More focused sub-categories. Examples: "Carbon Tax Models," "Early Childhood Literacy."
    * **General Keywords:** Broad, searchable terms. Examples: "climate," "schools," "healthcare."
    * **Specific Keywords:** Precise, technical terms from the insight. Examples: "cap-and-trade," "phonemic awareness," "vaccine efficacy."

* **Unanswered Questions:** What does this insight make you wonder about? What new questions does it raise? Identifying these questions is critical for pointing our research in the right direction.

***Source Document:***
{document}
"""

In [None]:
insight_template_instructions_all_insights = """
# How to Use the Insight Deconstruction Template: A Detailed Guide

### **Overall Purpose**
This template is your primary tool for breaking down dense research documents (like academic papers or long reports) into small, manageable, and analyzable pieces. A single report can contain dozens of important claims or "insights." Your job is to capture the details about this list of insights so we can evaluate its strengths and weaknesses on its own merits. **Only fill out this template for the desired insights.**
Do not introduce your response or conclude it. Only provide the filled out insights. If there is no information provided for a field, then just reply "N/A". Don't comment any further.

***Insights:***
{insights}

---

### **Part 1: Source and Context**
*This section is for basic documentation. It is critical for ensuring we can always trace an insight back to its original source.*

* **Original Source Title:** Copy and paste the full title of the article or report.
* **Author(s) / Organization:** List the names of the individual authors or the name of the publishing organization (e.g., "RAND Corporation").
* **Location in Source:** Be as specific as possible. This is crucial for allowing a senior analyst to quickly find the part of the document you are analyzing. The citation of this is in brackets in this format: [ABCD####__###___###]

! Important: Only use citations that are found in brackets similar to the one I supplied. Do not use references to laws, papers, or other documents. Only use the document's citations when identifying the Location in Source !

---

### **Part 2: The Core Insight**
*This section defines the specific unit of analysis for this report. It is the most important section to get right.*

* **Statement of the Insight:** Your goal is to summarize the core claim in a single, powerful sentence. Think of it as a newspaper headline for the insight. It should be a clear, declarative statement.
    * *Example:* "The author claims that state-funded job training programs reduce youth unemployment by an average of 15% within two years."

* **Comprehensive Explanation of the Insight:** This is where you provide the necessary nuance, quotes, and description that doesn't fit in the single-sentence statement. In a paragraph, explain the context around the insight. What makes it important? What is the author's angle? Does it challenge a conventional view? What are the specific quotes from the text that are highly relevant. Do not say that the insight shows, or says, provide the text and evidence that is in the text that *shows* the core insight.
    * *Example:* "X found that Y is significant because it directly contradicts previous studies that found no effect. The argument being that the positive result is due to the inclusion of 'soft skills' training alongside technical skills, a component missing from earlier programs that were evaluated..."

---

### **Part 3: Deconstruction of the Argument**
*In this section, you are a neutral reporter. Your job is to objectively extract what the author presented, without yet adding your own critique or analysis. You are simply reporting the facts of their argument.*

* **Evidence FOR this Insight:** This is where you list the building blocks of the author's argument. Be exhaustive and precise.

    * **For each Evidence Point:**
        * **Description:** Briefly label the type of evidence. Examples: "Statistical analysis of survey data," "Direct quote from a stakeholder interview," "Case study results from the City X pilot program," "Finding from a literature review."
        * **Key Data/Details:** Be specific. **Do not** write "the program led to an increase." **Do** write "the program led to a 25% increase in participant wages (from an average of $15/hr to $18.75/hr)." Use direct quotes or the exact numbers presented by the author. Provide all the key details, don't skimp or try to keep this brief. This is essential to understanding the arguments and the insight.
        * **Methodology Note:** Briefly explain how this piece of evidence was created. This is crucial for your later analysis of its quality. Examples: "From a randomized controlled trial with 500 participants," "From an analysis of publicly available census data," "From a series of 25 semi-structured interviews."
        * **Source Note:** This is very important for understanding the foundation of the author's argument. Identify the specific source the author references to support this evidence point. The source text will often contain citations like "(Smith, 2022)" or mention "a report from the RAND Corporation." Note that citation here exactly as the author presents it. Example: "The author cites Smith (2022) for this data point."

* **Author's Reasoning FOR this Insight (The "Why"):** Explain the author's logic. How do they connect the dots between the evidence points you listed and the main insight? This is where you summarize *their* argument.

* **Evidence AGAINST or Contradicting this Insight:** Did the author present any information that complicates or challenges their own insight? A good researcher often will. Note that here. If the author *didn't* mention any counter-evidence but you know it exists or see a clear gap, you can note that here as well.

* **Author's Reasoning AGAINST this Insight (or for the Nuance):** If the author did present counter-evidence, how did they explain it? Did they dismiss it? Did they say it only applies in certain contexts? This reveals a lot about the author's perspective.

* **Position Taken:** Author's position on the insight in by arguing for the insight, or casting it as a positive insight, or presenting the insight from a neutral position/stance, or arguing against or casting the insight in a negative light. Provide this as a classification: Pro, Neutral, Con.

---

### **Part 4: Analyst's Assessment of This Insight**
*Now, you shift from a neutral reporter to a critical analyst. This is where you provide your own judgment on the quality and usefulness of this specific insight.*

* **Strength of This Specific Insight:**
    * **Assessment:** Based on everything you documented in Part 3, make a judgment call. Is the evidence strong and directly supportive? Is the reasoning sound? Or is the evidence weak, anecdotal, or logically flawed?
    * **Confidence Level:** Summarize your assessment with a simple rating: High, Medium, or Low.
    * **Common Sensibility:** Based on your own knowledge, how likely or plausible the insight is: High, Medium, or Low.

* **Actionable Recommendations or Implications:** This is the "so what?" section.
    * **If this insight is true, it implies that we should:** Think about the practical consequence. What action does this information suggest?
    * **This insight could be used in our project to:** How does this piece of information fit into our final report? Is it a main point? A supporting detail? A counter-argument we need to address? Be specific.
    * **If this insight is not true, it implies that we should:** Think about the practical consequences of basing policy on this insight and it being wrong. What should be verified first or mitigating steps taken?

---

### **Part 5: Connections & Next Steps**
*This section makes your work discoverable and helps us identify what to do next.*

* **Indexing for Future Reference:** This is crucial for building our knowledge base.
    * **General Topics:** High-level categories. Examples: "Environmental Policy," "Education."
    * **Specific Topics:** More focused sub-categories. Examples: "Carbon Tax Models," "Early Childhood Literacy."
    * **General Keywords:** Broad, searchable terms. Examples: "climate," "schools," "healthcare."
    * **Specific Keywords:** Precise, technical terms from the insight. Examples: "cap-and-trade," "phonemic awareness," "vaccine efficacy."

* **Unanswered Questions:** What does this insight make you wonder about? What new questions does it raise? Identifying these questions is critical for pointing our research in the right direction.

*** Output Schema in JSON ***
```json
{{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "Insight Analysis Schema",
  "description": "A schema for structuring an analysis of an insight from a source document. This guides an LLM to break down an insight into its constituent parts, including evidence, reasoning, strength, and implications.",
  "type": "array",
  "items": {{
    "type": "object",
    "properties": {{
      "Original Source Title": {{
        "type": "string",
        "description": "The full title of the source document from which the insight is derived."
      }},
      "Author(s) / Organization": {{
        "type": "string",
        "description": "The primary author(s) or the organization responsible for the source document."
      }},
      "Location in Source": {{
        "type": "array",
        "description": "The section identifiers '[ABCD####__###___###]' indicating where the insight and relevant information/evidence is located within the source document.",
         "items": {{
               "type": "string"
             }}
      }},
      "Statement of the Insight": {{
        "type": "string",
        "description": "A concise, one-sentence summary of the core insight or main point."
      }},
      "Comprehensive Explanation of the Insight": {{
        "type": "string",
        "description": "A detailed paragraph expanding on the 'Statement of the Insight', explaining its meaning, context, and significance."
      }},
      "Evidence FOR this Insight": {{
        "type": "array",
        "description": "A list of evidence points that directly support the stated insight.",
        "items": {{
          "type": "object",
          "properties": {{
            "Description": {{
              "type": "string",
              "description": "A brief label for the piece of evidence (e.g., 'FCC Finding', 'Cost Estimate', 'Academic Research')."
            }},
            "Key Data/Details": {{
              "type": "string",
              "description": "All of the specific data, quotes, and/or factual details that constitute the evidence."
            }},
            "Methodology Note": {{
              "type": "string",
              "description": "A brief explanation of how the evidence was generated or its basis (e.g., 'Annual government report', 'Logical deduction')."
            }},
            "Source Note": {{
              "type": "string",
              "description": "The citation or source for this specific piece of evidence."
            }}
          }},
          "required": [
            "Description",
            "Key Data/Details",
            "Methodology Note",
            "Source Note"
          ]
        }}
      }},
      "Author's Reasoning FOR this Insight (The \"Why\")": {{
        "type": "string",
        "description": "An explanation of the author's logic or rationale for presenting the insight, connecting the evidence to the main point."
      }},
      "Evidence AGAINST or Contradicting this Insight": {{
        "type": "string",
        "description": "A summary of any evidence, data, or arguments presented in the source that might contradict, challenge, or add nuance to the insight. State if none is presented."
      }},
      "Author's Reasoning AGAINST this Insight (or for the Nuance)": {{
        "type": "string",
        "description": "An explanation of the author's reasoning for including contradicting or nuanced points. Should be 'N/A' if no such evidence is presented."
      }}
      "Position Taken": {{
      "type": "string",
      "enum": [
        "Pro",
        "Con",
        "Neutral"
        ],
      "description" : "Whether the document is arguing for / asserting in favor of the insight, neutrally evaluating the insight, or arguing against / asserting against the insight.
      }}
      "Strength of This Specific Insight": {{
        "type": "object",
        "description": "An assessment of the quality and reliability of the insight based on the provided evidence.",
        "properties": {{
          "Assessment": {{
            "type": "string",
            "description": "A qualitative summary of why the evidence is considered strong, weak, or mixed."
          }},
          "Confidence Level": {{
            "type": "string",
            "enum": [
              "High",
              "Medium",
              "Low"
            ],
            "description": "A categorical rating of confidence in the insight."
          }},
          "Common Sensibility": {{
            "type": "string",
            "enum": [
              "High",
              "Medium",
              "Low"
            ],
            "description": "A categorical rating of general likelihood or plausibility in the insight."
          }}
        }},
        "required": [
          "Assessment",
          "Confidence Level",
          "Common Sensibility"
        ]
      }},
      "Actionable Recommendations or Implications": {{
        "type": "object",
        "description": "Practical applications and logical consequences derived from the insight.",
        "properties": {{
          "If this insight is true, it implies that we should": {{
            "type": "string",
            "description": "A statement on the logical next steps or policy recommendations that follow from the insight."
          }},
          "This insight could be used in our project to": {{
            "type": "string",
            "description": "A description of how this insight can be practically applied to a specific project or goal."
          }},
          "If this insight is not true, it implies that we should": {{
            "type": "string",
            "description": "A description of how this insight can be verified, or what might happen if policy based on this insight when this insight is incorrect."
          }}
        }},
        "required": [
          "If this insight is true, it implies that we should",
          "This insight could be used in our project to",
          "If this insight is not true, it implies that we should"
        ]
      }},
      "Indexing for Future Reference": {{
        "type": "object",
        "description": "A set of tags and keywords to aid in searching and categorizing the insight.",
        "properties": {{
          "General Topics": {{
            "type": "array",
            "description": "Broad subject areas the insight falls under.",
            "items": {{
              "type": "string"
            }}
          }},
          "Specific Topics": {{
            "type": "array",
            "description": "More detailed, niche topics related to the insight.",
            "items": {{
              "type": "string"
            }}
          }},
          "General Keywords": {{
            "type": "array",
            "description": "Broad, single-word or short-phrase keywords for searching.",
            "items": {{
              "type": "string"
            }}
          }},
          "Specific Keywords": {{
            "type": "array",
            "description": "Specific, detailed keywords and proper nouns mentioned in the insight.",
            "items": {{
              "type": "string"
            }}
          }}
        }},
        "required": [
          "General Topics",
          "Specific Topics",
          "General Keywords",
          "Specific Keywords"
        ]
      }},
      "Unanswered Questions": {{
        "type": "array",
        "description": "A list of questions that arise from the insight and its evidence, suggesting areas for further research.",
        "items": {{
          "type": "string"
        }}
      }}
    }},
    "required": [
      "Original Source Title",
      "Author(s) / Organization",
      "Location in Source",
      "Statement of the Insight",
      "Comprehensive Explanation of the Insight",
      "Evidence FOR this Insight",
      "Author's Reasoning FOR this Insight (The \"Why\")",
      "Evidence AGAINST or Contradicting this Insight",
      "Author's Reasoning AGAINST this Insight (or for the Nuance)",
      "Strength of This Specific Insight",
      "Actionable Recommendations or Implications",
      "Indexing for Future Reference",
      "Unanswered Questions"
    ]
  }}
}}
```

***Source Document:***
{document}
"""

In [None]:
# Research configuration: the topic and the angle that are interpolated into the query prompt below.
subject_matter = "Rural Broadband in the United States"
focus = "Barriers and opportunities to improve access"
# Number of hits requested per query vector in the search cell (passed to .limit()).
depth_to_search = 50
# Upper bound on how many full documents are fetched for insight extraction.
max_documents = 5
# Identifier derived from the "subject : focus" string; it is embedded in the output JSON file names.
# NOTE(review): hash_string_to_digits is defined elsewhere in the notebook — presumably deterministic; confirm.
research_id = hash_string_to_digits(f"""{subject_matter} : {focus}""")
research_id

# Start Search Process

In [None]:
# Prompt asking the LLM for four diverse paragraphs on the configured topic and focus.
# The response is later split on newlines and every non-empty line is embedded as a search query.
query_conversion_prompt = f"""I am researching a subject matter and a focus of that subject matter. I need you to write four paragraphs about the subject matter and focus. The paragraphs should capture completely unique aspects that are relevant to the subject matter and focus. The goal is to get as diverse information about the subject matter and focus in just the space of four paragraphs.

Subject Matter: {subject_matter}
Focus: {focus}
"""


In [None]:
# Send the query-expansion prompt to the LLM helper (call_llm_flash is defined elsewhere in this notebook).
queries = call_llm_flash(query_conversion_prompt)

In [None]:
# Inspect the raw LLM response before it is split into individual query strings.
print(queries)

In [None]:
# One query per non-empty line of the LLM response (whitespace-trimmed), then embed them all.
query_strings = [line for line in map(str.strip, queries.split('\n')) if line != '']
query_vectors = encoder.encode(query_strings)

# Search Queries

In [None]:
# Run one table search per query vector (depth_to_search hits each) and stack the hits into one frame.
search_results = pd.concat(
    [
        table.search(query_vec).limit(depth_to_search).to_pandas()
        for query_vec in query_vectors
    ]
)

In [None]:
# Order hits by ascending distance, then keep only the first (smallest-distance) row per id.
search_results = (
    search_results
    .sort_values(by=['_distance'])
    .drop_duplicates(subset=['id'])
)

In [None]:
# Display the sorted, de-duplicated search hits.
search_results

In [None]:
def get_full_document(search_results, max_results):
    """Fetch the stored rows for the top-ranked articles in ``search_results``.

    Args:
        search_results: DataFrame of search hits with an ``id`` column. Rows are
            consumed from the top, so it should already be ordered by relevance.
        max_results: Maximum number of articles to fetch. Uses slice semantics,
            so ``None`` means "every row".

    Returns:
        list: One DataFrame per article (at most ``max_results``), each with its
        ``sections`` column passed through ``convert_sections_to_dict``.

    Relies on the notebook-level ``table`` and ``convert_sections_to_dict``.
    """
    full_documents = list()
    # Slice the ids first so the table is queried only for articles that are
    # actually returned, instead of for every search hit.
    for article_id in search_results["id"].iloc[:max_results]:
        # limit(300) caps the number of rows pulled for a single article.
        _document = table.search().where(f"id = '{article_id}'").limit(300).to_pandas()
        _document['sections'] = _document['sections'].apply(convert_sections_to_dict)
        full_documents.append(_document)
    return full_documents

def format_indexed_document(article_df):
    """Render an article's rows as citation-tagged text blocks.

    For every row, the first key/value pair of the first entry in ``sections``
    is used: the value is the text, and the key is embedded in a trailing
    ``[<id>__<key>___1]`` tag. Blocks are joined with a ``-----`` divider line.
    """
    def _tagged_chunk(row):
        first_section = row['sections'][0]
        section_key = list(first_section)[0]
        return f"{first_section[section_key]} [{row['id']}__{section_key}___1]"

    return '\n\n-----\n\n'.join(_tagged_chunk(row) for _, row in article_df.iterrows())

In [None]:
# Fetch the full stored rows for the top max_documents search hits.
full_documents = get_full_document(search_results=search_results, max_results=max_documents)

In [None]:
# Citation-tagged text for each retrieved document, in the same order as full_documents.
formatted_documents = list(map(format_indexed_document, full_documents))

In [None]:
# Sanity check: one formatted text per retrieved document, so both lengths should match.
len(formatted_documents), len(full_documents)

In [None]:
# Preview the second formatted document (raises IndexError if fewer than two documents were retrieved).
formatted_documents[1]

# Get Insights

In [None]:
def identify_insights(article_text, prompt):
    """Ask the LLM to list the insights in a document and parse its answer.

    Args:
        article_text: Document text substituted for ``{document}`` in ``prompt``.
        prompt: Template string containing a ``{document}`` placeholder.

    Returns:
        tuple: ``(raw_response, mapping)`` where ``mapping`` is a list of dicts
        with the insight statement under ``'insight'`` and the bracketed
        citation tags found in its location line under ``'location'``.
    """
    insight_extractions = call_llm_flash(prompt.format(document=article_text), temperature=0.1)

    # Matches "**N. Insight:** <statement>" followed by its "* **Location in Document:**" line.
    insight_pattern = re.compile(
        r'\*\*\d+\. Insight:\*\* (.*?)\n\* \*\*Location in Document:\*\*(.+?)(?:\n\n|$)',
        flags=re.MULTILINE|re.DOTALL,
    )
    insight_mapping = [
        {'insight': statement, 'location': re.findall(r'\[(.*?)\]', location_text)}
        for statement, location_text in insight_pattern.findall(insight_extractions)
    ]
    print(len(insight_mapping))
    return insight_extractions, insight_mapping

In [None]:
def extract_insights(article_text, insight_mapping, prompt, batch_size=6):
    """Request a detailed report for every identified insight, in batches.

    Args:
        article_text: Document text substituted for ``{document}`` in ``prompt``.
        insight_mapping: List of dicts that each carry an ``'insight'`` string.
        prompt: Template string with ``{insights}`` and ``{document}`` placeholders.
        batch_size: Maximum number of insights sent per LLM call; must be >= 1.

    Returns:
        list: Raw LLM responses, one per batch, in order. Empty when
        ``insight_mapping`` is empty (no LLM call is made).

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    ## Prepare the batches of insights
    # Stepping through the mapping by batch_size never produces an empty batch,
    # including when the insight count is an exact multiple of batch_size.
    insight_batches = [
        '\n'.join('* ' + x['insight'] for x in insight_mapping[start:start + batch_size])
        for start in range(0, len(insight_mapping), batch_size)
    ]

    ## Call the llm to extract the insights
    extracted_insights = list()
    for _insight_batch in tqdm(insight_batches):
        batch_insight_prompt = prompt.format(insights=_insight_batch, document=article_text)
        # Word counts of the filled prompt vs. the bare template, as a rough size check.
        print(len(batch_insight_prompt.split(' ')), len(prompt.split(' ')))
        batch_insight_report = call_llm_flash(batch_insight_prompt, temperature=0.1)
        extracted_insights.append(batch_insight_report)
    return extracted_insights

def convert_insights_to_json(extracted_insights):
    """Parse the LLM's JSON reports and render the combined result as Markdown.

    Each report's first and last lines are dropped before parsing (they are
    expected to be the opening and closing code-fence lines). Reports that do
    not parse as JSON, or whose top-level value is neither a list nor an
    object, are reported on stdout and skipped.

    Args:
        extracted_insights: List of raw LLM response strings.

    Returns:
        tuple: ``(markdown, section_json)`` where ``section_json`` is the flat
        list of parsed insight objects and ``markdown`` is
        ``generate_markdown(section_json)``.
    """
    section_json = list()
    for _report in extracted_insights:
        # Drop the first and last lines (expected to be the code fence).
        _sections = '\n'.join(_report.split('\n')[1:-1])
        # Escape every backslash so stray ones cannot break json.loads, then
        # restore the \" sequences, which were already valid JSON escapes.
        _sections = _sections.replace('\\', '\\\\').replace('\\\\"', '\\"')

        try:
            _section_json_conversion = json.loads(_sections)
        except json.JSONDecodeError as e:
            print(e)
            print("error in converting to json")
            # Skip the batch: extending the list with the raw string would add
            # one element per character and break every downstream consumer.
            continue

        # A single insight returned as a bare object is wrapped so that extend()
        # adds the object itself rather than its keys.
        if isinstance(_section_json_conversion, dict):
            _section_json_conversion = [_section_json_conversion]
        if not isinstance(_section_json_conversion, list):
            print("error in converting to json")
            continue
        section_json.extend(_section_json_conversion)

    all_insight_markdown = generate_markdown(section_json)
    return all_insight_markdown, section_json

In [None]:
# Accumulator for per-document results. The extraction loop resumes from len(insight_extractions),
# so re-running this cell discards any progress made so far.
insight_extractions = list()

In [None]:
# Process only the documents that do not yet have an entry in insight_extractions,
# so this cell can be re-run after an interruption without repeating finished work.
for _document in tqdm(formatted_documents[len(insight_extractions):]):
    insight_text, insight_mapping = identify_insights(_document, prompt=insight_extraction_prompt)
    extractions = extract_insights(
        _document,
        insight_mapping,
        prompt=insight_template_instructions_all_insights,
        batch_size=5,
    )
    extraction_insights, extraction_json = convert_insights_to_json(extractions)
    insight_extractions.append(
        {
            'insights': {'text': insight_text, 'mapping': insight_mapping},
            'extractions': {'text': extraction_insights, 'mapping': extraction_json},
        }
    )

In [None]:
# Number of documents processed so far by the extraction loop.
len(insight_extractions)

In [None]:
# Write one JSON file per processed document: metadata, insights, formatted text and research context.
# Only documents that already have extraction results are written, so a partially
# completed extraction run does not raise IndexError here.
for i in range(min(len(full_documents), len(insight_extractions))):
    # Metadata = first row of the document frame without the text/section/vector columns.
    metadata = full_documents[i].drop(['text','sections','section_ids','section_start','section_end',
                                       'section_id','vector'],axis=1).iloc[0].to_dict()
    _insights = insight_extractions[i]
    _formatted_document = formatted_documents[i]
    _data = {'metadata':metadata,
             'insights':_insights,
             'document':_formatted_document,
             'research':{
                 'subject_matter':subject_matter,
                 'focus':focus,
                 'queries':queries
             },
             # Reuse the id computed in the configuration cell so the payload always
             # matches the id embedded in the file name.
             'research_id':research_id
             }
    with open(research_json_folder.joinpath(f"{metadata['id']}_{research_id}.json"), 'w') as f:
        json.dump(_data, f)

In [None]:
# Flatten every (identified insight, extracted detail) pair into one record with document metadata
# and an embedding of the extraction text.
all_insights = list()
# Only documents that already have extraction results are visited, so a partially
# completed extraction run does not raise IndexError here.
for i in range(min(len(full_documents), len(insight_extractions))):
    # Metadata = first row of the document frame without the text/section/vector columns.
    metadata = full_documents[i].drop(['text','sections','section_ids','section_start','section_end',
                                       'section_id','vector'],axis=1).iloc[0].to_dict()
    _insights = insight_extractions[i]
    _formatted_document = formatted_documents[i]
    # Materialise the pairs so tqdm knows the total and can show real progress.
    # NOTE(review): pairing is positional; assumes both mappings list the insights in the same order — confirm.
    _pairs = list(zip(_insights['insights']['mapping'], _insights['extractions']['mapping']))
    for _insight_map, _extraction_map in tqdm(_pairs):
        individual_insight = dict(metadata)
        individual_insight['insights'] = _insight_map
        individual_insight['extraction_details'] = {k:v for k, v in _extraction_map.items() if k != 'text'}
        individual_insight['extraction_text'] = _extraction_map['text']
        individual_insight['vector'] = encoder.encode(individual_insight['extraction_text']).tolist()
        all_insights.append(individual_insight)

In [None]:
# Inspect the fields of the first flattened insight record (raises IndexError if all_insights is empty).
all_insights[0].keys()

In [None]:
# Persist every flattened insight record for this research run in a single JSON file named after research_id.
with open(research_json_folder.joinpath(f"insights_{research_id}.json"), 'w') as f:
    json.dump(all_insights, f)