[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wandb/weave/blob/master/examples/cookbooks/summarization/chain-of-density-arxiv.ipynb)
<!--- @wandbcode{weave-cod-summarization-cookbook} -->

# Arxiv PDF Summarization Bot

## Setup

In [1]:
try:
    from google.colab import userdata
    import os
    os.environ["WANDB_API_KEY"] = userdata.get("WANDB_API_KEY")
    os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")
    os.environ["ANTHROPIC_API_KEY"] = userdata.get("ANTHROPIC_API_KEY")
    !apt-get install poppler-utils
except:
    from dotenv import load_dotenv
    load_dotenv()

In [2]:
import os
import subprocess
import shutil

# Fetch only the cookbook examples from the weave repository: shallow-clone
# the branch into a scratch folder, keep the one subdirectory we need and
# discard the rest.
repo_url = "https://github.com/wandb/weave.git"
target_folder = "weave_cookbooks"
subdirectory = "examples/cookbooks"
branch = "add-summarization-example"

if not os.path.exists(target_folder):
    print(f"Cloning repository: {repo_url}")

    # Clone the entire repository to a temporary folder
    temp_folder = "temp_weave_repo"
    try:
        subprocess.run(
            ["git", "clone", "--depth", "1", "--branch", branch, repo_url, temp_folder],
            check=True,
        )

        # Move the desired subdirectory to the target folder
        shutil.move(os.path.join(temp_folder, subdirectory), target_folder)
    finally:
        # Remove the temporary folder even when the clone or the move fails;
        # a leftover folder would make the next `git clone` attempt fail.
        shutil.rmtree(temp_folder, ignore_errors=True)

    print(f"Successfully cloned {subdirectory} from branch '{branch}' to {target_folder}")
else:
    print(f"Folder '{target_folder}' already exists.")

Cloning repository: https://github.com/wandb/weave.git
Successfully cloned examples/cookbooks from branch 'add-summarization-example' to weave_cookbooks


In [6]:
# Switch into the summarization cookbook folder (later imports such as
# `arxiv_models` are resolved relative to it) and install the packages
# listed in its requirements.txt.
%cd weave_cookbooks/summarization
!pip install -r requirements.txt

/content/weave_cookbooks/summarization


In [8]:
import base64
import json
import os
from datetime import datetime, timezone
from itertools import product

import anthropic
import filetype
import numpy as np
import PyPDF2
import requests
import arxiv
from arxiv_models import ArxivPaper, Author, Link, convert_raw_arxiv_to_pydantic
from dotenv import load_dotenv
from openai import OpenAI
from pdf2image import convert_from_bytes
from PIL import Image

import weave
import io

In [11]:
# Initialise Weave for this notebook under the
# "arxiv-chain-of-density-summarization" project name.
weave.init("arxiv-chain-of-density-summarization")

<weave.weave_client.WeaveClient at 0x791ac28d1420>

In [12]:
# Module-level Anthropic client; the API key is read from the
# ANTHROPIC_API_KEY environment variable.
anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

In [13]:
def flatten_dict(d, parent_key='', sep='_'):
    """Collapse a nested dict into a single-level dict with joined keys.

    Args:
        d: Dictionary to flatten; values that are dicts are descended into.
        parent_key: Prefix prepended to every key (empty for the top level).
        sep: Separator placed between the prefix and each key.

    Returns:
        A new dict mapping joined key paths to leaf values. Nested dicts
        that are empty contribute no entries.
    """
    flat = {}
    for key, value in d.items():
        # Top-level keys are kept as-is; nested keys become "<prefix><sep><key>".
        path = key if not parent_key else f"{parent_key}{sep}{key}"
        if isinstance(value, dict):
            flat.update(flatten_dict(value, path, sep=sep))
        else:
            flat[path] = value
    return flat

## (Optional) Fetch Arxiv Papers

This section demonstrates how to fetch relevant papers from the ArXiv database based on a given research instruction. This step is optional but can be useful if you want to dynamically retrieve papers for summarization instead of using predefined examples.

### Generate ArXiv Query Arguments

We use the `generate_arxiv_query_args` function to create an optimal ArXiv search query and determine the appropriate number of results to fetch. This function leverages Claude to generate a well-crafted query string and suggest a suitable `max_results` value.

```python
instruction = "Answer the following question: What are the latest advancements in audio music information retrieval?"
arxiv_query, max_results = generate_arxiv_query_args(instruction)
print(f"ArXiv query: {arxiv_query}")
print(f"Max results: {max_results}")
```

### Fetch ArXiv Papers

Once we have the query and max_results, we can use the `fetch_arxiv_papers` function to retrieve the relevant papers from ArXiv. This function returns a list of `ArxivPaper` objects, which contain metadata about each paper, including its title, authors, abstract, and PDF URL.

```python
arxiv_papers = fetch_arxiv_papers(arxiv_query, max_results)
```

By running the code cells below, you can dynamically fetch ArXiv papers based on your research interests. This allows for a more flexible and customizable summarization pipeline, enabling you to process and summarize the most recent and relevant research in your field of interest.

In [14]:
@weave.op()
def generate_arxiv_query_args(instruction, model="claude-3-sonnet-20240229"):
    """Ask Claude to turn a research instruction into ArXiv search arguments.

    The model is given a single tool, ``prepare_arxiv_search``, and asked to
    call it; the tool-call arguments are returned as the search parameters.

    Args:
        instruction: Free-text research question or instruction.
        model: Anthropic model identifier used for the request.

    Returns:
        A ``(query, max_results)`` tuple. If the response contains no
        ``prepare_arxiv_search`` tool call, or the call omits a field, the
        instruction text is used as the query and ``max_results`` is 5.
    """
    default_query = f"{instruction}"
    default_max_results = 5

    tools = [{
        "name": "prepare_arxiv_search",
        "description": "Prepare arguments for ArXiv paper search. This tool generates an optimal query string utilizing Boolean operators, field-specific syntax, and precise search terms. It also determines an efficient maximum number of results to fetch, balancing comprehensive coverage with processing efficiency. The output is tailored to the given research instruction, aiming to provide relevant and focused search results.",
        "input_schema": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "The ArXiv search query string. Supports Boolean operators (AND, OR, NOT), field-specific syntax (e.g., 'ti:' for title, 'au:' for author), quotation marks for exact phrases, and wildcards. Can include multiple search terms to refine results based on title, abstract, authors, comments, journal reference, subject category, or report number."
                },
                "max_results": {
                    "type": "integer",
                    "description": "The maximum number of paper results to return from the ArXiv search. Aims to minimize the number of results while ensuring sufficient coverage of the topic. Defaults to 5 if not specified. Increasing this value broadens the search but may increase processing time and resource usage. Aim to be below 10 articles."
                }
            },
            "required": ["query", "max_results"]
        }
    }]

    system_prompt = """You are an expert at generating ArXiv queries. Use the prepare_arxiv_search tool to create an optimal query and determine the appropriate maximum number of results for the given research question. The query should utilize advanced search techniques including Boolean operators, field-specific syntax, and precise terms to ensure comprehensive yet focused results."""

    messages = [
        {
            "role": "user",
            "content": f"Use the prepare_arxiv_search tool to generate an optimal ArXiv query and determine the maximum number of results for the following research instruction: {instruction}"
        }
    ]

    response = anthropic_client.messages.create(
        model=model,
        max_tokens=4096,
        messages=messages,
        system=system_prompt,
        tools=tools
    )

    # Extract the query and max_results from the first matching tool call
    for content in response.content:
        if content.type == 'tool_use' and content.name == 'prepare_arxiv_search':
            args = content.input
            # Never hand None (or an empty query / zero results) to the caller:
            # fall back to the same defaults used when no tool call is made.
            query = args.get('query') or default_query
            max_results = args.get('max_results') or default_max_results
            return query, max_results

    # If no tool use was found, search for the instruction text itself
    return default_query, default_max_results

In [18]:
# Example run: derive ArXiv search arguments from a research instruction and
# show what the model proposed.
instruction = "Answer the following question: What are the latest advancements in Agentic LLMs?"
arxiv_query, max_results = generate_arxiv_query_args(instruction)
print(f"ArXiv query: {arxiv_query}")
print(f"Max results: {max_results}")

🍩 https://wandb.ai/a-sh0ts/arxiv-chain-of-density-summarization/r/call/5aa1df8f-b028-426c-9165-e7896a6f2d23
ArXiv query: (ti:agentic OR ti:constituitional OR ab:agentic OR ab:"language model" OR ab:"large language model") AND (ti:advanc* OR ab:advanc* OR ab:recent OR ab:latest) AND (cat:cs.CL OR cat:cs.AI OR cat:cs.LG)
Max results: 8


In [19]:
@weave.op()
def fetch_arxiv_papers(query, max_results=5):
    """Run an ArXiv search and return the hits as converted paper objects.

    Args:
        query: ArXiv query string.
        max_results: Upper bound on the number of results requested.

    Returns:
        A list with one ``convert_raw_arxiv_to_pydantic`` result per search
        hit, in the order the client yields them (relevance, descending).
    """
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance,
        sort_order=arxiv.SortOrder.Descending,
    )
    client = arxiv.Client()

    # The client yields raw results lazily; convert each one as it arrives.
    return [convert_raw_arxiv_to_pydantic(hit) for hit in client.results(search)]

In [20]:
# Fetch the papers matching the query and result limit produced above.
arxiv_papers = fetch_arxiv_papers(arxiv_query, max_results)

🍩 https://wandb.ai/a-sh0ts/arxiv-chain-of-density-summarization/r/call/6ab168f5-f701-483d-824d-b2efdfc11d67


## Create a sample Arxiv paper object and load its PDF

In this section, we demonstrate how to create a sample `ArxivPaper` object and load its corresponding PDF. This process is crucial for our summarization pipeline, as it provides both the metadata and the actual content of the paper.

### Creating the ArxivPaper object

The `ArxivPaper` class is a custom data structure that encapsulates various attributes of an arXiv paper, including:

- `entry_id`: A unique identifier for the paper
- `updated` and `published`: Timestamps for when the paper was last updated and initially published
- `title`: The title of the paper
- `authors`: A list of `Author` objects representing the paper's authors
- `summary`: An abstract or brief description of the paper's content
- `doi`: The Digital Object Identifier for the paper
- `categories`: The arXiv categories the paper belongs to
- `links`: Various URLs associated with the paper, including its abstract and PDF
- `pdf_url`: A direct link to the paper's PDF

In the code snippet below, we create an `ArxivPaper` object for a paper titled "CRAG -- Comprehensive RAG Benchmark". This paper discusses a new benchmark for Retrieval-Augmented Generation (RAG) systems, which is highly relevant to our summarization task.

### Loading the PDF

After creating the `ArxivPaper` object, we use the `load_pdf` function to fetch and load the actual PDF content. This function:

1. Retrieves the PDF URL from the `ArxivPaper` object
2. Downloads the PDF content using the `requests` library
3. Creates a `BytesIO` object from the downloaded content
4. Uses `PyPDF2.PdfReader` to create a PDF reader object

The `load_pdf` function allows us to work with the actual content of the paper, which is essential for our summarization task.

By using this sample object and loading its PDF, we can proceed with our chain of density summarization process and evaluate its performance on a known, controlled input. This approach helps in debugging, fine-tuning, and showcasing the capabilities of our summarization pipeline.

In [21]:
# Hand-built sample paper ("CRAG -- Comprehensive RAG Benchmark") used as a
# fixed input for the rest of the notebook.
arxiv_paper = ArxivPaper(
    entry_id="http://arxiv.org/abs/2406.04744v1",
    updated=datetime(2024, 6, 7, 8, 43, 7, tzinfo=timezone.utc),
    published=datetime(2024, 6, 7, 8, 43, 7, tzinfo=timezone.utc),
    title="CRAG -- Comprehensive RAG Benchmark",
    authors=[
        Author(full_name="Xiao Yang"),
        Author(full_name="Kai Sun"),
        Author(full_name="Hao Xin"),
        Author(full_name="Yushi Sun"),
        Author(full_name="Nikita Bhalla"),
        Author(full_name="Xiangsen Chen"),
        Author(full_name="Sajal Choudhary"),
        Author(full_name="Rongze Daniel Gui"),
        Author(full_name="Ziran Will Jiang"),
        Author(full_name="Ziyu Jiang"),
        Author(full_name="Lingkun Kong"),
        Author(full_name="Brian Moran"),
        Author(full_name="Jiaqi Wang"),
        Author(full_name="Yifan Ethan Xu"),
        Author(full_name="An Yan"),
        Author(full_name="Chenyu Yang"),
        Author(full_name="Eting Yuan"),
        Author(full_name="Hanwen Zha"),
        Author(full_name="Nan Tang"),
        Author(full_name="Lei Chen"),
        Author(full_name="Nicolas Scheffer"),
        Author(full_name="Yue Liu"),
        Author(full_name="Nirav Shah"),
        Author(full_name="Rakesh Wanga"),
        Author(full_name="Anuj Kumar"),
        Author(full_name="Wen-tau Yih"),
        Author(full_name="Xin Luna Dong")
    ],
    summary="Retrieval-Augmented Generation (RAG) has recently emerged as a promising solution to alleviate Large Language Model (LLM)'s deficiency in lack of knowledge. Existing RAG datasets, however, do not adequately represent the diverse and dynamic nature of real-world Question Answering (QA) tasks. To bridge this gap, we introduce the Comprehensive RAG Benchmark (CRAG), a factual question answering benchmark of 4,409 question-answer pairs and mock APIs to simulate web and Knowledge Graph (KG) search. CRAG is designed to encapsulate a diverse array of questions across five domains and eight question categories, reflecting varied entity popularity from popular to long-tail, and temporal dynamisms ranging from years to seconds. Our evaluation on this benchmark highlights the gap to fully trustworthy QA. Whereas most advanced LLMs achieve <=34% accuracy on CRAG, adding RAG in a straightforward manner improves the accuracy only to 44%. State-of-the-art industry RAG solutions only answer 63% questions without any hallucination. CRAG also reveals much lower accuracy in answering questions regarding facts with higher dynamism, lower popularity, or higher complexity, suggesting future research directions. The CRAG benchmark laid the groundwork for a KDD Cup 2024 challenge, attracting thousands of participants and submissions within the first 50 days of the competition. We commit to maintaining CRAG to serve research communities in advancing RAG solutions and general QA solutions.",
    comment="",
    journal_ref=None,
    doi="10.48550/arXiv.2406.04744",
    primary_category="cs.CL",
    categories=["cs.CL"],
    links=[
        Link(href="https://arxiv.org/abs/2406.04744", title="Abstract", rel="alternate", content_type=None),
        Link(href="https://arxiv.org/pdf/2406.04744", title="pdf", rel="related", content_type=None)
    ],
    # Direct PDF link; this is the field read when the paper's PDF is downloaded.
    pdf_url="https://arxiv.org/pdf/2406.04744"
)

In [22]:
# Display the sample paper's PDF link as a quick sanity check.
arxiv_paper.pdf_url

'https://arxiv.org/pdf/2406.04744'

In [23]:
def load_pdf(arxiv_result):
    """Download a paper's PDF and return a reader over its contents.

    Args:
        arxiv_result: Paper record exposing ``pdf_url`` either by key
            (dict-like) or as an attribute (for example an ``ArxivPaper``).

    Returns:
        A ``PyPDF2.PdfReader`` backed by an in-memory copy of the PDF.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    try:
        pdf_url = arxiv_result["pdf_url"]
    except TypeError:
        # Not subscriptable (e.g. a model object): read the attribute instead.
        pdf_url = arxiv_result.pdf_url

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(pdf_url, timeout=60)
    # Fail here on 4xx/5xx instead of handing an HTML error page to PyPDF2.
    response.raise_for_status()

    pdf_file = io.BytesIO(response.content)
    return PyPDF2.PdfReader(pdf_file)

## Convert Images to Text using Sonnet's vision capabilities

In this section, we leverage Claude 3.5 Sonnet's advanced vision capabilities to convert images from ArXiv PDFs into detailed textual descriptions. This process is crucial for creating a comprehensive text-based representation of the entire paper, including figures and diagrams.

### Key Components:

1. **Vector Graphic Conversion**:
   - The `convert_vector_graphic_page_to_image` function handles vector graphics in PDFs, converting them to PNG images for further processing.
   - This step is essential for capturing complex diagrams and charts that are often present in scientific papers.
   - If direct image extraction is not possible (e.g., for SVGs or other vector graphics), the function converts the entire page to an image.
   - In such cases, the LLM is instructed to focus solely on describing the images on the page, ignoring any text content.

2. **Image Processing**:
   - Two main functions, `process_figure_image` and `process_vector_image_pdf`, utilize Claude 3.5 Sonnet to analyze and describe images.
   - `process_figure_image` focuses on individual figures, providing detailed technical descriptions.
   - `process_vector_image_pdf` handles full PDF pages that may contain multiple vector graphics.

3. **Image Extraction and Description**:
   - The `extract_images` function iterates through PDF pages, extracting both raster images and vector graphics.
   - It calls the appropriate processing function for each image type, generating textual descriptions.

4. **Text Integration**:
   - `replace_images_with_descriptions` combines the extracted text from the PDF with the generated image descriptions.
   - This creates a unified text document that includes both the original text and detailed descriptions of all visual elements.

By converting images to text, we ensure that the chain of density summarization process can incorporate information from all aspects of the paper, including visual data. This comprehensive approach allows for more accurate and informative summaries, especially for papers with significant visual content.

In [24]:
def convert_vector_graphic_page_to_image(pdf_page, scale_factor=0.5):
    """Render a PDF page to a PNG data URL if it contains a form XObject.

    A page whose ``/Resources`` hold an ``/XObject`` with ``/Subtype``
    ``/Form`` is treated as containing a vector graphic. Such a page is
    rendered as a whole, downscaled, and returned base64-encoded.

    Args:
        pdf_page: A PyPDF2 page object.
        scale_factor: Multiplier applied to the rendered width and height.

    Returns:
        A ``data:image/png;base64,...`` string, or ``None`` when the page has
        no form XObject or rendering produced no image.
    """
    def get_object(obj):
        # Resolve indirect references so the dictionaries can be inspected.
        if isinstance(obj, PyPDF2.generic.IndirectObject):
            return obj.get_object()
        return obj

    resources = get_object(pdf_page.get('/Resources', {}))
    xobject = get_object(resources.get('/XObject', {}))
    if not xobject:
        return None

    # Check if there's a figure that's not an image
    has_vector_graphic = any(
        isinstance(candidate, dict) and candidate.get('/Subtype') == '/Form'
        for candidate in map(get_object, xobject.values())
    )
    if not has_vector_graphic:
        return None  # Return None if no conversion was needed

    # The rendering does not depend on which XObject matched, so do it once:
    # write the page into a standalone single-page PDF kept in memory.
    pdf_bytes = io.BytesIO()
    pdf_writer = PyPDF2.PdfWriter()
    pdf_writer.add_page(pdf_page)
    pdf_writer.write(pdf_bytes)

    # Convert PDF to image
    images = convert_from_bytes(pdf_bytes.getvalue(), fmt='png')
    if not images:
        return None

    # Resize the image
    image = images[0]
    new_size = (int(image.width * scale_factor), int(image.height * scale_factor))
    image = image.resize(new_size, Image.LANCZOS)

    img_byte_arr = io.BytesIO()
    image.save(img_byte_arr, format='PNG')
    img_str = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{img_str}"

In [25]:
@weave.op()
def process_figure_image(data_url, model="claude-3-5-sonnet-20240620"):
    """Process image data and return a detailed technical description.

    Args:
        data_url: Image encoded as ``data:<mime>;base64,<payload>``.
        model: Anthropic model identifier used for the request.

    Returns:
        The text of the first content block in the model's reply.

    Raises:
        IndexError: If ``data_url`` contains no comma separating the header
            from the payload.
    """
    url_parts = data_url.split(",")
    img_str = url_parts[1]
    # Declare the MIME type carried by the data URL rather than assuming PNG:
    # images pulled out of a PDF are frequently JPEG, and the declared
    # media_type has to describe the bytes that are actually sent.
    media_type = url_parts[0].split(":", 1)[-1].split(";", 1)[0] or "image/png"

    response = anthropic_client.messages.create(
        model=model,
        max_tokens=4096,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": img_str,
                        },
                    },
                    {
                        "type": "text",
                        "text": """Analyze this image as if it's a figure from a scientific research paper. Provide a detailed technical description addressing the following:

1. Type of figure (e.g., graph, diagram, flowchart, experimental setup)
2. Key components or variables represented
3. Relationships or trends depicted
4. Quantitative information (if present)
5. Methodology or process illustrated (if applicable)
6. Potential implications or conclusions that can be drawn
7. Any limitations or assumptions evident in the figure

Focus on technical accuracy and relevance to scientific research. Avoid general descriptions and concentrate on the specific scientific content presented.""",
                    },
                ],
            }
        ],
    )
    return response.content[0].text

In [26]:
@weave.op()
def process_vector_image_pdf(data_url, model="claude-3-5-sonnet-20240620"):
    """Describe the vector-graphic figures on a page rendered as a PNG.

    Args:
        data_url: Rendered page encoded as ``data:image/png;base64,<payload>``.
        model: Anthropic model identifier used for the request.

    Returns:
        The text of the first content block in the model's reply.
    """
    instructions = """This image is a full page from a scientific paper PDF, converted to PNG format. It may contain one or more vector graphic figures or charts. Your task is to:

1. Identify and focus solely on the vector graphic figures or charts within the page.
2. For each identified figure or chart, provide a detailed technical analysis addressing:

   a. Type of figure (e.g., graph, diagram, flowchart)
   b. Key components or variables represented
   c. Relationships or trends depicted
   d. Quantitative information (if present)
   e. Methodology or process illustrated (if applicable)
   f. Potential implications or conclusions that can be drawn

3. Ignore any text or other elements on the page that are not part of the vector graphic figures.
4. If multiple figures are present, analyze each separately and clearly indicate which figure you are describing.

Focus on providing accurate, technical descriptions of the vector graphic content only."""

    # Only the base64 payload after the data-URL header is sent to the API.
    payload = data_url.split(",")[1]
    image_block = {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": "image/png",
            "data": payload,
        },
    }
    text_block = {"type": "text", "text": instructions}

    reply = anthropic_client.messages.create(
        model=model,
        max_tokens=4096,
        messages=[{"role": "user", "content": [image_block, text_block]}],
    )
    return reply.content[0].text

In [27]:
@weave.op()
def extract_images(paper, model="claude-3-5-sonnet-20240620"):
    """Extract the images of every PDF page and describe them with an LLM.

    Args:
        paper: Paper record accepted by ``load_pdf``.
        model: Anthropic model identifier used for the descriptions.

    Returns:
        One list per PDF page, in page order. Each entry is a dict with the
        image as a data URL under ``"image"`` and its generated text under
        ``"description"`` (empty string when describing the image failed).
    """
    pdf_reader = load_pdf(paper)
    all_images = []

    for page in pdf_reader.pages:
        images = []

        # Raster images embedded in the page.
        for image in page.images:
            img_data = image.data
            kind = filetype.guess(img_data)
            if kind is None:
                print("Cannot guess file type!")
                continue

            img_str = base64.b64encode(img_data).decode("utf-8")
            data_url = f"data:{kind.mime};base64,{img_str}"
            try:
                images.append(
                    {"image": data_url, "description": process_figure_image(data_url, model=model)}
                )
            except Exception as e:
                print(f"Error processing image: {e}")
                images.append({"image": data_url, "description": ""})

        # Vector graphics cannot be pulled out individually, so the whole page
        # is rendered and described instead.
        vector_graphics_image_data_url = convert_vector_graphic_page_to_image(page)
        if vector_graphics_image_data_url:
            # Best-effort, like the raster branch above: one failed request
            # must not discard the descriptions already collected.
            try:
                description = process_vector_image_pdf(vector_graphics_image_data_url, model=model)
            except Exception as e:
                print(f"Error processing vector graphic page: {e}")
                description = ""
            images.append({"image": vector_graphics_image_data_url, "description": description})
        all_images.append(images)

    return all_images

In [28]:
@weave.op()
def replace_images_with_descriptions(paper, images):
    """Build one text document from page text plus image descriptions.

    Args:
        paper: Paper record accepted by ``load_pdf``.
        images: Per-page lists of dicts carrying a ``"description"`` entry,
            indexed by page position.

    Returns:
        The extracted text of every page, each followed by a delimited block
        listing that page's image descriptions when it has any.
    """
    chunks = []
    for page_index, page in enumerate(load_pdf(paper).pages):
        chunks.append(page.extract_text() + "\n\n")

        page_images = images[page_index]
        if not page_images:
            continue

        chunks.append(f"\n\n[Image Descriptions for page {page_index+1}]\n")
        chunks.extend(
            f"\n[Image {position}]: {entry['description']}\n"
            for position, entry in enumerate(page_images, start=1)
        )
        chunks.append("[END OF IMAGE DESCRIPTIONS]\n")

    return "".join(chunks)

## Chain of Density Summarization

The Chain of Density (CoD) summarization technique is a powerful method for creating increasingly dense and informative summaries. In this section, we'll explore how to implement CoD for ArXiv PDF summarization, including specific preprocessing and postprocessing steps to evaluate the model's performance.

Chain of Density is an iterative approach to summarization that progressively refines and condenses information. The process involves several key steps:

1. **Initial Summarization**: Starting with the full document, the `summarize_current_summary` function creates an initial summary focused on a specific instruction.

2. **Iterative Refinement**: The `iterative_density_summarization` function repeatedly calls `summarize_current_summary`, each time taking the previous summary as input. This process:
   - Identifies new, important technical entities or ideas from the original text
   - Incorporates these new elements into the summary
   - Increases overall information density while maintaining focus on the instruction

3. **Final Condensation**: After multiple iterations, the `final_summary` function creates an extremely dense summary, aiming to reduce length by 30-40% while retaining all critical technical content.

The `chain_of_density_summarization` function orchestrates this entire process:

```python
@weave.op()
def chain_of_density_summarization(document, instruction, current_summary="", model="claude-3-5-sonnet-20240620", density_iterations=2):
    current_summary, iteration_summaries = iterative_density_summarization(document, instruction, current_summary, density_iterations, model)
    final_summary_text = final_summary(instruction, current_summary, model)
    print(f"Final Summary:\n{final_summary_text}\n")

    return {
        "final_summary": final_summary_text,
        "accumulated_summary": current_summary,
        "iteration_summaries": iteration_summaries,
    }
```

This function takes the preprocessed document, a specific instruction to focus on, the model to use, and the number of density iterations. It returns a dictionary containing:

- The final, highly condensed summary
- The accumulated summary from all iterations
- Individual summaries from each iteration

By using this approach, Chain of Density creates summaries that are progressively more concise, technically precise, and information-dense, while remaining focused on the specific instruction provided. This makes it particularly well-suited for summarizing complex technical documents like ArXiv papers, where maintaining accuracy and depth of information is crucial.

In [29]:
@weave.op()
def summarize_current_summary(document, instruction, current_summary="", iteration=1, model="claude-3-5-sonnet-20240620"):
    """Run one Chain-of-Density pass and return the denser summary.

    Args:
        document: Full source text to summarize.
        instruction: What the summary must focus on.
        current_summary: Summary from the previous pass (empty on the first).
        iteration: Pass number, included in the prompt.
        model: Anthropic model identifier used for the request.

    Returns:
        The text of the first content block in the model's reply.
    """
    # Cap on the length of the reply; the same figure is quoted to the model
    # inside the prompt.
    max_tokens = 4096

    prompt = f"""
    Document:
    {document}

    Current summary:
    {current_summary}

    Instruction to focus on: {instruction}

    Iteration: {iteration}

    Generate an increasingly concise, entity-dense, and highly technical summary from the provided document that specifically addresses the given instruction using the below approach:

    1. Carefully read the current summary and the instruction.

    2. Identify 1-3 new, important technical entities or ideas from the original text that:
       - Are directly relevant to the instruction
       - Are not yet present in the current summary
       - Add significant, specific information to the summary
       - Are preferably 5 words or fewer
       - May include methodologies, algorithms, metrics, or key findings
       - Ensure to include this in the output before the summary

    3. Write a new summary that:
       - Incorporates the newly identified entities/ideas
       - Retains all crucial information from the current summary
       - Increases overall information density
       - Remains focused on addressing the instruction
       - Utilizes the response window of {max_tokens} tokens

    Guidelines:
    - Prioritize technical accuracy and specificity over general readability
    - Use precise terminology, domain-specific jargon, and include quantitative details where relevant
    - Ensure all information is directly related to the instruction
    - Make every word count: rewrite to improve density and make space for new technical entities
    - Employ fusion, compression, and removal of less informative phrases to increase density
    - Never drop entities or technical details from the current summary that are relevant to the instruction
    - Maintain coherence while maximizing information density

    Your goal is to create a summary that is noticeably denser, more technical, and more informative than the previous one, utilizing the response window of {max_tokens} tokens while staying laser-focused on the instruction. The summary should be suitable for an expert audience in the field."""

    conversation = [{"role": "user", "content": prompt}]
    reply = anthropic_client.messages.create(
        model=model,
        max_tokens=max_tokens,
        messages=conversation,
    )
    first_block = reply.content[0]
    return first_block.text

In [30]:
@weave.op()
def iterative_density_summarization(document, instruction, current_summary, density_iterations, model):
    """Refine a summary through repeated Chain-of-Density passes.

    Each pass feeds the previous pass's output back in as the current
    summary and prints the result.

    Args:
        document: Full source text to summarize.
        instruction: What the summary must focus on.
        current_summary: Starting summary for the first pass.
        density_iterations: Number of passes to run.
        model: Anthropic model identifier used for every pass.

    Returns:
        A ``(last_summary, all_summaries)`` tuple, where ``all_summaries``
        holds the output of each pass in order.
    """
    history = []
    summary = current_summary
    for pass_number in range(1, density_iterations + 1):
        summary = summarize_current_summary(document, instruction, summary, pass_number, model)
        history.append(summary)
        print(f"Iteration {pass_number}:\n{summary}\n")
    return summary, history

In [31]:
@weave.op()
def final_summary(instruction, current_summary, model):
    """Condense an accumulated summary into its final, densest form.

    Args:
        instruction: What the summary must focus on.
        current_summary: Summary produced by the iterative passes.
        model: Anthropic model identifier used for the request.

    Returns:
        The text of the first content block in the model's reply.
    """
    prompt = f"""Given this summary:

{current_summary}

And this instruction to focus on:

{instruction}

Create an extremely dense, final summary that captures all key technical information in the most concise form possible, while specifically addressing the given instruction. Follow these guidelines:

1. Aim to reduce length by 30-40% while retaining all critical technical content relevant to the instruction.
2. Prioritize highly specific methodologies, algorithms, metrics, and findings that directly address the instruction.
3. Preserve precise quantitative data, including statistical significance and error margins where applicable and relevant to the instruction.
4. Maintain the use of domain-specific terminology and technical jargon pertinent to the instruction.
5. Ensure that all key entities and concepts from the original summary that relate to the instruction are represented.
6. Use compact phrasing and remove any remaining non-essential information that doesn't directly contribute to addressing the instruction.
7. If relevant to the instruction, include brief mentions of limitations, assumptions, or conflicting viewpoints.
8. Optimize for information density while maintaining coherence for an expert audience, always keeping the focus on the given instruction.

The final summary should be a highly concentrated, technical distillation of the research that specifically addresses the given instruction, suitable for specialists in the field."""

    reply = anthropic_client.messages.create(
        model=model,
        max_tokens=4096,
        messages=[{"role": "user", "content": prompt}],
    )
    return reply.content[0].text

In [32]:
@weave.op()
def chain_of_density_summarization(document, instruction, current_summary="", model="claude-3-5-sonnet-20240620", density_iterations=2):
    """Run the full Chain-of-Density pipeline over a document.

    Runs the iterative refinement passes, condenses their last output into a
    final summary, and prints that final summary.

    Args:
        document: Full source text to summarize.
        instruction: What the summary must focus on.
        current_summary: Optional starting summary.
        model: Anthropic model identifier used for every request.
        density_iterations: Number of refinement passes before condensing.

    Returns:
        A dict with the keys ``"final_summary"``, ``"accumulated_summary"``
        (output of the last refinement pass) and ``"iteration_summaries"``
        (output of every pass, in order).
    """
    accumulated, per_iteration = iterative_density_summarization(
        document, instruction, current_summary, density_iterations, model
    )
    condensed = final_summary(instruction, accumulated, model)
    print(f"Final Summary:\n{condensed}\n")

    result = {}
    result["final_summary"] = condensed
    result["accumulated_summary"] = accumulated
    result["iteration_summaries"] = per_iteration
    return result

## Create a Weave Model Object to better serialize the model for experimentation

This section defines an `ArxivChainOfDensityPipeline` class that encapsulates our summarization pipeline as a `weave.Model`. Key features:

- Configurable parameters: `model` and `density_iterations`
- `predict` method: Processes an `ArxivPaper` object and instruction through the entire pipeline

The class structure enables easy serialization, parameter adjustment, and reproducibility of experiments. A usage example follows, showing instantiation and prediction.

In [33]:
class ArxivChainOfDensityPipeline(weave.Model):
    """Chain of Density summarization pipeline packaged as a `weave.Model`.

    Attributes are declared as annotated fields so they are tracked together
    with the model:
        model: Identifier of the LLM used for every summarization step.
        density_iterations: Number of densification rounds to run.
    """

    model: str = "claude-3-5-sonnet-20240620"
    density_iterations: int = 3

    def __init__(self, model: str = "claude-3-5-sonnet-20240620", density_iterations: int = 3):
        """Create the pipeline; both arguments may be given positionally or by keyword."""
        # Pass the values to the base constructor instead of assigning them
        # afterwards: the base class then initialises and validates the declared
        # fields in one step, rather than first building the object with defaults
        # and mutating it without validation.
        super().__init__(model=model, density_iterations=density_iterations)

    @weave.op()
    def predict(self, paper: ArxivPaper, instruction: str) -> dict:
        """Summarize `paper` according to `instruction`.

        Images are extracted from the paper and substituted by descriptions
        before the text is summarized.

        Returns:
            The dict produced by `chain_of_density_summarization`.
        """
        extracted_images = extract_images(paper)
        cleaned_text = replace_images_with_descriptions(paper, extracted_images)
        return chain_of_density_summarization(
            cleaned_text,
            instruction,
            model=self.model,
            density_iterations=self.density_iterations,
        )

In [36]:
# Smoke-test the pipeline with its default settings on the paper loaded earlier.
# The call stays the last expression so the notebook displays the returned dict.
arxiv_chain_of_density_pipeline = ArxivChainOfDensityPipeline()
arxiv_chain_of_density_pipeline.predict(
    arxiv_paper,
    "Determine how I would best incorporate these benchmarks for my customer support RAG system. What evaluations would work best specifically for me?",
)

Iteration 1:
New entities/ideas:
1. Score_a metric
2. Auto-eval mechanism
3. Human-eval process

Summary:

To incorporate CRAG benchmarks for a customer support RAG system, focus on implementing the following evaluations:

1. Utilize Score_a metric: Implement the auto-eval mechanism, which calculates Score_a as Accuracy - Hallucination. This penalizes incorrect answers while rewarding accurate ones, aligning with customer support priorities.

2. Adapt question types: Modify CRAG's eight question types (Simple, Simple w. Condition, Set, Comparison, Aggregation, Multi-hop, Post-processing heavy, False Premise) to reflect common customer support scenarios. Ensure coverage across different complexity levels.

3. Customize domains: Replace CRAG's domains (Finance, Sports, Music, Movie, Open) with relevant customer support categories. Maintain a mix of dynamism levels (Real-time, Fast-changing, Slow-changing, Static) to simulate varied support queries.

4. Implement mock APIs: Develop mock A

{'final_summary': 'To incorporate CRAG benchmarks for your customer support RAG system:\n\n1. Implement Score_a metric: Accuracy - Hallucination. Auto-eval: LLM evaluators (ChatGPT F1: 94.7%, Llama 3 F1: 98.9%). Human-eval: perfect (1), acceptable (0.5), missing (0), incorrect (-1).\n\n2. Adapt 8 question types to support scenarios. Implement torso-to-tail entity sampling.\n\n3. Replace domains with support categories. Maintain dynamism mix (Real-time, Fast-changing, Slow-changing, Static). Implement real-time API simulation.\n\n4. Develop KG (2.6M+ entities) and web search simulations (Brave Search API, 50 HTML pages/query). Signal-to-noise ratio <1/30.\n\n5. Domain-specific entity extraction using LLM in-context learning.\n\n6. Apply traffic-weighted evaluation based on user interaction data.\n\n7. Track Accuracy, Hallucination, Missing rates across question types, domains, dynamism. Prioritize complex queries and dynamic information.\n\n8. Measure latency on A100 GPUs, emphasizing R

## Create our Evaluation Dataset

In this section, we prepare a dataset for evaluating our Chain of Density (CoD) summarization pipeline on ArXiv papers. This dataset will allow us to assess the performance of our model across different papers and instructions.

### Key Components:

1. **Sample ArXiv Papers**: We create `ArxivPaper` objects for three different papers:
   - `arxiv_paper1`: "Does Fine-Tuning LLMs on New Knowledge Encourage Hallucinations?"
   - `arxiv_paper2`: "Many-Shot In-Context Learning"
   - `arxiv_paper3`: "LLMs instead of Human Judges? A Large Scale Empirical Study across 20 NLP Evaluation Tasks"

   Each `ArxivPaper` object contains metadata such as title, authors, summary, and PDF URL.

2. **Evaluation Instructions**: We define a list of instructions that will guide the summarization process:
   ```python
   eval_instructions = [
       "Summarize the key methodologies and novel contributions of this research, focusing on their potential impact in the field.",
       "Analyze the experimental setup, results, and limitations of this study, highlighting any statistical significance and error margins.",
       "Compare this paper's approach to existing methods in the field, explaining how it addresses current challenges or limitations."
   ]
   ```

3. **Creating Evaluation Data**: We use `itertools.product()` to create combinations of papers and instructions:
   ```python
   eval_data = list(product(eval_papers, eval_instructions))
   ```

4. **Weave Dataset**: Finally, we create a Weave Dataset object that combines the paper, instruction, and original summary for each evaluation item:
   ```python
   dataset = weave.Dataset(name="we-paper-reading-eval-data",
                           rows=[{"paper": arxiv_paper,
                                  "instruction": instruction,
                                  "summary": arxiv_paper.summary}
                                 for arxiv_paper, instruction in eval_data])
   ```

5. **Publishing the Dataset**: We publish the dataset to make it available for evaluation:
   ```python
   weave.publish(dataset)
   ```

This evaluation dataset provides a structured way to assess our CoD summarization pipeline across different papers and instructions, allowing for comprehensive testing of the model's performance and adaptability.

In [37]:
# Evaluation paper 1 (arXiv:2405.05904). The record is written out by hand, so
# building the evaluation set needs no arXiv API call; `summary` holds the abstract
# text that later becomes the dataset's reference summary.
arxiv_paper1 = ArxivPaper(
    entry_id="http://arxiv.org/abs/2405.05904",
    updated=datetime(2024, 5, 13, 7, 29, 58, tzinfo=timezone.utc),
    published=datetime(2024, 5, 9, 17, 0, 22, tzinfo=timezone.utc),
    title="Does Fine-Tuning LLMs on New Knowledge Encourage Hallucinations?",
    authors=[
        Author(full_name="Zorik Gekhman"),
        Author(full_name="Gal Yona"),
        Author(full_name="Roee Aharoni"),
        Author(full_name="Matan Eyal"),
        Author(full_name="Amir Feder"),
        Author(full_name="Roi Reichart"),
        Author(full_name="Jonathan Herzig")
    ],
    summary=("When large language models are aligned via supervised fine-tuning, they may encounter new factual information "
             "that was not acquired through pre-training. It is often conjectured that this can teach the model the behavior "
             "of hallucinating factually incorrect responses, as the model is trained to generate facts that are not grounded "
             "in its pre-existing knowledge. In this work, we study the impact of such exposure to new knowledge on the capability "
             "of the fine-tuned model to utilize its pre-existing knowledge. To this end, we design a controlled setup, focused on "
             "closed-book QA, where we vary the proportion of the fine-tuning examples that introduce new knowledge. We demonstrate "
             "that large language models struggle to acquire new factual knowledge through fine-tuning, as fine-tuning examples that "
             "introduce new knowledge are learned significantly slower than those consistent with the model's knowledge. However, we "
             "also find that as the examples with new knowledge are eventually learned, they linearly increase the model's tendency "
             "to hallucinate. Taken together, our results highlight the risk in introducing new factual knowledge through fine-tuning, "
             "and support the view that large language models mostly acquire factual knowledge through pre-training, whereas fine-tuning "
             "teaches them to use it more efficiently."),
    comment=None,
    journal_ref=None,
    doi="10.48550/arXiv.2405.05904",
    primary_category="cs.CL",
    categories=["cs.CL"],
    links=[
        Link(href="https://arxiv.org/abs/2405.05904", title="Abstract", rel="alternate"),
        Link(href="https://arxiv.org/pdf/2405.05904", title="pdf", rel="related")
    ],
    pdf_url="https://arxiv.org/pdf/2405.05904"
)

In [38]:
# Evaluation paper 2 (arXiv:2404.11018). Hand-written record; `summary` holds the
# abstract text that later becomes the dataset's reference summary.
arxiv_paper2 = ArxivPaper(
    entry_id="http://arxiv.org/abs/2404.11018",
    updated=datetime(2024, 5, 22, 17, 6, 10, tzinfo=timezone.utc),
    published=datetime(2024, 4, 17, 2, 49, 26, tzinfo=timezone.utc),
    title="Many-Shot In-Context Learning",
    authors=[
        Author(full_name="Rishabh Agarwal"),
        Author(full_name="Avi Singh"),
        Author(full_name="Lei M. Zhang"),
        Author(full_name="Bernd Bohnet"),
        Author(full_name="Luis Rosias"),
        Author(full_name="Stephanie Chan"),
        Author(full_name="Biao Zhang"),
        Author(full_name="Ankesh Anand"),
        Author(full_name="Zaheer Abbas"),
        Author(full_name="Azade Nova"),
        Author(full_name="John D. Co-Reyes"),
        Author(full_name="Eric Chu"),
        Author(full_name="Feryal Behbahani"),
        Author(full_name="Aleksandra Faust"),
        Author(full_name="Hugo Larochelle")
    ],
    summary=("Large language models (LLMs) excel at few-shot in-context learning (ICL) -- learning from a few examples provided in context at inference, "
             "without any weight updates. Newly expanded context windows allow us to investigate ICL with hundreds or thousands of examples -- the many-shot regime. "
             "Going from few-shot to many-shot, we observe significant performance gains across a wide variety of generative and discriminative tasks. While promising, "
             "many-shot ICL can be bottlenecked by the available amount of human-generated examples. To mitigate this limitation, we explore two new settings: Reinforced "
             "and Unsupervised ICL. Reinforced ICL uses model-generated chain-of-thought rationales in place of human examples. Unsupervised ICL removes rationales from the "
             "prompt altogether, and prompts the model only with domain-specific questions. We find that both Reinforced and Unsupervised ICL can be quite effective in the "
             "many-shot regime, particularly on complex reasoning tasks. Finally, we demonstrate that, unlike few-shot learning, many-shot learning is effective at overriding "
             "pretraining biases, can learn high-dimensional functions with numerical inputs, and performs comparably to fine-tuning. Our analysis also reveals the limitations "
             "of next-token prediction loss as an indicator of downstream ICL performance."),
    comment=None,
    journal_ref=None,
    doi="10.48550/arXiv.2404.11018",
    primary_category="cs.LG",
    categories=["cs.LG", "cs.AI", "cs.CL"],
    links=[
        Link(href="https://arxiv.org/abs/2404.11018", title="Abstract", rel="alternate"),
        Link(href="https://arxiv.org/pdf/2404.11018", title="pdf", rel="related")
    ],
    pdf_url="https://arxiv.org/pdf/2404.11018"
)

In [39]:
# Evaluation paper 3 (arXiv:2406.18403). Hand-written record; `summary` holds the
# abstract text that later becomes the dataset's reference summary.
arxiv_paper3 = ArxivPaper(
    entry_id="http://arxiv.org/abs/2406.18403",
    updated=datetime(2024, 6, 26, 14, 56, 13, tzinfo=timezone.utc),
    published=datetime(2024, 6, 26, 14, 56, 13, tzinfo=timezone.utc),
    title="LLMs instead of Human Judges? A Large Scale Empirical Study across 20 NLP Evaluation Tasks",
    authors=[
        Author(full_name="Anna Bavaresco"),
        Author(full_name="Raffaella Bernardi"),
        Author(full_name="Leonardo Bertolazzi"),
        Author(full_name="Desmond Elliott"),
        Author(full_name="Raquel Fernández"),
        Author(full_name="Albert Gatt"),
        Author(full_name="Esam Ghaleb"),
        Author(full_name="Mario Giulianelli"),
        Author(full_name="Michael Hanna"),
        Author(full_name="Alexander Koller"),
        Author(full_name="André F. T. Martins"),
        Author(full_name="Philipp Mondorf"),
        Author(full_name="Vera Neplenbroek"),
        Author(full_name="Sandro Pezzelle"),
        Author(full_name="Barbara Plank"),
        Author(full_name="David Schlangen"),
        Author(full_name="Alessandro Suglia"),
        Author(full_name="Aditya K Surikuchi"),
        Author(full_name="Ece Takmaz"),
        Author(full_name="Alberto Testoni")
    ],
    summary=("There is an increasing trend towards evaluating NLP models with LLM-generated judgments instead of human judgments. "
             "In the absence of a comparison against human data, this raises concerns about the validity of these evaluations; in case they are conducted with proprietary models, "
             "this also raises concerns over reproducibility. We provide JUDGE-BENCH, a collection of 20 NLP datasets with human annotations, and comprehensively evaluate 11 current LLMs, "
             "covering both open-weight and proprietary models, for their ability to replicate the annotations. Our evaluations show that each LLM exhibits a large variance across datasets in its correlation to human judgments. "
             "We conclude that LLMs are not yet ready to systematically replace human judges in NLP."),
    comment=None,
    journal_ref=None,
    doi="10.48550/arXiv.2406.18403",
    primary_category="cs.CL",
    categories=["cs.CL"],
    links=[
        Link(href="https://arxiv.org/abs/2406.18403", title="Abstract", rel="alternate"),
        Link(href="https://arxiv.org/pdf/2406.18403", title="pdf", rel="related")
    ],
    pdf_url="https://arxiv.org/pdf/2406.18403"
)

In [40]:
# Sanity check: display the PDF URL stored on the third paper record.
arxiv_paper3.pdf_url

'https://arxiv.org/pdf/2406.18403'

In [41]:
# Papers that make up the evaluation set; each is later paired with every instruction.
eval_papers = [
    arxiv_paper1,
    arxiv_paper2,
    arxiv_paper3
]

In [42]:
# Summarization instructions used for evaluation; each is applied to every paper.
eval_instructions = [
    "Summarize the key methodologies and novel contributions of this research, focusing on their potential impact in the field.",
    "Analyze the experimental setup, results, and limitations of this study, highlighting any statistical significance and error margins.",
    "Compare this paper's approach to existing methods in the field, explaining how it addresses current challenges or limitations."
]

In [43]:
# Cartesian product: every paper paired with every instruction, as (paper, instruction) tuples.
eval_data = list(product(eval_papers, eval_instructions))

In [44]:
# One row per (paper, instruction) pair; the paper's own abstract is stored
# alongside it under "summary".
dataset = weave.Dataset(
    name="we-paper-reading-eval-data",
    rows=[
        {
            "paper": arxiv_paper,
            "instruction": instruction,
            "summary": arxiv_paper.summary,
        }
        for arxiv_paper, instruction in eval_data
    ],
)



In [45]:
# Publish the dataset to Weave; the notebook displays the returned object reference.
weave.publish(dataset)

📦 Published to https://wandb.ai/a-sh0ts/arxiv-chain-of-density-summarization/weave/objects/we-paper-reading-eval-data/versions/bedJmUK7VNzRSclXYU4K0LXRkZeq0gBX9OyHXuvKNNk


ObjectRef(entity='a-sh0ts', project='arxiv-chain-of-density-summarization', name='we-paper-reading-eval-data', digest='bedJmUK7VNzRSclXYU4K0LXRkZeq0gBX9OyHXuvKNNk', extra=())

## Define our metrics

In this section, we establish a set of metrics to evaluate the quality and effectiveness of our Chain of Density (CoD) summarization pipeline for ArXiv PDFs. These metrics are designed to provide a comprehensive assessment of the summarization process, focusing on relevance, technical quality, and conciseness.

### Key Metrics:

1. **Summary Scoring (`score_summary`)**:
   - Evaluates individual summaries based on three criteria:
     - Relevance (0-5): How well the summary addresses the given instruction
     - Technical Quality (0-5): Accuracy and depth of technical content
     - Conciseness (0-5): Information density and brevity
   - Uses an OpenAI chat model (`gpt-4o` by default in `quality_scorer`) to perform the evaluation, ensuring a nuanced assessment

2. **Long-tail Statistics (`calculate_long_tail_stats`)**:
   - Analyzes the distribution of scores across multiple summaries
   - Calculates mean scores and tail ratios for each aspect (relevance, technical quality, conciseness)
   - Helps identify overall performance and potential outliers

3. **Iteration Impact Analysis (`analyze_iteration_impact`)**:
   - Assesses the improvement of summaries across iterations
   - Identifies the point of diminishing returns and cumulative improvement
   - Useful for optimizing the number of iterations in the CoD process

4. **Optimal Improvement Range (`find_optimal_improvement_range`)**:
   - Determines the most effective range of iterations for improvement
   - Considers moving averages of improvements to find sustained progress

5. **Optimal Score Range (`find_optimal_score_range`)**:
   - Identifies the iteration range that produces the highest quality summaries
   - Helps in fine-tuning the CoD process for maximum effectiveness

6. **Iteration Summary Processing (`process_iteration_summaries`)**:
   - Aggregates and analyzes scores across all iterations
   - Provides a holistic view of the summarization process's progression

7. **Quality Scorer (`quality_scorer`)**:
   - Combines all the above metrics into a comprehensive evaluation
   - Analyzes iteration summaries, accumulated summary, and final summary
   - Produces a flattened, easy-to-analyze score dictionary

In [46]:
@weave.op()
def score_summary(summary, summary_type, instruction, model):
    """Ask an OpenAI chat model to grade a summary against an instruction.

    The prompt embeds a rubric with three criteria (relevance, technical quality,
    conciseness), each on a 0-5 scale, and asks for a JSON-only answer.

    Args:
        summary: The summary text to grade.
        summary_type: Label used for the summary inside the prompt
            (e.g. "Final Summary").
        instruction: The instruction the summary was supposed to address.
        model: Name of the OpenAI chat model that performs the grading.

    Returns:
        The model's reply parsed from JSON. The prompt requests the shape
        ``{"relevance": {"score": float}, "technical_quality": {...},
        "conciseness": {...}}``, but the reply is not validated here.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    # NOTE: the requested score range must agree with the 0-5 rubric and the 0-5
    # worked examples below; a mismatching range makes the grader's scale ambiguous.
    prompt = f"""Evaluate the quality of the following {summary_type} based on how well it addresses the given instruction. Use the scoring rules below to calculate three numerical scores between 0 and 5.

Instruction: {instruction}

{summary_type}:
{summary}
Scoring Rules:
1. Relevance (0-5):
   - 5: Perfectly addresses all aspects of the instruction, focusing on key methodologies and novel contributions
     Example: "The paper introduces JUDGE-BENCH, a comprehensive evaluation framework comprising 20 NLP datasets with human annotations, designed to assess LLMs' capacity to replicate human judgments across diverse NLP tasks. The study employs a rigorous comparative analysis of 11 state-of-the-art LLMs, including both open-weight and proprietary models, utilizing correlation metrics to quantify alignment with human annotations."
   - 4: Addresses most aspects of the instruction with minor omissions
     Example: "The research presents JUDGE-BENCH, a novel evaluation framework for LLMs consisting of 20 NLP datasets. It conducts a thorough assessment of 11 LLMs, analyzing their ability to replicate human judgments. The methodology involves correlation analysis between LLM outputs and human annotations."
   - 3: Addresses the main points of the instruction but misses some details about methodologies or contributions
     Example: "The study proposes JUDGE-BENCH, a new benchmark for evaluating LLMs against human judgments in NLP tasks. It assesses multiple LLMs and finds significant variability in their performance across different datasets."
   - 2: Partially addresses the instruction, missing significant aspects of methodologies or contributions
     Example: "The paper discusses a new method for evaluating language models using human-annotated datasets. It compares several LLMs and concludes that they are not yet ready to replace human judges in NLP tasks."
   - 1: Barely addresses the instruction, focusing on tangential information
     Example: "The research explores various natural language processing tasks and the performance of language models. It suggests that human evaluation is still important in NLP."
   - 0: Completely irrelevant to the instruction
     Example: "The paper discusses advancements in computer vision algorithms for image recognition using convolutional neural networks."

2. Technical Quality (0-5):
   - 5: Exceptionally accurate, detailed, and technically sound, with precise descriptions of methodologies and contributions
     Example: "JUDGE-BENCH employs a multi-faceted evaluation protocol, utilizing Pearson correlation coefficients (r) to quantify LLM-human judgment alignment across 20 diverse NLP tasks. The framework incorporates both discriminative and generative tasks, with a particular focus on nuanced linguistic phenomena such as pragmatic inference and discourse coherence. The study reports a mean correlation of r = 0.47 (σ = 0.18) across all models and tasks, with significant inter-task variability (range: 0.12 ≤ r ≤ 0.83). Notably, the best-performing LLM (GPT-4) achieved a maximum mean correlation of r = 0.62, still substantially below perfect alignment (r = 1.0), underscoring the persistent gap between LLM and human judgment capabilities."
   - 4: Highly accurate with comprehensive technical details about research methods and findings
     Example: "The JUDGE-BENCH framework evaluates 11 LLMs across 20 NLP datasets using Pearson correlation to measure alignment with human judgments. The study reports a mean correlation of 0.47 across all models and tasks, with significant variability (σ = 0.18). The best-performing model (GPT-4) achieved a maximum mean correlation of 0.62, indicating a substantial gap between LLM and human judgment capabilities."
   - 3: Generally accurate with good technical depth, but may lack some specifics
     Example: "JUDGE-BENCH evaluates LLMs using correlation analysis with human judgments across multiple NLP tasks. The study finds variable performance across models and tasks, with the best model achieving a mean correlation of 0.62. This suggests LLMs are not yet capable of consistently replicating human judgments in NLP tasks."
   - 2: Mostly accurate but lacks important technical details about methodologies or contributions
     Example: "The study uses a new benchmark called JUDGE-BENCH to evaluate language models. It compares LLM outputs to human judgments using correlation analysis and finds that even the best models don't consistently match human performance across different NLP tasks."
   - 1: Contains technical inaccuracies or lacks significant depth in describing research approaches
     Example: "The paper discusses a method for evaluating AI language models using human-annotated datasets. It shows that AI models don't always agree with human judgments, suggesting they need improvement."
   - 0: Technically unsound or extremely superficial in describing methodologies and contributions
     Example: "The research uses AI to compare computer-generated text to human writing. It finds that AI is not as good as humans at understanding language."

3. Conciseness (0-5):
   - 5: Maximally information-dense without any unnecessary content, perfectly balancing detail and brevity
     Example: "JUDGE-BENCH: 20-dataset NLP evaluation framework. 11 LLMs assessed. Mean correlation with human judgments: r = 0.47 (σ = 0.18). Best model (GPT-4): r = 0.62. Significant inter-task variability: 0.12 ≤ r ≤ 0.83. Conclusion: LLMs not ready to replace human judges in NLP."
   - 4: Highly concise with minimal extraneous information, efficiently describing methodologies and contributions
     Example: "JUDGE-BENCH: 20 NLP datasets for LLM evaluation. 11 models tested. Mean human-LLM judgment correlation: 0.47. Best model: 0.62. High variability across tasks. LLMs currently inadequate for replacing human NLP judges."
   - 3: Generally concise but could be slightly more compact in describing research approaches
     Example: "JUDGE-BENCH evaluates 11 LLMs on 20 NLP datasets. Uses correlation with human judgments. Finds variable performance across tasks. Best model achieves 0.62 correlation. Concludes LLMs can't reliably replace human judges in NLP yet."
   - 2: Contains some unnecessary information or repetition, diluting the focus on key methodologies and contributions
     Example: "The paper introduces JUDGE-BENCH, a new way to evaluate language models. It looks at how well 11 different AI models can match human judgments on 20 NLP tasks. The researchers found that even the best AI model wasn't consistently as good as humans at judging language tasks. They conclude that AI models aren't ready to replace human judges in NLP research yet."
   - 1: Verbose with significant redundancy, obscuring the main research points
     Example: "In this study, the researchers created something called JUDGE-BENCH. It's a way to test how good AI language models are at understanding and judging language like humans do. They tested 11 different AI models on 20 different types of language tasks. They found out that the AI models weren't as consistent as humans in judging these tasks. Even the best AI model wasn't always as good as humans. So, they say that right now, we can't use AI to replace humans when we need to judge language in research."
   - 0: Extremely verbose or filled with irrelevant information unrelated to methodologies and contributions
     Example: "The researchers in this study were interested in natural language processing, which is a field of artificial intelligence that deals with how computers understand and generate human language. They created a new tool called JUDGE-BENCH to test AI models. They used many different language tasks and compared how the AI did compared to humans. It's important to test AI models because we want to know if they can understand language as well as humans can. This kind of research helps us improve AI technology."

     Examples:

1. High-quality summary (Instruction: "Summarize the key methodologies and novel contributions of this research, focusing on their potential impact in the field."):
{{
    "relevance": {{
        "score": 4.75
    }},
    "technical_quality": {{
        "score": 4.5
    }},
    "conciseness": {{
        "score": 4.25
    }}
}}

2. Average-quality summary (Instruction: "Analyze the experimental setup, results, and limitations of this study."):
{{
    "relevance": {{
        "score": 3.0
    }},
    "technical_quality": {{
        "score": 2.75
    }},
    "conciseness": {{
        "score": 3.5
    }}
}}

3. Low-quality summary (Instruction: "Explain how this paper's approach compares to existing methods in the field."):
{{
    "relevance": {{
        "score": 1.5
    }},
    "technical_quality": {{
        "score": 1.25
    }},
    "conciseness": {{
        "score": 2.0
    }}
}}

Provide your evaluation in the following JSON format:
{{
    "relevance": {{
        "score": <float>
    }},
    "technical_quality": {{
        "score": <float>
    }},
    "conciseness": {{
        "score": <float>
    }}
}}

Ensure your response is ONLY valid JSON. Do not include any other text outside the JSON object.
Ensure you have the keys: relevance, technical_quality, conciseness, each containing only a score.
Ensure each score is a float between 0 and 5, using the scoring rules provided above.
"""

    response = openai_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        # JSON mode: the reply is constrained to a single JSON object.
        response_format={"type": "json_object"}
    )
    return json.loads(response.choices[0].message.content)

In [47]:
@weave.op()
def calculate_long_tail_stats(scores):
    """Compute the mean and the top-tail ratio of the scores for each aspect.

    Args:
        scores: Either a list of score dicts shaped like
            ``{"relevance": {"score": x}, "technical_quality": {...},
            "conciseness": {...}}`` or a list of lists of such dicts. The layout
            is decided from the first element.

    Returns:
        ``None`` when `scores` is empty or its first element is neither a list
        nor a dict. Otherwise a dict keyed by aspect. Each value is either
        ``{"mean": ..., "tail_ratio": ...}`` or ``None`` when the stats for that
        aspect could not be computed (malformed entries, or no scores at all).
        ``tail_ratio`` is the mean of the top 5% of scores (at least one score)
        divided by the overall mean; it is ``None`` when the overall mean is 0,
        because the ratio is undefined there.
    """
    if not scores:
        return None

    # The layout is a property of the whole input, so decide it once instead of
    # re-checking it for every aspect.
    first = scores[0]
    if isinstance(first, list):
        nested = True
    elif isinstance(first, dict):
        nested = False
    else:
        print(f"Unexpected format for scores: {scores}")
        return None

    stats = {}
    for aspect in ('relevance', 'technical_quality', 'conciseness'):
        # Deliberately broad: the score dicts come from parsed LLM output, and one
        # malformed entry should only invalidate this aspect, not the whole run.
        try:
            if nested:
                values = [entry[aspect]['score'] for group in scores for entry in group]
            else:
                values = [entry[aspect]['score'] for entry in scores]

            if not values:
                # Nothing to aggregate (e.g. only empty sub-lists); avoid NaN stats.
                stats[aspect] = None
                continue

            mean_score = np.mean(values)
            tail_size = max(1, int(len(values) * 0.05))
            tail_mean = np.mean(sorted(values)[-tail_size:])

            stats[aspect] = {
                "mean": mean_score,
                # Guard the division: a zero mean would otherwise yield NaN/inf
                # together with a NumPy RuntimeWarning.
                "tail_ratio": tail_mean / mean_score if mean_score != 0 else None,
            }
        except Exception as e:
            print(f"Error calculating stats for {aspect}: {str(e)}")
            stats[aspect] = None
    return stats

In [48]:
@weave.op()
def analyze_iteration_impact(scores):
    """Summarize how each aspect's score changes from one iteration to the next.

    Args:
        scores: Score dicts in iteration order, each shaped like
            ``{"relevance": {"score": x}, "technical_quality": {...},
            "conciseness": {...}}``.

    Returns:
        A dict keyed by aspect. Every value carries the same four keys, whether
        or not there were enough scores to compare:
            mean_improvement: Average step-to-step change.
            diminishing_returns_point: Index of the first step whose change is
                <= 0, or the number of steps when every step improved.
            cumulative_improvement: Sum of all step-to-step changes.
            improvement_variability: Standard deviation of the changes divided by
                their mean (0 when the mean is 0).
        With fewer than two scores all four values are 0.
    """
    aspects = ('relevance', 'technical_quality', 'conciseness')

    if len(scores) < 2:
        return {
            aspect: {
                "mean_improvement": 0,
                "diminishing_returns_point": 0,
                "cumulative_improvement": 0,
                "improvement_variability": 0,
            }
            for aspect in aspects
        }

    results = {}
    for aspect in aspects:
        aspect_scores = [s[aspect]['score'] for s in scores]
        improvements = [later - earlier for earlier, later in zip(aspect_scores, aspect_scores[1:])]
        mean_improvement = np.mean(improvements)

        # Emit the same keys as the short-input fallback above so downstream
        # consumers always see one schema.
        results[aspect] = {
            "mean_improvement": mean_improvement,
            "diminishing_returns_point": next(
                (i for i, imp in enumerate(improvements) if imp <= 0),
                len(improvements),
            ),
            "cumulative_improvement": sum(improvements),
            "improvement_variability": (
                np.std(improvements) / mean_improvement if mean_improvement != 0 else 0
            ),
        }

    return results

In [49]:
@weave.op()
def find_optimal_improvement_range(scores):
    """Locate, per aspect, the iteration span where smoothed improvement stays meaningful.

    Step-to-step score changes are smoothed with a moving average (window of up
    to three steps). The span runs from the first to just past the last window
    whose average is at least 10% of the mean change.

    Args:
        scores: Score dicts in iteration order, each shaped like
            ``{"relevance": {"score": x}, "technical_quality": {...},
            "conciseness": {...}}``.

    Returns:
        A dict keyed by aspect with ``optimal_range_start``, ``optimal_range_end``,
        ``score_at_start``, ``score_at_end`` and ``improvement_in_range``. All
        values are 0 when fewer than three scores are given.
    """
    aspect_names = ['relevance', 'technical_quality', 'conciseness']

    if len(scores) < 3:
        empty_result = {}
        for name in aspect_names:
            empty_result[name] = {
                "optimal_range_start": 0,
                "optimal_range_end": 0,
                "score_at_start": 0,
                "score_at_end": 0,
                "improvement_in_range": 0,
            }
        return empty_result

    results = {}
    for name in aspect_names:
        series = [entry[name]['score'] for entry in scores]
        deltas = [nxt - cur for cur, nxt in zip(series, series[1:])]

        # Smooth the deltas so a single noisy step does not define the range.
        window = min(3, len(series) - 1)
        smoothed = np.convolve(deltas, np.ones(window), 'valid') / window

        cutoff = 0.1 * np.mean(deltas)
        qualifying = [idx for idx, avg in enumerate(smoothed) if avg >= cutoff]

        if qualifying:
            range_start = qualifying[0]
            range_end = qualifying[-1] + 1
        else:
            range_start = range_end = 0

        if range_end < len(series):
            end_score = series[range_end]
        else:
            end_score = series[-1]

        results[name] = {
            "optimal_range_start": range_start,
            "optimal_range_end": range_end,
            "score_at_start": series[range_start],
            "score_at_end": end_score,
            "improvement_in_range": sum(deltas[range_start:range_end]),
        }

    return results

In [50]:
@weave.op()
def find_optimal_score_range(scores):
    """Find, per aspect, the iteration span that leads up to the highest score.

    The span ends at the first occurrence of the highest score. It starts at the
    earliest iteration before that peak from which the summed improvement up to
    the peak is largest.

    Args:
        scores: Score dicts in iteration order, each shaped like
            ``{"relevance": {"score": x}, "technical_quality": {...},
            "conciseness": {...}}``.

    Returns:
        A dict keyed by aspect. Every value carries the same keys, whether or not
        there were enough scores: ``optimal_range_start``, ``optimal_range_end``,
        ``score_at_start``, ``score_at_end``, ``highest_score`` (equal to
        ``score_at_end``) and ``improvement_in_range``. With fewer than two
        scores all values are 0.
    """
    aspects = ('relevance', 'technical_quality', 'conciseness')

    if len(scores) < 2:
        return {
            aspect: {
                "optimal_range_start": 0,
                "optimal_range_end": 0,
                "score_at_start": 0,
                "score_at_end": 0,
                "highest_score": 0,
                "improvement_in_range": 0,
            }
            for aspect in aspects
        }

    results = {}
    for aspect in aspects:
        aspect_scores = [s[aspect]['score'] for s in scores]
        improvements = [later - earlier for earlier, later in zip(aspect_scores, aspect_scores[1:])]

        highest_score = max(aspect_scores)
        highest_score_index = aspect_scores.index(highest_score)

        best_start = 0
        best_improvement = sum(improvements[:highest_score_index])

        # Start 0 is the baseline above; a later start only wins when it is
        # strictly better, so ties keep the earliest start.
        for start in range(1, highest_score_index):
            current_improvement = sum(improvements[start:highest_score_index])
            if current_improvement > best_improvement:
                best_start = start
                best_improvement = current_improvement

        # Same keys as the short-input fallback so the schema never varies.
        results[aspect] = {
            "optimal_range_start": best_start,
            "optimal_range_end": highest_score_index,
            "score_at_start": aspect_scores[best_start],
            "score_at_end": highest_score,
            "highest_score": highest_score,
            "improvement_in_range": best_improvement,
        }

    return results

In [51]:
@weave.op()
def process_iteration_summaries(model_output, instruction, model):
    """Score every iteration summary and aggregate the scores.

    Args:
        model_output: Pipeline output; its ``"iteration_summaries"`` entry is read.
        instruction: Instruction the summaries are graded against.
        model: Grading model name forwarded to `score_summary`.

    Returns:
        A dict with a single key, ``long_tail_stats``.
    """
    iteration_scores = []
    for position, summary in enumerate(model_output["iteration_summaries"], start=1):
        label = f"Iteration Summary {position}"
        iteration_scores.append(score_summary(summary, label, instruction, model))

    # Currently disabled analyses: iteration_impact (analyze_iteration_impact),
    # optimal_improvement_range (find_optimal_improvement_range) and
    # optimal_score_range (find_optimal_score_range).
    analysis = {}
    analysis["long_tail_stats"] = calculate_long_tail_stats(iteration_scores)
    return analysis

In [52]:
@weave.op()
def quality_scorer(instruction, model_output, model="gpt-4o"):
    """Score a pipeline output and return the scores as one flattened dict.

    Scores the iteration summaries (via ``process_iteration_summaries``), the
    accumulated summary and the final summary (via ``score_summary``), then
    passes each section and the combined result through ``flatten_dict``
    (defined elsewhere in this file).

    Any exception is caught: the message is printed and stored under
    ``"error"`` in whatever un-flattened scores were gathered so far.
    """
    scores = {
        "iteration_summaries_analysis": {},
        "accumulated_summary": {},
        "final_summary": {}
    }

    try:
        scores["iteration_summaries_analysis"] = process_iteration_summaries(model_output, instruction, model)

        # The accumulated and final summaries are scored the same way; only
        # the output key and the human-readable label differ.
        for section, label in (
            ("accumulated_summary", "Accumulated Summary"),
            ("final_summary", "Final Summary"),
        ):
            scores[section] = score_summary(model_output[section], label, instruction, model)

        # Flatten each section first, then the combined mapping.
        per_section = {
            section: flatten_dict(payload) if isinstance(payload, dict) else payload
            for section, payload in scores.items()
        }
        scores = flatten_dict(per_section)

    except Exception as e:
        message = str(e)
        print(f"Error in quality_scorer: {message}")
        scores["error"] = message

    return scores

## Run Evaluation!

In this section, we demonstrate how to run the evaluation of our Chain of Density (CoD) summarization pipeline on ArXiv papers. This process involves using multiple models and assessing their performance using our custom evaluation metrics.

1. First, we define a list of models to evaluate (the `models` list in the next cell). These models represent different versions of Claude, allowing us to compare their performance on our summarization task.

2. Next, we set up and run the evaluation (the cell after that).

Here's what's happening in the evaluation code:

- We create a `weave.Evaluation` object, using our previously defined dataset and the `quality_scorer` function.
- We iterate through each model in our list.
- For each model, we create an `ArxivChainOfDensityPipeline` instance, specifying the model and setting `density_iterations` to 8.
- We then run the evaluation asynchronously using `await evaluation.evaluate()`.

In [53]:
# Anthropic Claude model ids compared in the evaluation run.
models = [
    "claude-3-opus-20240229",
    "claude-3-haiku-20240307",
    "claude-3-5-sonnet-20240620",
]

In [54]:
# One Evaluation over the shared dataset; quality_scorer is the only scorer.
evaluation = weave.Evaluation(dataset=dataset, scorers=[quality_scorer])
for model in models:
    # Fresh pipeline per model, using 8 density iterations.
    arxiv_chain_of_density_pipeline = ArxivChainOfDensityPipeline(model=model, density_iterations=8)
    # Top-level await: valid in a notebook cell, not in a plain Python script.
    await evaluation.evaluate(arxiv_chain_of_density_pipeline)

Iteration 1:
Here are 2 key technical ideas from the original text that are relevant to comparing this paper's approach to existing methods, are not yet in the summary, add significant information, and are concise:

1. SliCK: Sampling-based Categorization of Knowledge
2. PCorrect measure based on samples

New summary:
This work proposes SliCK, a Sampling-based Categorization of Knowledge approach for quantifying the knowledge in large language models (LLMs). SliCK defines a continuous PCorrect measure based on samples from the LLM to estimate the likelihood it generates the correct answer to a question. PCorrect is used to categorize question-answer pairs into four knowledge categories: HighlyKnown, MaybeKnown, WeaklyKnown, and Unknown. 

Compared to existing methods for quantifying LLM knowledge, SliCK provides a more fine-grained taxonomy. A case study comparison to the P(True) approach by Kadavath et al. (2022) suggests SliCK more accurately identifies Unknown examples on which the 

Iteration 3:
Here are the new key technical entities/ideas I identified to incorporate into the next summary iteration:

1. Linguistic properties evaluated: acceptability, coherence, consistency, fluency, relevance, verbosity
2. Human inter-rater agreement metric: Krippendorff's α 
3. Findings on LLM-human correlation based on human expertise: higher for non-expert than expert judgments

New highly technical summary focused on methodologies and contributions:

JUDGE-BENCH, a 20-dataset English benchmark, evaluates 11 open and closed LLMs' ability to replicate human judgments across NLP tasks. Datasets are categorized by evaluated text source (human/machine-generated) and annotated properties (acceptability, coherence, consistency, fluency, relevance, verbosity). LLM-human alignment is quantified via Cohen's κ (categorical data) and Spearman's ρ (graded data), with Krippendorff's α measuring inherent task difficulty.

Methodologically, the study: 1) prompts LLMs with original annotation

Iteration 3:
Here are the key new entities/ideas I identified to include:
- JUDGE-BENCH methodology: 20 diverse datasets, 11 open-weight and proprietary LLMs, agreement metrics (Cohen's kappa, Spearman correlation, Krippendorff's alpha)
- Evaluation results: High LLM variance across datasets, need for calibration, decreasing open-closed model gap
- Living benchmark: Enables future extensions as new LLMs are released

New summary:
Previous work evaluating LLMs as NLP judges has limitations, often relying on few datasets (largely closed-source models), raising reproducibility concerns. JUDGE-BENCH addresses these challenges through a comprehensive methodology: evaluating 11 open-weight and proprietary LLMs on 20 diverse human-annotated NLP datasets spanning various tasks, judged properties, annotation types, annotator expertise, and data sources. Using agreement metrics (Cohen's kappa, Spearman correlation, Krippendorff's alpha), the evaluation reveals high LLM variance across datasets. 

Iteration 4:
Here are the key technical entities/ideas I identified to add to the summary:

1. Datasets: CNN/DailyMail, XSUM, Switchboard, WMT newstest2020
2. Evaluating vs. human experts: GPT-4o's Spearman ρ 0.22 vs. 0.63 on WMT EN-DE 
3. Inherent task difficulty: Quantified by Krippendorff's α, uncorrelated with model performance

New summary:

This study comprehensively evaluates 11 LLMs (6 with ≥98% valid responses) on JUDGE-BENCH's 20 datasets (e.g. CNN/DailyMail, XSUM, Switchboard, WMT newstest2020) spanning translation, dialogue, toxicity detection, reasoning, etc. It employs Cohen's κ for categorical annotations (avg 0.28±0.32; 0.54-0.56 on CoLA) and Spearman's ρ for graded annotations (avg 0.50±0.21).

Datasets include expert and non-expert categorical and graded judgments on human/machine-generated text. Inherent task difficulty, quantified by Krippendorff's α, is uncorrelated with LLM performance.

Proprietary GPT-4o performs best overall, but open Llama-3-70B and Mixtral-8x

Iteration 5:
Here are the key technical entities/ideas I identified to add to the summary:

1. Models: GPT-4o, Llama-3-70B, Mixtral-8x22B, Gemini-1.5, Comm-R+, Comm-R4
2. Valid response rates: Models with ≥98% used; invalid responses replaced with random values
3. Safety datasets: DICES 990 (crowdsourced), 350 (expert & crowdsourced); Medical-safety

New summary:

This study comprehensively evaluates 11 LLMs on JUDGE-BENCH's 20 datasets spanning translation, dialogue, toxicity detection, reasoning, etc. It employs Cohen's κ for categorical annotations (avg 0.28±0.32; 0.54-0.56 on CoLA) and Spearman's ρ for graded annotations (avg 0.50±0.21). 

Experimental setup: Datasets include expert and non-expert categorical and graded judgments on human/machine-generated text (e.g. CNN/DailyMail, XSUM, Switchboard, WMT newstest2020). Models with ≥98% valid response rates are analyzed (GPT-4o, Llama-3-70B, Mixtral-8x22B, Gemini-1.5, Comm-R+, Comm-R4); invalid responses are replaced with random val

Final Summary:
This study evaluates 11 LLMs on JUDGE-BENCH's 20 English datasets using Cohen's κ for categorical annotations (avg 0.28±0.32) and Spearman's ρ for graded annotations (avg 0.50±0.21). Results vary widely across datasets, with GPT-4o performing best overall. LLMs align better with non-experts than experts (GPT-4o: ρ=0.22 vs. 0.63, WMT EN-DE) and on human vs. machine-generated text. High variance and low alignment are observed, particularly on safety and toxicity datasets. Krippendorff's α quantifies inherent task difficulty. Limitations include English focus, lack of instruction-tuning, and not accounting for valid/invalid response rates. The study concludes LLMs are unreliable without per-dataset calibration. JUDGE-BENCH aims to enable reproducible LLM evaluation despite challenges with proprietary models and data leakage.



Iteration 1:
Key technical entities/ideas not yet present:
- SliCK (Sampling-based Categorization of Knowledge) - a taxonomy for classifying model knowledge
- P(Correct) - a continuous measure of model's knowledge of a given fact
- Filtering out Unknown fine-tuning examples

Summary:

SliCK, a four-category taxonomy for classifying model knowledge, is proposed to quantify LLM's familiarity with (q, a) pairs. P(Correct) estimates the likelihood of the model accurately generating the correct answer a to question q. The paper demonstrates that fine-tuning on examples introducing new factual knowledge (Unknown) is linearly correlated with increased model hallucinations, compared to fine-tuning on Known examples. LLMs struggle to acquire new knowledge through fine-tuning, instead learning to better utilize their pre-existing knowledge. Filtering out Unknown fine-tuning examples is shown to mitigate overfitting and performance degradation, without sacrificing overall performance. These findi

Iteration 1:
New important technical entities/ideas:
1. SliCK knowledge categorization
2. PCorrect measure
3. Unknown example fitting

Summary:

This research introduces SliCK (Sampling-based Categorization of Knowledge), a novel methodology for categorizing facts with respect to a language model's knowledge. SliCK utilizes a continuous PCorrect measure based on model-generated samples to classify facts into four categories: HighlyKnown, MaybeKnown, WeaklyKnown, and Unknown. This categorization enables a controlled study on the impact of new knowledge introduced during fine-tuning on large language models' (LLMs) tendency to hallucinate.

The study's key methodological contribution is the design of fine-tuning dataset variants with varying proportions of Unknown examples while controlling for other factors. This approach isolates the effect of new knowledge on model performance. The research demonstrates that higher ratios of Unknown examples in the fine-tuning data lead to performance

Iteration 6:
New important entities/ideas:
1. Multi-property evaluation framework
2. Cross-model performance variance
3. Data leakage mitigation strategy

Summary:

JUDGE-BENCH's multi-property evaluation framework addresses limitations in automated NLP evaluation through comprehensive assessment of 11 state-of-the-art LLMs across 20 diverse datasets, surpassing existing methods' reliance on limited datasets and models. This framework enables robust evaluation across multiple dimensions: tasks (e.g., translation, dialogue generation), judgment properties (e.g., coherence, fluency), annotation types (categorical, graded), and annotator expertise (expert, non-expert). The methodology employs annotation pipeline standardization, utilizing original human annotation instructions as LLM prompts and implementing greedy decoding optimization (temperature=0) for response generation.

Human-model alignment scores are computed via Cohen's κ (categorical) and Spearman's correlation (graded), with 

Iteration 6:
New entities/ideas:
1. Chrono-ablation analysis
2. Perplexity-based measures
3. Paired t-tests

Summary:

Experimental setup: Gemini 1.5 Pro (1M token context) for many-shot ICL across NLP tasks. Methodology: Random sampling with replacement for K-shot prompts, multiple seeds (3-5), greedy decoding, KV caching for inference optimization. Evaluation metrics: Task-specific (e.g., chrF2++ for MT, ROUGE-L for summarization).

Results and statistical significance:
1. Low-resource MT (English to Bemba/Kurdish): 997-shot ICL improved by 15.3%/4.5% over 1-shot, establishing SOTA. Performance gains: Bemba (28.3% to 47.7%), Kurdish (39.5% to 44.0%). Standard deviation: 0.1%-0.5% across 3 seeds.

2. Abstractive summarization (XSum): Many-shot ICL approached fine-tuned models, peaking at 50 examples (ROUGE-L: ~32%). XLSum: monotonic improvement. Chrono-ablation analysis revealed prompt length saturation beyond 50 shots.

3. GPQA: 125-shot ICL achieved 43.8% accuracy, comparable to Cla

Iteration 8:
New important entities/ideas:
1. Prompt-based evaluation paradigm
2. Cross-dataset performance variance
3. Safety guardrail impact analysis

Summary:

JUDGE-BENCH surpasses existing prompt-based evaluation paradigms by implementing a comprehensive multi-property assessment framework for 11 state-of-the-art LLMs across 20 diverse datasets, addressing the limited scope of prior approaches. This framework enables robust evaluation across multiple dimensions: tasks (e.g., translation, dialogue generation), judgment properties (e.g., coherence, fluency), annotation types (categorical, graded), and annotator expertise (expert, non-expert). The methodology employs annotation pipeline standardization, utilizing original human annotation instructions as LLM prompts and implementing greedy decoding optimization (temperature=0) for response generation, enhancing reproducibility and comparability across models and tasks.

Human-model alignment scores are computed via Cohen's κ (catego

Iteration 8:
New important technical entities/ideas:
1. Exemplar-based generalization
2. Rule-based generalization
3. Needle-in-a-haystack test

Summary:

This research introduces and evaluates many-shot in-context learning (ICL) methodologies for large language models (LLMs) with expanded context windows up to 1M tokens, leveraging cross-attention mechanisms to process extensive prompts. Key novel contributions and their potential impact include:

1. Many-shot ICL: Utilizes 100-8192 examples in prompts, enabled by context length scaling in LLMs. Performance gains up to 36.4% observed on tasks like Sequential Parity (20 digits). Impact: Reduces task-specific fine-tuning dependency, streamlining LLM deployment across domains. Employs context caching and KV caching to mitigate inference costs, enhancing computational feasibility. Demonstrates ability to implement computations analogous to gradient descent, suggesting potential for adapting to unseen tasks and domains misaligned with trai

Final Summary:
Experimental setup: Gemini 1.5 Pro (1M token context) for many-shot ICL across NLP tasks. Methodology: Random sampling with replacement for K-shot prompts, 3-5 seeds, greedy decoding, KV caching. Evaluation: Task-specific metrics, paired bootstrap resampling for significance, Bonferroni correction.

Key results:
1. Low-resource MT: 997-shot ICL improved Bemba (28.3% to 47.7%, p<0.001, d=2.8) and Kurdish (39.5% to 44.0%, p<0.01, d=1.2).
2. Summarization (XSum): 50-shot ICL peaked at ROUGE-L 32.1% ± 0.4% (95% CI [31.3%, 32.9%]).
3. GPQA: 125-shot ICL achieved 43.8% accuracy (95% CI [41.2%, 46.4%]), comparable to Claude-3 Opus (p=0.08).
4. BIG-Bench Hard: Reinforced ICL outperformed 3-shot CoT (83% vs. 72.1%, p<0.001, d=1.7).
5. Sentiment analysis: 2048-shot ICL reached 95.2% accuracy (95% CI [94.1%, 96.3%]), overcoming label flips.
6. High-dimensional classification: 2048-shot ICL approached k-NN performance (N=16: 89.7% ± 1.2%, N=64: 79.8% ± 1.8%).
7. Sequential parity: 8

Iteration 7:
New important entities/ideas:
1. Spliceosome binding prevention
2. Exon skipping induction
3. Lariat formation inhibition

Summary:

This paper introduces many-shot in-context learning (ICL) leveraging expanded context windows (up to 1M tokens) of large language models (LLMs), outperforming few-shot ICL across diverse tasks using 100-1000x more examples (up to 8192 shots). Key innovations addressing current limitations include:

1. Reinforced ICL: Extends Reinforced Self-Training to ICL, mitigating human-written rationale bottlenecks. Utilizes zero-shot CoT prompt to generate multiple rationales, filtered via answer correctness. Outperforms few-shot ICL with human-written rationales, even in 3-shot settings. Improves upon existing self-generated data ICL methods by eliminating clustering, post-processing heuristics, and test input access requirements. Addresses challenges in complex reasoning tasks (e.g., GPQA) requiring significant resources and expert knowledge. Demonstr

## Optional: Deeper Analysis of Summary Refinement using Chunking

This section introduces an advanced technique to enhance the Chain of Density (CoD) summarization process for ArXiv PDFs. By incorporating a chunking mechanism, we can handle longer documents more effectively and potentially improve the quality of our summaries. Here's an overview of the augmented CoD summarization process:

### Augmented Chain of Density Summarization

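1. **Chunk the Text**: We split the input text into manageable chunks using the `chunk_text` function. It splits on line boundaries, keeps each image-description block (from `[Image Descriptions for page ...` through `[END OF IMAGE DESCRIPTIONS]`) together, and then merges small pieces up to the chunk size.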

2. **Iterative Chunk Summarization**: For each chunk, we apply the CoD process using `summarize_chunk`. This creates summaries for individual sections of the document.

3. **Combine Chunk Summaries**: We use `summarize_chunk_summaries` to integrate information from all chunk summaries into a cohesive whole.

4. **Iterative Refinement**: We repeat steps 2-3 for a specified number of iterations (`chunk_iterations`) to progressively refine the summary.

5. **Final Density Pass**: After chunk-based summarization, we apply the standard CoD process to further refine and densify the final summary.

Key components of this process include:

- `chunk_text`: Splits the document into manageable pieces.
- `summarize_chunk`: Applies CoD to individual chunks.
- `summarize_chunk_summaries`: Combines chunk summaries.
- `summarize_chunk_iteration`: Manages the iteration process for chunk summarization.
- `iterative_chunk_summarization`: Orchestrates the entire chunk-based summarization process.
- `chain_of_density_summarization`: Integrates chunking with the final CoD refinement.

This augmented approach allows us to:
1. Handle longer documents more effectively.
2. Potentially capture more nuanced information from different parts of the paper.
3. Provide a more comprehensive summary that considers the entire document structure.

The `ArxivChainOfDensityPipeline` class has been updated to incorporate these new features, allowing for easy experimentation with different chunk sizes and iteration counts.

To evaluate the effectiveness of this approach, we've also extended our `quality_scorer` function to analyze the chunk-based summaries separately. This gives us insights into how the chunking process affects summary quality at different stages of the pipeline.

By using this augmented CoD approach, we aim to create more comprehensive and accurate summaries of ArXiv papers, especially for longer or more complex documents.

In [55]:
def _append_joined(chunks, lines):
    """Append the newline-joined, stripped *lines* to *chunks* unless empty."""
    joined = "\n".join(lines).strip()
    if joined:
        chunks.append(joined)


@weave.op()
def chunk_text(text, chunk_size):
    """Split *text* into chunks of roughly *chunk_size* characters.

    The text is split on line boundaries. A block that starts with a line
    beginning ``[Image Descriptions for page`` and runs through the line
    beginning ``[END OF IMAGE DESCRIPTIONS]`` (or to the end of the text if
    no end marker follows) is kept together as one piece. Adjacent pieces are
    then merged, separated by a blank line, while the merged length stays
    within *chunk_size*.

    A single line or image block longer than *chunk_size* is not split, so a
    returned chunk can exceed *chunk_size*. Whitespace-only pieces are
    dropped, so empty or blank input yields an empty list.

    Args:
        text: The full document text.
        chunk_size: Target maximum number of characters per chunk.

    Returns:
        A list of chunk strings in document order.
    """
    image_start = "[Image Descriptions for page"
    image_end = "[END OF IMAGE DESCRIPTIONS]"

    lines = text.split('\n')
    chunks = []
    pending_lines = []
    pending_len = 0  # each pending line counted with its trailing newline

    i = 0
    while i < len(lines):
        line = lines[i]

        if line.startswith(image_start):
            # Close the text gathered so far *before* touching the header so
            # the header line belongs only to the image block.
            _append_joined(chunks, pending_lines)
            pending_lines, pending_len = [], 0

            block_lines = [line]
            i += 1
            while i < len(lines) and not lines[i].startswith(image_end):
                block_lines.append(lines[i])
                i += 1
            if i < len(lines):
                block_lines.append(lines[i])
                # Step past the end marker so it is not re-read as plain text.
                i += 1

            _append_joined(chunks, block_lines)
            continue

        if pending_lines and pending_len + len(line) > chunk_size:
            _append_joined(chunks, pending_lines)
            pending_lines, pending_len = [], 0

        pending_lines.append(line)
        pending_len += len(line) + 1
        i += 1

    _append_joined(chunks, pending_lines)

    # Second pass: greedily merge neighbouring pieces up to chunk_size.
    combined_chunks = []
    current_combined_chunk = ""
    for chunk in chunks:
        if len(current_combined_chunk) + len(chunk) <= chunk_size:
            current_combined_chunk += chunk + "\n\n"
        else:
            if current_combined_chunk:
                combined_chunks.append(current_combined_chunk.strip())
            current_combined_chunk = chunk + "\n\n"

    if current_combined_chunk:
        combined_chunks.append(current_combined_chunk.strip())

    return combined_chunks

In [56]:
@weave.op()
def summarize_chunk(chunk, instruction, current_summary="", iteration=1, model="claude-3-5-sonnet-20240620"):
    """Ask the Anthropic model to fold one chunk into the running summary.

    Args:
        chunk: New text to incorporate.
        instruction: The instruction the summary must stay focused on.
        current_summary: Summary produced so far (empty on the first call).
        iteration: Iteration number, included in the prompt.
        model: Anthropic model id used for the request.

    Returns:
        The text of the first content block of the model's reply.
    """
    request_text = f"""Current summary:
    {current_summary}

    New information:
    {chunk}

    Instruction to focus on: {instruction}

    Iteration: {iteration}

    Create an extremely dense, highly technical summary that specifically addresses the given instruction. Follow these steps:

    1. Identify 3-5 key technical points from the new information that are directly relevant to the instruction, prioritizing:
    - Novel methodologies or algorithms related to the instruction
    - Specific quantitative results or metrics that address the instruction
    - Detailed experimental setups or parameters pertinent to the instruction
    - Precise definitions of domain-specific concepts mentioned in the instruction
    - Critical limitations or assumptions in the research that affect the instruction

    2. Integrate these points with the current summary, ensuring:
    - Direct relevance to the instruction at hand
    - No redundancy or oversimplification
    - Preservation of technical nuances and complexities specific to the instruction
    - Inclusion of relevant equations, formulas, or mathematical notations that help address the instruction
    - Accurate representation of statistical significance and error margins for instruction-related data

    3. Rephrase the combined information to maximize information density while maintaining focus on the instruction:
    - Use domain-specific terminology and jargon without simplification, as relevant to the instruction
    - Maintain the level of detail expected in a PhD-level discourse on the specific topic of the instruction
    - Incorporate precise citations or references where applicable to support the response
    - Preserve any conflicting viewpoints or ongoing debates in the field that relate to the instruction

    4. With each iteration, aim to increase information density by 30-40% without sacrificing technical accuracy or critical details that address the instruction.

    5. Ensure the summary includes instruction-specific:
    - Methodological details (e.g., exact algorithms, parameter settings) that are crucial to addressing the instruction
    - Precise quantitative results with appropriate units and error bounds that directly relate to the instruction
    - Detailed descriptions of novel techniques or approaches that are key to addressing the instruction
    - Critical analysis of strengths and limitations in the research as they pertain to the instruction

    Produce a summary that is significantly more information-dense and technically precise than the previous one, while remaining laser-focused on addressing the given instruction. Use language appropriate for a highly specialized audience in the field."""

    user_message = {"role": "user", "content": request_text}
    reply = anthropic_client.messages.create(
        model=model,
        max_tokens=4096,
        messages=[user_message],
    )
    first_block = reply.content[0]
    return first_block.text

In [57]:
@weave.op()
def summarize_chunk_summaries(instruction, current_summary, chunk_summaries, model="claude-3-opus-20240229"):
    """Merge the per-chunk summaries into one refined summary via the Anthropic model.

    Args:
        instruction: The instruction the summary must stay focused on.
        current_summary: The summary to refine.
        chunk_summaries: Iterable of summary strings; joined with single
            spaces before being placed in the prompt.
        model: Anthropic model id used for the request.

    Returns:
        The text of the first content block of the model's reply.
    """
    joined_chunk_summaries = ' '.join(chunk_summaries)

    request_text = f"""Given this current summary:

    {current_summary}

    And these chunk summaries:

    {joined_chunk_summaries}

    And this instruction to focus on:

    {instruction}

    Create an extremely dense, final summary that refines the current summary by incorporating key information from the chunk summaries, while specifically addressing the given instruction. Follow these guidelines:

    1. Integrate the most relevant and important information from the chunk summaries into the current summary.
    2. Ensure all key technical content from both the current summary and chunk summaries that relates to the instruction is retained.
    3. Aim to reduce overall length by 30-40% while increasing information density.
    4. Prioritize highly specific methodologies, algorithms, metrics, and findings that directly address the instruction.
    5. Preserve precise quantitative data, including statistical significance and error margins where applicable and relevant to the instruction.
    6. Maintain the use of domain-specific terminology and technical jargon pertinent to the instruction.
    7. Use compact phrasing and remove any remaining non-essential information that doesn't directly contribute to addressing the instruction.
    8. If relevant to the instruction, include brief mentions of limitations, assumptions, or conflicting viewpoints from across all summaries.
    9. Optimize for information density while maintaining coherence for an expert audience, always keeping the focus on the given instruction.

    The final summary should be a highly concentrated, technical distillation of all provided summaries that specifically addresses the given instruction, suitable for specialists in the field."""

    reply = anthropic_client.messages.create(
        model=model,
        max_tokens=4096,
        messages=[{"role": "user", "content": request_text}],
    )
    return reply.content[0].text

In [58]:
@weave.op()
def summarize_chunk_iteration(chunks, instruction, current_summary, iteration, model):
    """Run one pass over all chunks, then consolidate the pass into one summary.

    Each chunk is summarized on top of the summary produced by the previous
    chunk, so every entry in the returned list is a running summary. After the
    last chunk, ``summarize_chunk_summaries`` merges the running summaries.
    Progress is printed for every chunk and for the consolidated result.

    Returns:
        ``(consolidated_summary, running_summaries)``.
    """
    running_summary = current_summary
    running_summaries = []

    for chunk_number, chunk in enumerate(chunks, start=1):
        running_summary = summarize_chunk(chunk, instruction, running_summary, iteration, model)
        running_summaries.append(running_summary)
        print(f"Iteration {iteration}, Chunk {chunk_number}:\n{running_summary}\n")

    consolidated = summarize_chunk_summaries(instruction, running_summary, running_summaries, model)
    print(f"Iteration {iteration}, Final Summary:\n{consolidated}\n")
    return consolidated, running_summaries


@weave.op()
def iterative_chunk_summarization(chunks, instruction, current_summary, chunk_iterations, model):
    """Repeat the chunk-summarization pass ``chunk_iterations`` times.

    Each pass starts from the summary the previous pass produced; passes are
    numbered from 1.

    Returns:
        ``(final_summary, summary_after_each_pass, chunk_summaries_per_pass)``
        where the last element is a list of lists (one inner list per pass).
    """
    summary_after_each_pass = []
    chunk_summaries_per_pass = []

    for pass_number in range(1, chunk_iterations + 1):
        outcome = summarize_chunk_iteration(chunks, instruction, current_summary, pass_number, model)
        current_summary, pass_chunk_summaries = outcome
        summary_after_each_pass.append(current_summary)
        chunk_summaries_per_pass.append(pass_chunk_summaries)

    return current_summary, summary_after_each_pass, chunk_summaries_per_pass

In [59]:
@weave.op()
def chain_of_density_summarization(instruction, text, model="claude-3-5-sonnet-20240620", chunk_size=8192, chunk_iterations=2, density_iterations=2):
    """Summarize *text* with chunking followed by density refinement.

    Steps, in order: ``chunk_text`` -> ``iterative_chunk_summarization``
    (starting from an empty summary) -> ``iterative_density_summarization``
    -> ``final_summary``. Chunk counts, chunk sizes and the final summary are
    printed along the way.

    Returns:
        A dict with keys ``final_summary``, ``accumulated_summary``,
        ``iteration_summaries``, ``chunk_iteration_summaries`` and
        ``chunk_summaries``.
    """
    chunks = chunk_text(text, chunk_size)
    chunk_lengths = [len(piece) for piece in chunks]
    print(f"Number of chunks: {len(chunks)}")
    print(f"Chunk sizes: {chunk_lengths}")

    chunk_stage = iterative_chunk_summarization(chunks, instruction, "", chunk_iterations, model)
    current_summary, chunk_iteration_summaries, chunk_summaries = chunk_stage

    density_stage = iterative_density_summarization(instruction, current_summary, density_iterations, model)
    current_summary, iteration_summaries = density_stage

    final_summary_text = final_summary(instruction, current_summary, model)
    print(f"Final Summary:\n{final_summary_text}\n")

    output = {}
    output["final_summary"] = final_summary_text
    output["accumulated_summary"] = current_summary
    output["iteration_summaries"] = iteration_summaries
    output["chunk_iteration_summaries"] = chunk_iteration_summaries
    output["chunk_summaries"] = chunk_summaries
    return output

In [60]:
class ArxivChainOfDensityPipeline(weave.Model):
    """Weave model that summarizes an ArXiv paper with chunked Chain of Density.

    ``predict`` turns the paper into text via ``extract_images`` and
    ``replace_images_with_descriptions`` (both defined elsewhere in this
    file), then runs ``chain_of_density_summarization`` with this instance's
    settings. Results can optionally be memoized per (paper, instruction).
    """

    # Anthropic model id forwarded to chain_of_density_summarization.
    model: str = "claude-3-5-sonnet-20240620"
    # Kept equal to the __init__ default so both construction paths agree.
    chunk_size: int = 4000
    chunk_iterations: int = 1
    density_iterations: int = 3
    # When True, predict() memoizes results in `cache`.
    use_cache: bool = False
    # Maps (paper.entry_id, instruction) -> predict() result.
    cache: dict = {}

    def __init__(self, model: str = "claude-3-5-sonnet-20240620", chunk_size: int = 4000, chunk_iterations: int = 1, density_iterations: int = 3, use_cache: bool = False):
        """Create a pipeline; arguments may be given positionally or by keyword.

        Args:
            model: Anthropic model id.
            chunk_size: Character budget per chunk passed to the summarizer.
            chunk_iterations: Number of chunk-summarization passes.
            density_iterations: Number of density-refinement iterations.
            use_cache: Memoize ``predict`` results on this instance.
        """
        # Hand the fields to the base initializer rather than assigning them
        # afterwards, and always start with a private, empty cache.
        super().__init__(
            model=model,
            chunk_size=chunk_size,
            chunk_iterations=chunk_iterations,
            density_iterations=density_iterations,
            use_cache=use_cache,
            cache={},
        )

    @weave.op()
    def predict(self, paper: ArxivPaper, instruction: str) -> dict:
        """Summarize *paper* according to *instruction*.

        Returns:
            The dict produced by ``chain_of_density_summarization``; served
            from ``self.cache`` on a repeat call when ``use_cache`` is True.
        """
        if self.use_cache:
            cache_key = (paper.entry_id, instruction)
            if cache_key in self.cache:
                return self.cache[cache_key]

        extracted_images = extract_images(paper)
        cleaned_text = replace_images_with_descriptions(paper, extracted_images)
        result = chain_of_density_summarization(
            instruction,
            cleaned_text,
            model=self.model,
            chunk_size=self.chunk_size,
            chunk_iterations=self.chunk_iterations,
            density_iterations=self.density_iterations,
        )

        if self.use_cache:
            # cache_key was bound in the use_cache branch above.
            self.cache[cache_key] = result

        return result

In [61]:
def process_chunk_summaries(model_output, instruction, model):
    """Score every chunk summary and analyse the scores per chunk iteration.

    ``model_output["chunk_summaries"]`` is a list of lists. Each inner list is
    scored summary-by-summary under the label "Chunk Summary <i>.<j>"
    (both 1-based), and the resulting scores feed four analyses.

    Returns:
        A dict keyed ``chunk_summaries_analysis_<i>`` whose values hold
        ``long_tail_stats``, ``iteration_impact``,
        ``optimal_improvement_range`` and ``optimal_score_range``.
    """
    scores = {}
    for outer_index, chunk_list in enumerate(model_output["chunk_summaries"]):
        chunk_summary_scores = [
            score_summary(summary, f"Chunk Summary {outer_index + 1}.{inner_index + 1}", instruction, model)
            for inner_index, summary in enumerate(chunk_list)
        ]

        analysis = {}
        analysis["long_tail_stats"] = calculate_long_tail_stats(chunk_summary_scores)
        analysis["iteration_impact"] = analyze_iteration_impact(chunk_summary_scores)
        analysis["optimal_improvement_range"] = find_optimal_improvement_range(chunk_summary_scores)
        analysis["optimal_score_range"] = find_optimal_score_range(chunk_summary_scores)
        scores[f"chunk_summaries_analysis_{outer_index + 1}"] = analysis

    return scores


def process_chunk_iteration_summaries(model_output, instruction, model):
    """Score each entry of ``model_output["chunk_iteration_summaries"]`` and aggregate.

    Every summary is scored with ``score_summary`` under the label
    "Chunk Iteration Summary <n>" (1-based). Only ``calculate_long_tail_stats``
    is applied; the iteration-impact and optimal-range analyses are not run here.

    Returns:
        A dict with the single key ``"long_tail_stats"``.
    """
    chunk_iteration_scores = []
    for index, summary in enumerate(model_output["chunk_iteration_summaries"]):
        label = f"Chunk Iteration Summary {index + 1}"
        chunk_iteration_scores.append(score_summary(summary, label, instruction, model))

    analysis = {"long_tail_stats": calculate_long_tail_stats(chunk_iteration_scores)}
    return analysis

In [62]:
@weave.op()
def quality_scorer(instruction, model_output, model="gpt-4o"):
    """Score a chunked pipeline output and return the scores as one flattened dict.

    Covers, in order: chunk summaries, chunk-iteration summaries, density
    iteration summaries, the accumulated summary and the final summary. Each
    section and then the combined mapping is passed through ``flatten_dict``
    (defined elsewhere in this file).

    Any exception is caught: the message is printed and stored under
    ``"error"`` in whatever un-flattened scores were gathered so far.
    """
    # NOTE(review): "chunk_summaries_analysis" stays an empty placeholder;
    # process_chunk_summaries returns keys suffixed with "_<n>" instead.
    scores = {
        section: {}
        for section in (
            "chunk_summaries_analysis",
            "chunk_iteration_summaries_analysis",
            "iteration_summaries_analysis",
            "accumulated_summary",
            "final_summary",
        )
    }

    try:
        scores.update(process_chunk_summaries(model_output, instruction, model))

        scores["chunk_iteration_summaries_analysis"] = process_chunk_iteration_summaries(model_output, instruction, model)
        scores["iteration_summaries_analysis"] = process_iteration_summaries(model_output, instruction, model)

        # The accumulated and final summaries are scored the same way; only
        # the output key and the human-readable label differ.
        for section, label in (
            ("accumulated_summary", "Accumulated Summary"),
            ("final_summary", "Final Summary"),
        ):
            scores[section] = score_summary(model_output[section], label, instruction, model)

        # Flatten each section first, then the combined mapping.
        per_section = {
            section: flatten_dict(payload) if isinstance(payload, dict) else payload
            for section, payload in scores.items()
        }
        scores = flatten_dict(per_section)

    except Exception as e:
        message = str(e)
        print(f"Error in quality_scorer: {message}")
        scores["error"] = message

    return scores

In [None]:
# evaluation = weave.Evaluation(dataset=dataset, scorers=[quality_scorer])
# for model in models:
#     arxiv_chain_of_density_pipeline = ArxivChainOfDensityPipeline(model=model, density_iterations=1, chunk_iterations=1)
#     await evaluation.evaluate(arxiv_chain_of_density_pipeline)