In [15]:
from utils import *
import os

# Reuse the cached arxiv.csv when it is present; otherwise fall back to the
# JSON loader (which, per the original note, writes arxiv.csv as it loads).

if os.path.exists('arxiv.csv'):
    df = pd.read_csv('arxiv.csv')
else:
    df = load_arxiv_json()


In [16]:
# Get the abstracts: take the 'abstract' column of df as an array, then
# preview the last entry as this cell's output.
abstracts = df['abstract'].values
abstracts[-1]

"  Generative adversarial networks (GANs) are frequently utilized in astronomy\nto construct an emulator of numerical simulations. Nevertheless, training GANs\ncan prove to be a precarious task, as they are prone to instability and often\nlead to mode collapse problems. Conversely, the diffusion model also has the\nability to generate high-quality data without adversarial training. It has\nshown superiority over GANs with regard to several natural image datasets. In\nthis study, we undertake a quantitative comparison between the denoising\ndiffusion probabilistic model (DDPM) and StyleGAN2 (one of the most robust\ntypes of GANs) via a set of robust summary statistics from scattering\ntransform. In particular, we utilize both models to generate the images of 21\ncm brightness temperature mapping, as a case study, conditionally based on\nastrophysical parameters that govern the process of cosmic reionization. Using\nour new Fr\\'echet Scattering Distance (FSD) as the evaluation metric to

In [64]:
from langchain.prompts import StringPromptTemplate
from pydantic import BaseModel, validator

# Instruction template for the extraction task. It is filled in with
# str.format, and {abstract} is its only placeholder, so any literal curly
# braces added to this text later must be doubled ({{ and }}).
# The five keys listed below are the same names used as fields of the
# Hypothesis model defined later in this notebook.
PROMPT = """\
Please consider the following abstract:

{abstract}

I'd like to extract a structured representation from it. To achieve this, do the following steps by thinking out loud, and thinking step by step:

Begin by performing syntactic simplification of complex sentences in the abstract. 
Break them down into simpler, more digestible statements, but keep the key details intact, particularly the context, field of study, and application of the problem.
Next, organize the main concepts into a JSON-style object with the following structure:

- 'Problem': 'Issues or challenges addressed.',
- 'Solution': 'Proposed solutions or methods.'
- 'Methodology': 'Implementation details of the solutions or methods.'
- 'Evaluation': 'How results or solutions are assessed.'
- 'Results': 'Conclusions drawn from the study.'

Your goal is to capture the essence of the abstract in a clear and structured manner, highlighting the most critical elements using the provided categories.
"""

class HypothesisExtractionPromptTemplate(StringPromptTemplate, BaseModel):
    """Prompt template that embeds a paper abstract in the extraction instructions.

    The template accepts exactly one input variable, ``abstract``, and renders
    it into the module-level ``PROMPT`` string.
    """

    @validator("input_variables")
    def validate_input_variables(cls, v):
        """Ensure ``input_variables`` is exactly ``["abstract"]``.

        Raises:
            ValueError: if there is not exactly one variable or it is not
                named ``abstract``.
        """
        if len(v) != 1 or "abstract" not in v:
            raise ValueError("abstract must be the only input_variable.")
        return v

    def format(self, **kwargs) -> str:
        """Return ``PROMPT`` with the ``{abstract}`` placeholder filled in.

        Args:
            **kwargs: must contain ``abstract``; other keys are ignored.

        Raises:
            KeyError: if ``abstract`` is not supplied.
        """
        # Same exception type as a plain dict lookup, but with a message that
        # tells the caller what is actually missing.
        if "abstract" not in kwargs:
            raise KeyError("format() requires an 'abstract' keyword argument.")
        return PROMPT.format(abstract=kwargs["abstract"])

    @property
    def _prompt_type(self) -> str:
        """Identifier for this template type.

        Declared as a property because the langchain base class declares
        ``_prompt_type`` as a property and reads it as an attribute when
        serialising a prompt; a plain method would hand back a bound method
        instead of this string.
        """
        return "hypothesis-extraction"

# Smoke check: build the template, render it with a placeholder abstract and
# print the resulting prompt text.
hypothesis_extractor = HypothesisExtractionPromptTemplate(
    input_variables=["abstract"],
)
prompt = hypothesis_extractor.format(
    abstract="Your provided abstract here.",
)
print(prompt)

Please consider the following abstract:

Your provided abstract here.

I'd like to extract a structured representation from it. To achieve this, do the following steps by thinking out loud, and thinking step by step:

Begin by performing syntactic simplification of complex sentences in the abstract. 
Break them down into simpler, more digestible statements, but keep the key details intact, particularly the context, field of study, and application of the problem.
Next, organize the main concepts into a JSON-style object with the following structure:

- 'Problem': 'Issues or challenges addressed.',
- 'Solution': 'Proposed solutions or methods.'
- 'Methodology': 'Implementation details of the solutions or methods.'
- 'Evaluation': 'How results or solutions are assessed.'
- 'Results': 'Conclusions drawn from the study.'

Your goal is to capture the essence of the abstract in a clear and structured manner, highlighting the most critical elements using the provided categories.



In [66]:
from pydantic import BaseModel, Field

# Pydantic model for the structured representation extracted from an abstract.
# It has five required string fields; their names and descriptions mirror the
# five keys listed in the PROMPT text (Problem, Solution, Methodology,
# Evaluation, Results).
class Hypothesis(BaseModel):
    Problem: str = Field(description="Issues or challenges addressed.")
    Solution: str = Field(description="Proposed solutions or methods.")
    Methodology: str = Field(description="Implementation details of the solutions or methods.")
    Evaluation: str = Field(description="How results or solutions are assessed.")
    Results: str = Field(description="Conclusions drawn from the study.")

# Sample LLM output, hard-coded here as a strict-JSON string so the Hypothesis
# model can be exercised without calling a model. The keys are exactly the
# five Hypothesis field names.
llm_output_str = """
{
    "Problem": "GANs are used in astronomy for numerical simulations but have training challenges due to instability and mode collapse problems.",
    "Solution": "The diffusion model offers an alternative to GANs, capable of generating high-quality data without adversarial training. Additionally, DDPM and StyleGAN2 are compared for image generation.",
    "Methodology": "This study uses summary statistics from scattering transform for the comparison. Both models generate images of 21 cm brightness temperature mapping based on astrophysical parameters.",
    "Evaluation": "Fréchet Scattering Distance (FSD) is used to compare sample distributions. Fisher forecasts evaluate mode collapses in models.",
    "Results": "DDPM outperforms StyleGAN2 on varied training set sizes and offers more robust image generation. The diffusion model is highlighted as a promising alternative to GANs for accurate image generation, providing reliable parameter constraints in astrophysics."
}
"""

# Parse the JSON string into a Hypothesis instance and display it as the
# cell output.
hypothesis_data = Hypothesis.parse_raw(llm_output_str)
hypothesis_data

Hypothesis(Problem='GANs are used in astronomy for numerical simulations but have training challenges due to instability and mode collapse problems.', Solution='The diffusion model offers an alternative to GANs, capable of generating high-quality data without adversarial training. Additionally, DDPM and StyleGAN2 are compared for image generation.', Methodology='This study uses summary statistics from scattering transform for the comparison. Both models generate images of 21 cm brightness temperature mapping based on astrophysical parameters.', Evaluation='Fréchet Scattering Distance (FSD) is used to compare sample distributions. Fisher forecasts evaluate mode collapses in models.', Results='DDPM outperforms StyleGAN2 on varied training set sizes and offers more robust image generation. The diffusion model is highlighted as a promising alternative to GANs for accurate image generation, providing reliable parameter constraints in astrophysics.')

In [72]:
import openai
import re
from langchain.chat_models import AzureChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.schema import HumanMessage
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from typing import Dict

# Azure OpenAI connection settings.
#
# The API key is a secret and must never be stored in the notebook: it is read
# from the AZURE_OPENAI_API_KEY environment variable instead. The key that was
# previously committed here should be treated as leaked and rotated.
# NOTE(review): API_KEY is None when the variable is unset - set it before
# running the model cell below.
API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")

# Non-secret settings keep their previous values as defaults and can be
# overridden through the environment without editing the notebook.
DEPLOYMENT_NAME = os.environ.get("AZURE_OPENAI_DEPLOYMENT", "gpt4_small")
BASE_URL = os.environ.get("AZURE_OPENAI_ENDPOINT", "https://utbd.openai.azure.com/")
API_VERSION = os.environ.get("AZURE_OPENAI_API_VERSION", "2023-03-15-preview")

# 1. Build the Azure-hosted chat client from the connection settings above
model = AzureChatOpenAI(
    openai_api_type="azure",
    openai_api_key=API_KEY,
    openai_api_base=BASE_URL,
    openai_api_version=API_VERSION,
    deployment_name=DEPLOYMENT_NAME,
)

# 2. Render the prompt text for one abstract
def generate_custom_prompt(abstract):
    """Return the PROMPT template with ``abstract`` substituted into it."""
    rendered = PROMPT.format(abstract=abstract)
    return rendered

# 3. Send the prompt to the chat model and return its reply text
def get_model_response(abstract):
    """Ask the chat model about ``abstract`` and return the reply's content.

    The rendered prompt is sent as a single human message.
    """
    user_message = HumanMessage(content=generate_custom_prompt(abstract))
    reply = model([user_message])
    return reply.content

def _strip_field_quotes(text: str) -> str:
    """Remove one layer of quoting from a key or value token.

    A token wrapped in a pair of double quotes is unwrapped; otherwise stray
    single quotes are stripped from both ends, as the original parser did.
    """
    text = text.strip()
    if len(text) >= 2 and text[0] == '"' and text[-1] == '"':
        return text[1:-1]
    return text.strip("'")


# 4. Parse the response to get the structured representation
def extract_to_hypothesis(response: str) -> Hypothesis:
    """Build a ``Hypothesis`` from the first ``{...}`` span in ``response``.

    The span is first parsed as strict JSON. If that fails, it is read line by
    line as ``key: value`` pairs, tolerating single or double quotes, trailing
    commas, leading ``-``/``*`` bullets and lines without a colon.

    Args:
        response: raw text returned by the model.

    Returns:
        A ``Hypothesis`` populated from the extracted key/value pairs.

    Raises:
        ValueError: if ``response`` contains no ``{...}`` span. Errors raised
            by ``Hypothesis`` itself (for example missing fields) propagate
            unchanged.
    """
    import json  # local import: this cell's import list does not include json

    # Extract the first brace-delimited span (non-greedy, may cover several lines)
    match = re.search(r'\{(.+?)\}', response, flags=re.DOTALL)
    if not match:
        raise ValueError("Could not find a JSON-like structure in the response.")

    # Preferred path: the span is valid JSON (double-quoted keys and values).
    try:
        parsed = json.loads(match.group(0))
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        parsed = None
    if isinstance(parsed, dict):
        return Hypothesis(**parsed)

    # Fallback: loosely formatted "key: value" lines.
    data: Dict[str, str] = {}
    for line in match.group(1).splitlines():
        if ':' not in line:
            # Blank line or stray text: nothing to split, so skip it rather
            # than fail on tuple unpacking.
            continue
        key, value = line.split(':', 1)  # split on the first colon only
        key = _strip_field_quotes(key.strip().lstrip('-*'))
        value = _strip_field_quotes(value.strip().rstrip(','))
        data[key] = value

    # Populate the Hypothesis class
    return Hypothesis(**data)

# 5. Full pipeline: abstract -> model response -> Hypothesis
def extract_hypothesis(abstract):
    """Query the model about ``abstract`` and parse its reply into a ``Hypothesis``.

    Args:
        abstract: text of the abstract to analyse.

    Returns:
        The ``Hypothesis`` produced by ``extract_to_hypothesis`` from the
        model's reply.
    """
    response = get_model_response(abstract)
    return extract_to_hypothesis(response)

# Use the pipeline on the last abstract in the dataset and print the parsed
# Hypothesis. Note: this sends a request to the chat model each time it runs.
abstract = abstracts[-1]
result = extract_hypothesis(abstract)
print(result)

Problem='Training GANs can be unstable and often leads to mode collapse problems, especially in the field of astronomy for emulating numerical simulations.' Solution='The Diffusion Model, specifically the Denoising Diffusion Probabilistic Model (DDPM), can generate high-quality data without adversarial training, offering a promising alternative to GANs.' Methodology='Both DDPM and a type of GAN, StyleGAN2, are used to generate images of 21 cm brightness temperature mapping based on astrophysical parameters that control cosmic reionization. The role of classifier-free guidance in DDPM is explored, with a preference for non-zero guidance scale when training data is limited.' Evaluation='The new Fréchet Scattering Distance (FSD) is used as the evaluation metric to compare the sample distribution between the generative models and simulations.' Results='DDPM outperforms StyleGAN2 on various sizes of training sets. While StyleGAN2 exhibits mode collapses in different ways, DDPM provides a mo