In [4]:
import time
import os
import openai
import json
from perform_review import perform_review, load_paper, reviewer_system_prompt_neg


def review_paper(
    paper_name,
    folder_name,
    num_reviews=5,
    reviewer_system_prompt=reviewer_system_prompt_neg,
):
    """Generate AI reviews for one paper and save each review to disk.

    The paper is read from ``<folder_name>/<paper_name>.pdf`` via
    ``load_paper``. ``perform_review`` is then called ``num_reviews`` times
    and each result is written as indented JSON to
    ``<folder_name>/ai_reviews/<timestamp>_review_<i>_workshop_track.txt``.

    Args:
        paper_name: File name of the paper without the ``.pdf`` extension.
        folder_name: Directory that contains the PDF; reviews are written to
            its ``ai_reviews`` subdirectory (created if missing).
        num_reviews: Number of independent ``perform_review`` calls to make.
        reviewer_system_prompt: System prompt forwarded to ``perform_review``.

    Returns:
        A list with the ``num_reviews`` review objects, in generation order.
    """
    paper_text = load_paper(os.path.join(folder_name, paper_name + ".pdf"))

    # Create the output directory up front so an expensive review is never
    # lost to a FileNotFoundError when the file is opened for writing.
    output_dir = os.path.join(folder_name, "ai_reviews")
    os.makedirs(output_dir, exist_ok=True)

    # One client is enough for all requests; no need to rebuild it per review.
    client = openai.OpenAI()

    all_reviews = []
    for i in range(num_reviews):
        review = perform_review(
            paper_text,
            model="gpt-4o-2024-05-13",
            client=client,
            num_reflections=5,
            num_fs_examples=1,
            num_reviews_ensemble=5,
            temperature=0.1,
            reviewer_system_prompt=reviewer_system_prompt,
        )
        all_reviews.append(review)

        # The timestamp keeps files from different runs apart; the index `i`
        # keeps reviews from the same run (possibly same second) apart.
        timestamp = time.strftime("%Y%m%d-%H%M%S")
        review_path = os.path.join(
            output_dir, f"{timestamp}_review_{i}_workshop_track.txt"
        )
        # NOTE(review): assumes perform_review returns a JSON-serializable
        # object (json.dumps is applied to it) -- confirm against its source.
        with open(review_path, "w") as f:
            f.write(json.dumps(review, indent=4))
        print(f"{paper_name}: Finished review {i+1}/{num_reviews}")
    return all_reviews

In [5]:
# Call-for-papers text of the workshop: the four required elements of a
# submission followed by the five additional assessment criteria.
ws_description = """Submitted papers should contain the following four elements:

1. A use case that was tackled with deep learning. 

2. A solution for this type of use case was proposed in the deep learning literature

3. A description of the (negative) outcome in the solution. 

4. An investigation (and ideally an answer) to the question of why it did not work as promised by the deep learning literature. 

The potential reasons for failure may include but are not limited to data-related issues (e.g., distribution shift, bias, label quality, noisy measurement, quality of simulated data), model limitations (e.g., assumption violations, robustness, interpretability, scalability, representation misalignment), and deployment challenges (e.g., computational demands, hardware constraints). Besides these four points, papers will be assessed on:

1. Rigor and transparency in the scientific methodologies employed. 

2. Novelty and significance of insights.

3. Quality of discussion of limitations.

4. Reproducibility of results.

5. Clarity of writing.
"""

# Reviewer system prompt for the workshop track. The fragments are joined
# with explicit separators; plain concatenation would fuse the sentences
# ("...workshop.The workshop is called ...'.Submitted papers ...").
review_prompt_ws = "You are an AI researcher who is reviewing a paper that was submitted to a ML conference workshop."

review_prompt_ws += (
    " The workshop is called 'I Can't Believe It's Not Better: Challenges in Applied Deep Learning'."
    + "\n\n"
    + ws_description
)

In [None]:
# Single workshop-track review of the compositional-regularization paper.
reviews = review_paper(
    paper_name="raw_paper",
    folder_name="../compositional-regularization",
    reviewer_system_prompt=review_prompt_ws,
    num_reviews=1,
)

raw_paper: Finished review 1/1


In [9]:
# Single workshop-track review of the pest-detection paper.
reviews = review_paper(
    paper_name="raw_paper",
    folder_name="../pest-detection",
    reviewer_system_prompt=review_prompt_ws,
    num_reviews=1,
)

raw_paper: Finished review 1/1


In [10]:
# Single workshop-track review of the label-noise paper.
reviews = review_paper(
    paper_name="raw_paper",
    folder_name="../label-noise",
    reviewer_system_prompt=review_prompt_ws,
    num_reviews=1,
)

raw_paper: Finished review 1/1
