In [None]:
from crewai import Agent, Task, Crew
import os
import fitz
import os

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string made of each page's ``get_text()`` output in page order,
        with nothing inserted between pages. An empty document yields ``""``.
    """
    doc = fitz.open(pdf_path)
    try:
        # join() builds the result once instead of re-copying the growing
        # string on every ``text += ...``.
        return "".join(page.get_text() for page in doc)
    finally:
        # Always release the underlying file, even if a page fails to parse;
        # this function is called once per paper, so leaked handles add up.
        doc.close()

In [None]:
# Directory scanned for input PDFs, and directory the review files are written to.
pdf_folder, output_folder = "ICLR_2020_papers", "ICLR_2020_LLM_reviews"

# Create the output directory up front; exist_ok keeps this cell re-runnable.
os.makedirs(name=output_folder, exist_ok=True)

In [None]:
# Drafting agent: writes the first-pass review that the later tasks take as
# context. The Weak Accept band in the goal is 5-6 so that it agrees with the
# thresholds given to the refinement agent and the final-review task.
review_generator = Agent(
    role="Review Generator",
    goal="""
    Generate a structured, detailed review of a research paper in paragraph form (max 300 words)  covering all major sections (Abstract, Introduction, Methodology, Experiments, Results, etc.).
    - Assess aspects such as originality, impact, clarity, empirical soundness, and recommendation in the paper.
    - Clearly mention the paper's title and domain.
    - Discuss strengths, weaknesses, and potential areas for improvement.
    - The review should be thorough and critical, analyzing the paper as if you are a domain expert in the field.
    - Avoid merely summarizing the paper; engage deeply with its strengths, limitations, and impact.
    - Be constructively critical—highlight flaws, but in a professional and polite tone.
    - Provide constructive criticism with actionable suggestions.
    - Include a score for rating (out of 10) and acceptance decision (Strong Reject → Strong Accept) and confidence score (a value from 1 to 5),
      which will be refined in later steps.
    - The final decision should adhere to these thresholds:   1-3 → Strong Reject, 3-5 → Weak Reject, 5-6 → Weak Accept, 7-10 → Strong Accept.
    - Review the paper as if you are the expert of that domain.
    - Rating, Decision and confidence scored should not be part of the 300 words. They are to be written in a list form at the end of the review.
    """,
    backstory="You are an AI research reviewer trained to critically analyze scientific papers and provide structured, insightful critiques.",
    # The model is chosen through `llm`; `openai_model` is not among the
    # documented Agent attributes.
    # NOTE(review): passing the model name as a string assumes a crewai release
    # whose `llm` accepts strings — confirm against the installed version.
    llm="gpt-4o",
    verbose=True,
    memory=True
)

In [None]:
# Evaluation agent: critiques the generated review (coverage, tone, hedging,
# and whether the scores match the text) and reports what is missing.
review_evaluator = Agent(
    role="Review Evaluator",
    goal="""
    Evaluate the generated review for depth, specificity, and logical consistency.

    Key Evaluation Criteria:
    - Ensure that all major sections (Abstract, Introduction, Methodology, Experiments, etc.) are covered and all aspects (Originality, Impact, Clarity, etc.) are analyzed.
    - Insightfulness: How does the paper contribute to existing work? Is it impactful?
    - Coverage: Does the review discuss all key sections of the paper? What is missing?
    - Constructive Criticism: Are actionable improvement suggestions given?
    - Politeness & Code of Conduct: Is criticism expressed professionally and politely?
    - Score Consistency: Does the rating and confidence score align with the review text?
    - Hedge Words Detection: If there are excessive hedge words ("might," "could," "potentially"), reduce the confidence score.
    - Check if the reviewer is consistent with the review, at sentence level and aspect level
    Output:
    - Identify missing aspects in the review.
    - Suggest improvements to make the review more insightful and constructive.
    - Flag any inconsistencies between review text and final rating.
    """,
    backstory="You are an AI expert in review quality assessment, ensuring fairness, depth, and logical alignment in critiques.",
    verbose=True,
    memory=True,
    # The model is chosen through `llm`; `openai_model` is not among the
    # documented Agent attributes.
    # NOTE(review): passing the model name as a string assumes a crewai release
    # whose `llm` accepts strings — confirm against the installed version.
    llm="gpt-4o"
)

In [None]:
# Consistency agent: looks for contradictions between the review text and its
# scores, missing sections/aspects, and tone problems.
# NOTE(review): the second goal sentence asks this agent to "refine the review"
# using "mismatch agent feedback", which reads like the refinement agent's job;
# left unchanged here — confirm the intended wording.
mismatch_flagger = Agent(
    role="Mismatch Flagging & Counterfactual Agent",
    goal="""
    Critically analyze the review for logical inconsistencies, contradictions, or missing critiques.
    Incorporate evaluator and mismatch agent feedback to refine the review.
    Key Checks:
    - Highlight missing sections and aspects.
    - Rating Alignment: Does the final score (out of 10) and acceptance decision reflect the review's actual content?
    - Section Coverage: How many sections of the paper does the review cover? Which sections are missing?
    - Aspect Coverage: How many aspects of the paper does the review cover? Which aspects are missing?
    - Depth Analysis: Does the review discuss technical details, methodology, results, and limitations adequately?
    - Counterfactual Reasoning: How would the review change if certain aspects were emphasized more or removed?
      (Example: If the results section was stronger, would the review be more positive?)
    - Tone Check: Ensure the review follows a constructive, polite, and professional tone.
      - If criticism is too harsh, suggest ways to soften it while keeping the critique valid.

    Output:
    - Identify logical gaps or contradictions in the review.
    - Suggest specific revisions to ensure fairness, depth, and consistency.
    """,
    backstory="You are an AI specializing in logical consistency, depth analysis, and counterfactual reasoning for research critiques.",
    verbose=True,
    memory=True,
    # The model is chosen through `llm`; `openai_model` is not among the
    # documented Agent attributes.
    # NOTE(review): passing the model name as a string assumes a crewai release
    # whose `llm` accepts strings — confirm against the installed version.
    llm="gpt-4o"
)

In [None]:
# Refinement agent: rewrites the review using the evaluator's and the
# mismatch agent's feedback and emits the final rating / decision / confidence.
# NOTE(review): the threshold bands in the goal overlap at 3 and 5 and leave
# 6-7 unassigned; kept as written — confirm the intended boundaries.
refinement_agent = Agent(
    role="Refinement Agent",
    goal="""
    Improve the review by integrating feedback from the Evaluator and Counterfactual Agent.

    - Ensure that the rating (out of 10) and final decision (Strong Accept → Strong Reject) logically reflect the critique and align with the set thresholds.
    - Expand on missing sections and make the review more insightful.
    - Ensure the tone is professional, constructive, and polite.
    - Add actionable improvement suggestions (what can the authors do to improve?).
    - Ensure that the rating (out of 10) and final decision (Strong Accept → Strong Reject) logically reflect the critique.
    Strict Decision Thresholds (NO EXCEPTIONS):
    - Strong Reject: 1-3
    - Weak Reject: 3-5
    - Weak Accept: 5-6
    - Strong Accept: 7-10
    - Final review must contain point of action for the authors to improve the paper.
    Final Review Format:
    - Structured review (paragraph form, max 300 words)
    - Rating (out of 10)
    - Acceptance Decision STRICTLY FOLLOWING THE DECISION THRESHOLDS(Strong Accept, Weak Accept, Weak Reject, Strong Reject)
    - Confidence Score (1-5)
    """,
    backstory="You refine research reviews, ensuring clarity, fairness, and logical coherence.",
    verbose=True,
    memory=True,
    # The model is chosen through `llm`; `openai_model` is not among the
    # documented Agent attributes.
    # NOTE(review): passing the model name as a string assumes a crewai release
    # whose `llm` accepts strings — confirm against the installed version.
    llm="gpt-4o",
)

In [None]:
# Batch driver: for every PDF in `pdf_folder`, extract its text, run the
# four-step review pipeline (draft -> evaluate -> flag mismatches -> refine),
# and write the final review to `<output_folder>/<pdf stem>_review.txt`.
#
# A failure on one paper (unreadable PDF, no extractable text, LLM/API error)
# is recorded in `failed_papers` and reported, and the batch moves on, so one
# bad file no longer discards the remaining papers of a long, paid run.
failed_papers = []  # (file name, reason) for every paper that was skipped

# sorted() gives a reproducible processing order; os.listdir order is arbitrary.
for pdf_file in sorted(os.listdir(pdf_folder)):
    # Case-insensitive so that "paper.PDF" is not silently ignored.
    if pdf_file.lower().endswith(".pdf"):
        pdf_path = os.path.join(pdf_folder, pdf_file)
        try:
            paper_text = extract_text_from_pdf(pdf_path)
        except Exception as exc:
            failed_papers.append((pdf_file, f"text extraction failed: {exc}"))
            print(f"Skipping {pdf_file}: text extraction failed ({exc})")
            continue

        # Without any text the agents would be reviewing an empty prompt.
        if not paper_text.strip():
            failed_papers.append((pdf_file, "no extractable text"))
            print(f"Skipping {pdf_file}: no extractable text")
            continue

        # Define tasks
        # The section/aspect checklist is written as plain prompt lines; the
        # quote marks, "\n" escapes and trailing comma left over from an
        # earlier string concatenation have been removed.
        generate_review_task = Task(
            description=f"""
            Generate an initial structured review of the provided research paper {paper_text} (include the paper title and domain) in no more than 300 words.
            The review should highlight the paper's strengths, weaknesses, and potential areas for improvement.
            Do not include generic statements. The output should also include a placeholder and also an initial value for a rating and acceptance decision and confidence score that will later be refined.
            The review should cover all the following sections:
                - Abstract (ABS)
                - Introduction (INT)
                - Related Works (RWK)
                - Problem Definition/Idea (PDI)
                - Data/Datasets (DAT)
                - Methodology (MET)
                - Experiments (EXP)
                - Results (RES)
                - Tables & Figures (TNF)
                - Analysis (ANA)
                - Future Work (FWK)
                - Overall (OAL)
                - Bibliography (BIB)
                - External Knowledge (EXT)

                Additionally, assess the paper based on the following aspects:
                - Appropriateness (APR)
                - Originality (NOV)
                - Significance (IMP)
                - Meaningful Comparison (CMP)
                - Presentation & Formatting (PNF)
                - Recommendation (REC)
                - Empirical & Theoretical Soundness (EMP)
                - Substance (SUB)
                - Clarity (CLA)

                Conclude with a final rating (1-10), confidence score (1-5), and recommendation (Strong Reject → Strong Accept), but not in the review text.
            """,
            agent=review_generator,
            expected_output="A structured review highlighting strengths, weaknesses, and improvement areas."
        )

        evaluate_review_task = Task(
            description=f"""
            Assess the review of the paper: {paper_text} based on depth, politeness, hedge words, and rating consistency.
            Provide detailed feedback, especially noting if the rating and decision are logically aligned with the strengths and weaknesses discussed.
            Ensure all sections (ABS, INT, RWK, PDI, DAT, MET, EXP, RES, TNF, ANA, FWK, OAL, BIB, EXT) are covered.
            Ensure all Aspects (APR, NOV, IMP, CMP, PNF, REC, EMP, SUB, CLA) are assessed.
            """,
            agent=review_evaluator,
            context=[generate_review_task],
            expected_output="An evaluation report highlighting missing aspects and tone issues."
        )

        # NOTE(review): this description opens with "Refine the review" although
        # the expected output is a list of findings, and its context does not
        # include the evaluator's report — confirm both are intended.
        flag_mismatches_task = Task(
            description=f"""
            Refine the review for the paper: {paper_text} by addressing all flagged issues, ensuring depth and consistency.
              - Check whether the final rating and decision (e.g., '8/10, Strong Accept') match the review's content.
              - Suggest specific counterfactual scenarios: How would the review change if certain aspects were emphasized differently?
              - Provide **specific** recommendations for aligning the review critique with its final scoring.
              - Check whether the critical sections and aspects are missing from the review or not.
              """,
            agent=mismatch_flagger,
            context=[generate_review_task],
            expected_output="A list of inconsistencies, counterfactual insights, and improvement suggestions."
        )

        # expected_output states the confidence range as 1-5, matching the
        # description below and the agents' goals (it previously said 1-10).
        generate_final_review_task = Task(
            description=f"""
            Refine the initial review of {paper_text} by incorporating feedback from the evaluator and counterfactual agent.
            - Ensure that the review is in a paragraph form in not more than 300 words.
            - Ensure that the review has depth and consistency.
            - Ensure logical consistency and fair critique.
            - Explicitly mention the paper's title and field/domain.
            - Clearly mention strengths, weaknesses, and improvement suggestions.
            - The final review must also include at end seperately listed down:
              - A Rating (out of 10 for the paper) that logically reflects the critique.
              - An Acceptance Decision(Strong Accept, Weak Accept, Weak Reject, Strong Reject).
              - A confidence score (1-5) for the review.
            -**Strict Decision Thresholds (NO EXCEPTIONS):**
            - **Strong Reject:** 1-3
            - **Weak Reject:** 3-5
            - **Weak Accept:** 5-6
            - **Strong Accept:** 7-10
            - **Strictly follow these thresholds for the final decision**. If the paper is very good and flawless then only consider giving a score of 7 or more than 7.
            -If the paper is given a weak accept, then the rating shouldnt be more than 6. and so on for all the thresholds.
            - Strong accept to be given to a paper if it is very good and flawless.
            **Ensure that critical reviews have lower ratings and rejections, and positive reviews have higher ratings and acceptances.**
            -  Review the paper as if you are the expert of that domain.
            """,
            agent=refinement_agent,
            context=[generate_review_task, evaluate_review_task, flag_mismatches_task],
            expected_output="A final, polished review with clear action points. Review with structured output: Review text, Rating (out of 10), Decision, Confidence Score (1-5)."
        )

        crew = Crew(
            agents=[review_generator, review_evaluator, mismatch_flagger, refinement_agent],
            tasks=[generate_review_task, evaluate_review_task, flag_mismatches_task, generate_final_review_task],
            verbose=True
        )

        try:
            crew.kickoff()
        except Exception as exc:
            failed_papers.append((pdf_file, f"review pipeline failed: {exc}"))
            print(f"Skipping {pdf_file}: review pipeline failed ({exc})")
            continue

        final_review_output = generate_final_review_task.output.raw

        output_file = os.path.join(output_folder, f"{os.path.splitext(pdf_file)[0]}_review.txt")
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(final_review_output)

        print(f"Saved final review for {pdf_file} to {output_file}")

# Surface everything that was skipped so a partial batch is not mistaken for
# a complete one.
if failed_papers:
    print(f"{len(failed_papers)} paper(s) were not reviewed:")
    for failed_name, failure_reason in failed_papers:
        print(f"  - {failed_name}: {failure_reason}")

In [None]:
# Initial review draft. The task names are rebound on every loop iteration,
# so this shows the output for the last paper the loop built tasks for.
task_output = generate_review_task.output
print("Raw Output:", task_output.raw)

In [None]:
# Evaluator's report on the draft (last paper the loop built tasks for).
task_output = evaluate_review_task.output
print("Task 2:", task_output.raw)

In [None]:
# Mismatch / counterfactual findings (last paper the loop built tasks for).
task_output = flag_mismatches_task.output
print("Task 3:", task_output.raw)

In [None]:
# Refined final review (last paper the loop built tasks for).
task_output = generate_final_review_task.output
print("Final review:", task_output.raw)