Imports and PaperMetadata class

In [None]:
!pip install openai pydantic python-docx reportlab tiktoken PyPDF2

import os
import json
from typing import Optional, Dict, List
from pydantic import BaseModel
import openai
from docx import Document
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from pathlib import Path
import tiktoken
from PyPDF2 import PdfReader



class PaperMetadata(BaseModel):
    """Metadata about the paper being reviewed"""
    # Paper title (required).
    title: str
    # Author names, one list entry per author (required).
    authors: List[str]
    # Optional research field of the paper.
    field: Optional[str] = None
    # Optional free-text field; no code visible in this file reads it.
    citation_context: Optional[str] = None
    # Optional summary text; test_with_paper fills it with the extracted abstract.
    author_summary: Optional[str] = None



In [None]:
class PaperReview(BaseModel):
    """Structured review of a paper.

    The field names mirror the keys of the JSON object the reviewer prompt
    asks the model to return. ``quality_metrics`` is not part of that JSON:
    it is optional and is filled in afterwards by
    ``PaperReviewer.generate_review``.
    """
    title: str
    # Authors as one display string (not a list).
    authors: str
    summary: str
    contributions: List[str]
    technical_soundness: str
    reproducibility: str
    impact: str
    strengths: List[str]
    weaknesses: List[str]
    suggestions: List[str]
    related_work: str
    ratings: Dict[str, float]  # clarity, novelty, methodology, technical_soundness, impact
    # Heuristic scores in [0, 1]; declared so that assigning
    # ``review.quality_metrics`` is accepted by the model.
    quality_metrics: Optional[Dict[str, float]] = None

In [None]:
class PaperProcessor:
    """Read paper files, split their text by token budget and extract metadata."""

    def __init__(self, max_tokens: int = 8000):
        """Initialize the paper processor with token limits.

        Args:
            max_tokens: Token budget per section produced by ``split_paper``.
        """
        self.max_tokens = max_tokens
        self.encoding = tiktoken.encoding_for_model("gpt-4")

    def count_tokens(self, text: str) -> int:
        """Count the number of tokens in a text"""
        return len(self.encoding.encode(text))

    def split_paper(self, paper_text: str) -> List[str]:
        """Split a paper into sections if it exceeds token limit.

        Lines are packed greedily: a new section starts whenever adding the
        next line would push the running token count above ``max_tokens``.
        A single line that is larger than the budget is kept whole in a
        section of its own (lines are never cut).

        Args:
            paper_text: Full text of the paper.

        Returns:
            The sections in document order, each a newline-joined run of lines.
        """
        sections: List[str] = []
        current_section: List[str] = []
        current_tokens = 0

        for line in paper_text.split('\n'):
            line_tokens = self.count_tokens(line)

            # Only flush when something has been collected; otherwise an
            # oversized first line would produce an empty leading section.
            if current_section and current_tokens + line_tokens > self.max_tokens:
                sections.append('\n'.join(current_section))
                current_section = []
                current_tokens = 0

            current_section.append(line)
            current_tokens += line_tokens

        if current_section:
            sections.append('\n'.join(current_section))

        return sections

    def read_paper_from_file(self, file_path: str) -> Optional[str]:
        """Read a paper from a file (PDF or TXT).

        Args:
            file_path: Path to the paper. A ``.pdf`` suffix (any case) is
                parsed as PDF; everything else is read as UTF-8 text.

        Returns:
            The extracted text, or ``None`` if reading failed (the error is
            printed rather than raised).
        """
        try:
            file_path = Path(file_path)
            if file_path.suffix.lower() == '.pdf':
                return self._read_pdf(file_path)
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except Exception as e:
            # Best-effort reader: callers check for None instead of catching.
            print(f"Error reading file {file_path}: {str(e)}")
            return None

    def _read_pdf(self, file_path: Path) -> str:
        """Extract text from a PDF file.

        Each page's text is followed by a newline. Pages for which no text
        is returned contribute an empty string instead of raising.
        """
        reader = PdfReader(file_path)
        return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

    def extract_metadata(self, paper_text: str) -> dict:
        """Extract basic metadata from paper text.

        Looks for lines starting with ``Title:``, ``Authors:`` and
        ``Abstract:`` (case-insensitive, leading whitespace ignored). The
        abstract is the text after ``Abstract:`` on the same line plus all
        following lines up to the first blank line.

        Args:
            paper_text: Full text of the paper.

        Returns:
            Dict with keys ``title`` (str), ``authors`` (list of str) and
            ``abstract`` (str); values stay empty when a marker is absent.
        """
        metadata = {
            'title': '',
            'authors': [],
            'abstract': ''
        }

        lines = paper_text.split('\n')
        for i, line in enumerate(lines):
            stripped = line.strip()
            lowered = stripped.lower()
            if lowered.startswith('title:'):
                metadata['title'] = stripped.split(':', 1)[1].strip()
            elif lowered.startswith('authors:'):
                names = stripped.split(':', 1)[1]
                # Drop empty entries so "Authors:" alone yields [] and not [''].
                metadata['authors'] = [a.strip() for a in names.split(',') if a.strip()]
            elif lowered.startswith('abstract:'):
                abstract_lines = []
                # Keep text that shares the line with the "Abstract:" marker.
                inline_text = stripped.split(':', 1)[1].strip()
                if inline_text:
                    abstract_lines.append(inline_text)
                for following in lines[i + 1:]:
                    if following.strip() == '':
                        break
                    abstract_lines.append(following)
                metadata['abstract'] = ' '.join(abstract_lines)

        return metadata

In [None]:
import os
import json
from typing import Optional, Dict, List
from pydantic import BaseModel
import openai

class PaperMetadata(BaseModel):
    """Metadata about the paper being reviewed"""
    # Paper title (required).
    title: str
    # Author names, one list entry per author (required).
    authors: List[str]
    # Optional research field of the paper.
    field: Optional[str] = None
    # Optional free-text field; no code visible in this file reads it.
    citation_context: Optional[str] = None
    # Optional summary text; test_with_paper fills it with the extracted abstract.
    author_summary: Optional[str] = None

class PaperReview(BaseModel):
    """Structured review of a paper.

    The field names mirror the keys of the JSON object the reviewer prompt
    asks the model to return. ``quality_metrics`` is not part of that JSON:
    it is optional and is filled in afterwards by
    ``PaperReviewer.generate_review``.
    """
    title: str
    # Authors as one display string (not a list).
    authors: str
    summary: str
    contributions: List[str]
    technical_soundness: str
    reproducibility: str
    impact: str
    strengths: List[str]
    weaknesses: List[str]
    suggestions: List[str]
    related_work: str
    ratings: Dict[str, float]  # clarity, novelty, methodology, technical_soundness, impact
    # Heuristic scores in [0, 1]; declared so that assigning
    # ``review.quality_metrics`` is accepted by the model.
    quality_metrics: Optional[Dict[str, float]] = None

class PaperReviewer:
    """Generate structured paper reviews with an OpenAI chat model."""

    # Keys that must be present in the model's JSON answer.
    _REQUIRED_FIELDS = (
        'title', 'authors', 'summary', 'contributions', 'technical_soundness',
        'reproducibility', 'impact', 'strengths', 'weaknesses', 'suggestions',
        'related_work', 'ratings',
    )

    # Rating keys requested by the prompt, with their display labels.
    _RATING_LABELS = (
        ('clarity', 'Clarity'),
        ('novelty', 'Novelty'),
        ('methodology', 'Methodology'),
        ('technical_soundness', 'Technical Soundness'),
        ('impact', 'Impact'),
    )

    def __init__(self, model: str = "gpt-4.1", field: str = "Computer Science",
                 api_key: Optional[str] = None):
        """Initialize the paper reviewer with a specific model and field.

        Args:
            model: Name of the chat model used for reviewing.
            field: Research field; selects the field-specific guidelines.
            api_key: OpenAI API key. When omitted, the OpenAI client falls
                back to the ``OPENAI_API_KEY`` environment variable, so no
                secret has to be written into the source.
        """
        self.model = model
        self.field = field
        self.client = openai.OpenAI(api_key=api_key)

    def _get_field_specific_guidelines(self) -> str:
        """Get field-specific review guidelines.

        Returns:
            The guideline text for ``self.field``; unknown fields fall back
            to the "Computer Science" guidelines.
        """
        guidelines = {
            "Computer Science": """
            Focus on:
            1. Algorithmic complexity and efficiency
            2. Implementation details and reproducibility
            3. Theoretical foundations and proofs
            4. Experimental methodology and results
            5. Code quality and availability
            6. Performance metrics and benchmarks
            7. Scalability and optimization
            8. Security considerations
            9. System architecture and design
            10. Edge cases and error handling
            """,
            "Machine Learning": """
            Focus on:
            1. Model architecture and design choices
            2. Training methodology and hyperparameters
            3. Evaluation metrics and baselines
            4. Dataset quality and preprocessing
            5. Reproducibility and code availability
            6. Model interpretability and explainability
            7. Bias and fairness considerations
            8. Computational efficiency
            9. Generalization capabilities
            10. Ablation studies and analysis
            """,
            "Natural Language Processing": """
            Focus on:
            1. Model architecture and linguistic foundations
            2. Dataset quality and preprocessing
            3. Evaluation metrics and human evaluation
            4. Error analysis and limitations
            5. Ethical considerations
            6. Language coverage and diversity
            7. Task-specific performance
            8. Cross-lingual capabilities
            9. Computational requirements
            10. Real-world applicability
            """,
            "Computer Vision": """
            Focus on:
            1. Model architecture and design
            2. Dataset quality and diversity
            3. Evaluation metrics and benchmarks
            4. Computational efficiency
            5. Real-world performance
            6. Robustness to variations
            7. Interpretability and visualization
            8. Hardware requirements
            9. Application scenarios
            10. Ethical considerations
            """,
            "Robotics": """
            Focus on:
            1. System architecture and design
            2. Hardware-software integration
            3. Real-world performance
            4. Safety considerations
            5. Control algorithms
            6. Sensor integration
            7. Environmental adaptability
            8. Power efficiency
            9. Cost considerations
            10. Practical applications
            """
        }
        return guidelines.get(self.field, guidelines["Computer Science"])

    def _generate_review_prompt(self, paper_text: str, metadata: "Optional[PaperMetadata]" = None) -> str:
        """Generate a prompt for the LLM to review the paper.

        Args:
            paper_text: Full text of the paper; inserted verbatim.
            metadata: Optional paper metadata. A missing or empty title or
                author list is rendered as "Not provided"; a missing field
                falls back to ``self.field``.

        Returns:
            The complete user prompt, including the JSON schema to follow.
        """
        field_guidelines = self._get_field_specific_guidelines()

        # Resolve the header values up front so that empty metadata (e.g. an
        # empty title from a paper without a "Title:" line) does not leave a
        # blank or "None" entry in the prompt.
        title = (metadata.title if metadata else '') or 'Not provided'
        authors = ', '.join(metadata.authors) if metadata and metadata.authors else 'Not provided'
        field = (metadata.field if metadata else None) or self.field

        prompt = f"""You are an expert academic reviewer with deep knowledge in {self.field}.
Please provide a comprehensive, detailed review of the following paper:

Title: {title}
Authors: {authors}
Field: {field}

Field-Specific Guidelines:
{field_guidelines}

Paper Content:
{paper_text}

Please provide a detailed, well-structured review in JSON format with the following fields:
{{
    "title": "The full title of the paper being reviewed",
    "authors": "The complete list of authors as they appear in the paper",
    "summary": "A comprehensive summary of the paper's main contributions, methodology, and findings. Write in full paragraphs, explaining the key concepts and their significance. Focus on the paper's core contributions and how they advance the field. Include specific details about the methodology, results, and implications.",
    "contributions": [
        "A detailed paragraph describing each major contribution. Focus on novel ideas, technical innovations, and significant findings. Explain why each contribution is important and how it advances the field. Include specific examples and quantitative results where available.",
        "Another detailed paragraph about a different contribution, with specific examples and analysis. Consider theoretical, methodological, and practical contributions."
    ],
    "technical_soundness": "A detailed analysis of the paper's technical soundness, including methodology, theoretical foundations, and experimental design. Discuss the rigor of the approach and any potential technical limitations.",
    "reproducibility": "An assessment of the paper's reproducibility, including code availability, dataset details, and experimental setup. Discuss any potential challenges in reproducing the results.",
    "impact": "An analysis of the paper's potential impact on the field, including theoretical contributions, practical applications, and future research directions.",
    "strengths": [
        "A detailed paragraph describing each major strength. Explain why each strength is significant and how it contributes to the field. Include specific examples from the paper where relevant. Discuss the technical merits, innovation, and potential impact.",
        "Another detailed paragraph about a different strength, with specific examples and analysis. Focus on different aspects of the work, such as methodology, results, or theoretical contributions."
    ],
    "weaknesses": [
        "A detailed paragraph analyzing each major weakness. Explain the implications of these weaknesses and how they might affect the paper's contributions. Provide specific examples where possible. Discuss both technical limitations and potential improvements.",
        "Another detailed paragraph about a different weakness, with specific analysis and examples. Consider aspects like methodology, assumptions, or practical limitations."
    ],
    "suggestions": [
        "A detailed paragraph for each suggestion, explaining how it would improve the paper. Provide specific recommendations and explain their potential impact. Include concrete examples where possible. Discuss both immediate improvements and future research directions.",
        "Another detailed paragraph with a different suggestion, including specific implementation details and rationale. Consider technical, methodological, and theoretical aspects."
    ],
    "related_work": "A detailed analysis of how this work relates to and advances the existing literature. Discuss key differences from previous approaches and how this work contributes to the field.",
    "ratings": {{
        "clarity": <number between 1 and 5>,
        "novelty": <number between 1 and 5>,
        "methodology": <number between 1 and 5>,
        "technical_soundness": <number between 1 and 5>,
        "impact": <number between 1 and 5>
    }}
}}

Guidelines for the review:
1. Write in clear, academic prose with proper paragraph structure
2. Provide specific examples and references to the paper's content
3. Explain the significance and implications of each point
4. Maintain a critical but constructive tone
5. Focus on technical content, methodology, and contributions
6. Include relevant comparisons to related work where appropriate
7. Support all claims with evidence from the paper
8. Ensure each section flows naturally and builds upon previous points
9. Include specific technical details and quantitative results where available
10. Discuss both theoretical and practical implications
11. Consider ethical implications and potential societal impact
12. Evaluate the paper's contribution to the broader field
13. Clearly identify and explain each major contribution
14. Distinguish between technical, theoretical, and practical contributions

The review should be thorough and detailed, with each section providing in-depth analysis rather than bullet points. Each paragraph should be well-developed and supported by specific examples from the paper."""
        return prompt

    def _parse_response(self, response_text: str) -> "PaperReview":
        """Parse the response text into a structured review.

        Args:
            response_text: Raw model output; the JSON object is taken from
                the first ``{`` to the last ``}``.

        Returns:
            The validated ``PaperReview``.

        Raises:
            ValueError: If no JSON object is found, the JSON is invalid, or
                a required field is missing. Other errors (e.g. model
                validation errors) are re-raised unchanged.
        """
        try:
            # Try to find JSON content in the response
            start_idx = response_text.find('{')
            end_idx = response_text.rfind('}') + 1
            if start_idx == -1 or end_idx == 0:
                raise ValueError("No JSON content found in response")

            review_data = json.loads(response_text[start_idx:end_idx])

            # Ensure all required fields are present
            for field in self._REQUIRED_FIELDS:
                if field not in review_data:
                    raise ValueError(f"Missing required field in response: {field}")

            # The prompt asks for "the complete list of authors", so the model
            # may answer with a JSON array while the review stores a string.
            authors = review_data['authors']
            if isinstance(authors, (list, tuple)):
                authors = ', '.join(str(author) for author in authors)

            return PaperReview(
                title=review_data['title'],
                authors=authors,
                summary=review_data['summary'],
                contributions=review_data['contributions'],
                technical_soundness=review_data['technical_soundness'],
                reproducibility=review_data['reproducibility'],
                impact=review_data['impact'],
                strengths=review_data['strengths'],
                weaknesses=review_data['weaknesses'],
                suggestions=review_data['suggestions'],
                related_work=review_data['related_work'],
                ratings=review_data['ratings']
            )
        except Exception as e:
            print(f"Error parsing response: {str(e)}")
            print("Response text:", response_text)
            raise

    def review_paper(self, paper_text: str, metadata: "Optional[PaperMetadata]" = None) -> "PaperReview":
        """Generate a review for the given paper.

        The whole paper text is sent in a single request, so very long
        papers can exceed the model's token or rate limits.

        Args:
            paper_text: Full text of the paper.
            metadata: Optional paper metadata used in the prompt header.

        Returns:
            The parsed review.

        Raises:
            ValueError: If the model returns no content or unusable JSON.
            Exception: API errors are printed and re-raised.
        """
        prompt = self._generate_review_prompt(paper_text, metadata)

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are an expert academic reviewer with deep knowledge across multiple fields. Provide detailed, specific feedback in JSON format."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.7,
                max_tokens=3000,
                response_format={ "type": "json_object" }
            )

            review_text = response.choices[0].message.content
            if not review_text:
                raise ValueError("Model returned an empty response")

            return self._parse_response(review_text)

        except Exception as e:
            print(f"Error generating review: {str(e)}")
            raise

    def _assess_review_quality(self, review: "PaperReview") -> dict:
        """Assess the quality and completeness of the generated review.

        All scores are simple heuristics in the range 0..1:
        completeness is the share of non-empty sections, depth is the mean
        section length relative to 500 characters, and the remaining scores
        count sections containing at least one indicator keyword (capped at
        five sections).

        Args:
            review: Review whose text sections are inspected.

        Returns:
            Dict with keys ``completeness``, ``depth``, ``specificity``,
            ``constructiveness`` and ``technical_accuracy``.
        """
        required_sections = [
            review.summary,
            review.contributions,
            review.technical_soundness,
            review.reproducibility,
            review.impact,
            review.strengths,
            review.weaknesses,
            review.suggestions,
            review.related_work
        ]
        section_texts = [str(section) for section in required_sections]
        lowered_texts = [text.lower() for text in section_texts]

        def keyword_score(keywords):
            """Share (out of 5, capped at 1.0) of sections using any keyword."""
            hits = sum(1 for text in lowered_texts if any(term in text for term in keywords))
            return min(1.0, hits / 5)

        return {
            "completeness": sum(1 for section in required_sections if section) / len(required_sections),
            "depth": min(1.0, sum(len(text) for text in section_texts) / len(section_texts) / 500),
            "specificity": keyword_score(["specifically", "particularly", "notably", "in particular"]),
            "constructiveness": keyword_score(["suggest", "recommend", "improve", "enhance", "consider"]),
            "technical_accuracy": keyword_score(["algorithm", "method", "technique", "approach", "implementation"]),
        }

    def _extract_paper_content(self, paper_path: str) -> str:
        """Read the paper at ``paper_path`` and return its text.

        Raises:
            ValueError: If the file cannot be read or contains no text.
        """
        paper_content = PaperProcessor().read_paper_from_file(paper_path)
        if not paper_content:
            raise ValueError(f"Could not read any text from paper: {paper_path}")
        return paper_content

    def _generate_review_with_gpt4(self, paper_content: str) -> "PaperReview":
        """Request one review of ``paper_content`` from the configured model."""
        return self.review_paper(paper_content)

    def generate_review(self, paper_path: str) -> "PaperReview":
        """Generate a comprehensive review for the given paper.

        Reads the paper, requests a review, scores it heuristically and
        retries once if completeness or depth fall below their thresholds.

        Args:
            paper_path: Path to the paper file (PDF or text).

        Returns:
            The review, with the heuristic scores attached as
            ``quality_metrics``.

        Raises:
            Exception: Any failure is printed and re-raised.
        """
        try:
            paper_content = self._extract_paper_content(paper_path)

            review = self._generate_review_with_gpt4(paper_content)

            quality_metrics = self._assess_review_quality(review)

            if quality_metrics["completeness"] < 0.8 or quality_metrics["depth"] < 0.7:
                print("Initial review quality below threshold. Regenerating...")
                review = self._generate_review_with_gpt4(paper_content)
                quality_metrics = self._assess_review_quality(review)

            try:
                review.quality_metrics = quality_metrics
            except (AttributeError, ValueError):
                # A model class that does not declare ``quality_metrics``
                # rejects normal assignment; attach it as a plain attribute.
                object.__setattr__(review, "quality_metrics", quality_metrics)

            return review

        except Exception as e:
            print(f"Error generating review: {str(e)}")
            raise

    def _format_review_for_docx(self, review: "PaperReview") -> str:
        """Format the review for DOCX output.

        Args:
            review: Review to render. ``quality_metrics`` is optional; the
                section is left out when the review carries none.

        Returns:
            Plain text with one underlined heading per non-empty section.
            List sections are rendered one item per line.
        """
        def render(content) -> str:
            if isinstance(content, (list, tuple)):
                return '\n'.join(str(item) for item in content)
            return str(content) if content else ''

        # Use the rating keys the prompt actually requests; a key the model
        # left out is shown as N/A instead of raising KeyError.
        ratings_text = '\n'.join(
            f"{label}: {review.ratings[key]}/5" if key in review.ratings else f"{label}: N/A"
            for key, label in self._RATING_LABELS
        )

        metric_labels = (
            ('completeness', 'Completeness'),
            ('depth', 'Depth'),
            ('specificity', 'Specificity'),
            ('constructiveness', 'Constructiveness'),
            ('technical_accuracy', 'Technical Accuracy'),
        )
        quality_metrics = getattr(review, 'quality_metrics', None) or {}
        metrics_text = '\n'.join(
            f"{label}: {quality_metrics[key]:.2f}"
            for key, label in metric_labels if key in quality_metrics
        )

        sections = [
            ("Title and Authors", f"{review.title}\n{review.authors}"),
            ("Summary", review.summary),
            ("Key Contributions", review.contributions),
            ("Technical Soundness", review.technical_soundness),
            ("Reproducibility", review.reproducibility),
            ("Impact", review.impact),
            ("Strengths", review.strengths),
            ("Weaknesses", review.weaknesses),
            ("Suggestions", review.suggestions),
            ("Related Work", review.related_work),
            ("Ratings", ratings_text),
            ("Quality Metrics", metrics_text),
        ]

        parts = []
        for title, content in sections:
            text = render(content)
            if text:
                parts.append(f"\n{title}\n{'=' * len(title)}\n{text}\n")

        return ''.join(parts)


In [None]:
class ReviewFormatter:
    """Render a paper review as a DOCX or PDF document."""

    # Rating keys in display order, with their labels.
    _RATING_LABELS = (
        ('clarity', 'Clarity'),
        ('novelty', 'Novelty'),
        ('methodology', 'Methodology'),
        ('technical_soundness', 'Technical Soundness'),
        ('impact', 'Impact'),
    )

    def __init__(self, review: "PaperReview", metadata: "PaperMetadata"):
        """Store the review to render and the metadata of the reviewed paper.

        Args:
            review: Review whose sections are written to the document.
            metadata: Paper metadata; stored but not used by the renderers.
        """
        self.review = review
        self.metadata = metadata

    def _sections(self):
        """Return ``(heading, content)`` pairs in output order.

        ``content`` is a string for single-paragraph sections and a list of
        strings for sections with one paragraph per item.
        """
        review = self.review
        return [
            ('Summary', review.summary),
            ('Key Contributions', review.contributions),
            ('Technical Soundness', review.technical_soundness),
            ('Reproducibility', review.reproducibility),
            ('Impact', review.impact),
            ('Strengths', review.strengths),
            ('Weaknesses', review.weaknesses),
            ('Suggestions for Improvement', review.suggestions),
            ('Related Work', review.related_work),
        ]

    def _ratings_text(self) -> str:
        """Return the one-line ratings summary, e.g. ``Clarity: 4/5 | ...``.

        A rating the model did not provide is shown as ``N/A`` so that a
        finished review can still be written out.
        """
        ratings = self.review.ratings
        return " | ".join(
            f"{label}: {ratings[key]}/5" if key in ratings else f"{label}: N/A"
            for key, label in self._RATING_LABELS
        )

    def create_docx(self, output_path: str):
        """Create a DOCX formatted review.

        Args:
            output_path: Destination file path of the ``.docx`` document.
        """
        doc = Document()

        # Title and Authors
        doc.add_heading('Review of Paper', level=1)
        doc.add_paragraph(f'Title: {self.review.title}')
        doc.add_paragraph(f'Authors: {self.review.authors}')
        doc.add_paragraph('')

        # One level-2 heading per section, followed by an empty spacer paragraph.
        for heading, content in self._sections():
            doc.add_heading(heading, level=2)
            paragraphs = content if isinstance(content, list) else [content]
            for text in paragraphs:
                doc.add_paragraph(text)
            doc.add_paragraph('')

        # Ratings
        doc.add_heading('Ratings', level=2)
        doc.add_paragraph(self._ratings_text())

        doc.save(output_path)

    def create_pdf(self, output_path: str):
        """Create a PDF formatted review.

        Args:
            output_path: Destination file path of the PDF document.
        """
        from xml.sax.saxutils import escape

        def markup(text) -> str:
            # Paragraph parses its text as XML-style markup, so characters
            # such as "<" and "&" in model-written text must be escaped;
            # line breaks are kept by turning them into <br/> tags.
            return escape(str(text)).replace('\n', '<br/>')

        doc = SimpleDocTemplate(output_path, pagesize=letter)
        styles = getSampleStyleSheet()
        normal = styles['Normal']
        story = []

        # Title and Authors
        title_style = ParagraphStyle(
            'CustomTitle',
            parent=styles['Heading1'],
            fontSize=16,
            spaceAfter=20
        )
        story.append(Paragraph('Review of Paper', title_style))
        story.append(Paragraph(markup(f'Title: {self.review.title}'), normal))
        story.append(Paragraph(markup(f'Authors: {self.review.authors}'), normal))
        story.append(Spacer(1, 12))

        for heading, content in self._sections():
            story.append(Paragraph(heading, styles['Heading2']))
            if isinstance(content, list):
                # List sections: one paragraph per item with a small gap.
                for item in content:
                    story.append(Paragraph(markup(item), normal))
                    story.append(Spacer(1, 6))
            else:
                story.append(Paragraph(markup(content), normal))
            story.append(Spacer(1, 12))

        # Ratings
        story.append(Paragraph('Ratings', styles['Heading2']))
        story.append(Paragraph(markup(self._ratings_text()), normal))

        doc.build(story)

In [None]:
import os
from pathlib import Path

def test_with_paper(paper_path: str, output_format: str = "pdf"):
    """Test the paper reviewer with a specific paper file.

    Reads the paper, extracts basic metadata, requests a review and writes
    it to ``output/review_<paper stem>.<output_format>``.

    Args:
        paper_path: Path to the paper (PDF or text file).
        output_format: ``"pdf"`` writes a PDF; any other value writes DOCX.

    Returns:
        None. Progress and problems are reported with ``print``.
    """
    paper_path = Path(paper_path).absolute()

    if not paper_path.exists():
        print(f"Error: Paper file not found at: {paper_path}")
        return

    processor = PaperProcessor()
    reviewer = PaperReviewer(field="Computer Science")

    paper_text = processor.read_paper_from_file(str(paper_path))
    if not paper_text:
        print(f"Failed to read paper from {paper_path}")
        return

    metadata_dict = processor.extract_metadata(paper_text)

    # extract_metadata always returns the 'title' and 'authors' keys (empty
    # when nothing was found), so a .get() default would never apply; use
    # `or` to fall back on empty values as well.
    metadata = PaperMetadata(
        title=metadata_dict.get('title') or 'Unknown Title',
        authors=metadata_dict.get('authors') or [],
        field="Computer Science",
        author_summary=metadata_dict.get('abstract', '')
    )

    print("\nGenerating review...")
    review = reviewer.review_paper(paper_text, metadata)

    formatter = ReviewFormatter(review, metadata)
    output_dir = Path("output")
    output_dir.mkdir(exist_ok=True)

    output_path = output_dir / f"review_{paper_path.stem}.{output_format}"
    if output_format == "pdf":
        formatter.create_pdf(str(output_path))
    else:
        formatter.create_docx(str(output_path))

    print(f"\nReview generated and saved to: {output_path}")


# Review a single paper and write the result as a PDF.
single_paper_path = '2501.19012v1.pdf'
test_with_paper(paper_path=single_paper_path, output_format="pdf")


Generating review...

Review generated and saved to: output/review_2501.19012v1.pdf


Batch mode: review every PDF in a directory

In [None]:
import os

def test_with_paper(paper_path: str, output_format: str = "pdf"):
    """Test the paper reviewer with a specific paper file.

    Reads the paper, extracts basic metadata, requests a review and writes
    it to ``output/review_<paper stem>.<output_format>``.

    Args:
        paper_path: Path to the paper (PDF or text file).
        output_format: ``"pdf"`` writes a PDF; any other value writes DOCX.

    Returns:
        None. Progress and problems are reported with ``print``.
    """
    # Convert to absolute path
    paper_path = Path(paper_path).absolute()

    if not paper_path.exists():
        print(f"Error: Paper file not found at: {paper_path}")
        return

    # Initialize the processor and reviewer
    processor = PaperProcessor()
    reviewer = PaperReviewer(field="Computer Science")

    # Read the paper
    paper_text = processor.read_paper_from_file(str(paper_path))
    if not paper_text:
        print(f"Failed to read paper from {paper_path}")
        return

    # Extract metadata
    metadata_dict = processor.extract_metadata(paper_text)

    # Create PaperMetadata object. extract_metadata always returns the
    # 'title' and 'authors' keys (empty when nothing was found), so a .get()
    # default would never apply; use `or` to fall back on empty values too.
    metadata = PaperMetadata(
        title=metadata_dict.get('title') or 'Unknown Title',
        authors=metadata_dict.get('authors') or [],
        field="Computer Science",
        author_summary=metadata_dict.get('abstract', '')
    )

    # Generate review
    print("\nGenerating review...")
    review = reviewer.review_paper(paper_text, metadata)

    # Create formatted review
    formatter = ReviewFormatter(review, metadata)
    output_dir = Path("output")
    output_dir.mkdir(exist_ok=True)

    output_path = output_dir / f"review_{paper_path.stem}.{output_format}"
    if output_format == "pdf":
        formatter.create_pdf(str(output_path))
    else:
        formatter.create_docx(str(output_path))

    print(f"\nReview generated and saved to: {output_path}")


# Replace this with your actual directory. A trailing slash is optional
# because the file paths are built with os.path.join().
pdf_directory = "./papers/"

# Review every PDF in the directory in a stable (sorted) order. A failure on
# one paper (e.g. an API rate-limit error) is reported and the batch moves on
# to the next paper instead of aborting.
for filename in sorted(os.listdir(pdf_directory)):
    if not filename.lower().endswith(".pdf"):
        continue
    paper_path = os.path.join(pdf_directory, filename)
    print(f"Processing {paper_path}...")
    try:
        test_with_paper(paper_path, "pdf")
    except Exception as e:
        print(f"Failed to review {paper_path}: {e}")

Processing ./papers/2406.11036v1.pdf...

Generating review...

Review generated and saved to: output/review_2406.11036v1.pdf
Processing ./papers/2402.13457v2.pdf...

Generating review...

Review generated and saved to: output/review_2402.13457v2.pdf
Processing ./papers/2312.07104v2.pdf...

Generating review...

Review generated and saved to: output/review_2312.07104v2.pdf
Processing ./papers/2401.04334v1.pdf...

Generating review...

Review generated and saved to: output/review_2401.04334v1.pdf
Processing ./papers/2406.11931v1.pdf...

Generating review...

Review generated and saved to: output/review_2406.11931v1.pdf
Processing ./papers/2402.00157v4.pdf...

Generating review...

Review generated and saved to: output/review_2402.00157v4.pdf
Processing ./papers/2305.14992v2.pdf...

Generating review...

Review generated and saved to: output/review_2305.14992v2.pdf
Processing ./papers/2309.00071v2.pdf...

Generating review...

Review generated and saved to: output/review_2309.00071v2.pdf


RateLimitError: Error code: 429 - {'error': {'message': 'Request too large for gpt-4.1 in organization org-oNzWIWz3J17nDsH3duVAnoVC on tokens per min (TPM): Limit 30000, Requested 30650. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

Metrics/Evaluation

In [None]:
!pip install --quiet rouge-score nltk PyPDF2 bert-score sentence-transformers

import os
import pandas as pd
import re
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from PyPDF2 import PdfReader

def extract_text_from_pdf(path):
    """Return the text of all pages of the PDF at ``path`` as one string.

    Page texts are concatenated without a separator; a page for which no
    text is returned contributes an empty string.
    """
    page_texts = []
    for page in PdfReader(path).pages:
        page_texts.append(page.extract_text() or "")
    return "".join(page_texts)

def extract_section(text, heading, stop_headings):
    """Return the text between ``heading`` and the next stop heading.

    Matching is case-insensitive. ``heading`` and the stop headings are
    treated as literal text, not as regular expressions.

    Args:
        text: Document text to search.
        heading: Heading that starts the section (first occurrence wins).
        stop_headings: Headings that end the section. An empty list means
            the section runs to the end of ``text``.

    Returns:
        The stripped section body, or ``""`` if the heading or a following
        stop heading is not found.
    """
    start = re.escape(heading)
    if stop_headings:
        stops = "|".join(re.escape(stop) for stop in stop_headings)
        # A stop heading only counts at the start of a line and must not run
        # on into a longer word or number. Without this, short stops such as
        # "1" or "I" would end the section at the first digit 1 or letter i
        # inside ordinary words.
        end = rf"(?:^|\n)[ \t]*(?:{stops})(?!\w)"
    else:
        end = r"\Z"
    match = re.search(rf"{start}\s*(.*?){end}", text, re.DOTALL | re.IGNORECASE)
    return match.group(1).strip() if match else ""

def compute_rouge_and_bleu(reference, hypothesis):
    """Score *hypothesis* against *reference* with ROUGE-L and sentence BLEU.

    ROUGE-L is computed with stemming enabled and its F-measure is used.
    BLEU is computed on whitespace-split tokens against a single reference
    with NLTK's ``method4`` smoothing.

    Returns:
        ``(rouge_l_f1, bleu)``, each rounded to 4 decimal places.
    """
    rouge_l = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True).score(
        reference, hypothesis
    )['rougeL']

    bleu = sentence_bleu(
        [reference.split()],
        hypothesis.split(),
        smoothing_function=SmoothingFunction().method4,
    )

    return round(rouge_l.fmeasure, 4), round(bleu, 4)

# Main loop: for every generated review PDF, compare the review's "Summary"
# section with the "Abstract" of the matching paper and collect the scores.
results = []
review_dir = "output"
paper_dir = "papers"

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the resulting table stable between runs.
for filename in sorted(os.listdir(review_dir)):
    if not (filename.startswith("review_") and filename.endswith(".pdf")):
        continue

    # Strip only the leading "review_" and the trailing ".pdf"; str.replace
    # would also delete those substrings if they occurred inside the ID.
    paper_id = filename[len("review_"):-len(".pdf")]
    review_path = os.path.join(review_dir, filename)
    paper_path = os.path.join(paper_dir, f"{paper_id}.pdf")

    if not os.path.exists(paper_path):
        print(f"Paper {paper_id}.pdf not found, skipping...")
        continue

    review_text = extract_text_from_pdf(review_path)
    paper_text = extract_text_from_pdf(paper_path)

    paper_abstract = extract_section(paper_text, "Abstract", ["1", "I", "Introduction"])
    review_summary = extract_section(review_text, "Summary", ["Key Contributions", "Technical Soundness", "Strengths", "Weaknesses"])

    # Both texts are needed for a meaningful score; skip the paper otherwise.
    if not paper_abstract or not review_summary:
        print(f"Missing abstract or summary for {paper_id}, skipping...")
        continue

    rouge, bleu = compute_rouge_and_bleu(paper_abstract, review_summary)
    results.append({
        "Paper ID": paper_id,
        "ROUGE-L F1": rouge,
        "BLEU": bleu
    })

df = pd.DataFrame(results)
display(df)

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m100.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m74.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m810.5 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.4 MB/s[0m eta [36m0

Unnamed: 0,Paper ID,ROUGE-L F1,BLEU
0,2406.11036v1,0.0606,0.0028
1,2402.13457v2,0.0,0.0
2,2309.06180v1,0.0,0.0
3,2305.14992v2,0.0505,0.0074
4,2402.03300v3,0.0098,0.0
5,2309.00071v2,0.0103,0.002
6,2312.07104v2,0.0302,0.002
7,2406.11931v1,0.0506,0.0042
8,2402.00157v4,0.012,0.0


In [None]:
!pip install --quiet rouge-score nltk PyPDF2 bert-score sentence-transformers

import os
import re
import pandas as pd
from PyPDF2 import PdfReader
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bert_score
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model, loaded once at cell level and reused by
# compute_content_metrics for the cosine-similarity score.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')


def extract_text_from_pdf(path):
    """Return the text of every page of the PDF at *path*, concatenated.

    A page whose ``extract_text()`` result is falsy contributes an empty
    string, so the result is always a ``str``.
    """
    page_texts = []
    for page in PdfReader(path).pages:
        page_texts.append(page.extract_text() or "")
    return "".join(page_texts)

def extract_section(text, heading, stop_headings):
    """Return the text that follows *heading* up to the next stop heading.

    Args:
        text: Full document text to search.
        heading: Literal heading that opens the section (matched
            case-insensitively, regex metacharacters are escaped).
        stop_headings: Literal headings that close the section. A stop
            heading only counts when it begins a line and is not followed
            by another word character, so a stop such as "1" or "I" no
            longer fires on an ordinary letter or digit inside the body.
            If the list is empty the section runs to the end of *text*.

    Returns:
        The stripped section body, or "" when the heading (or a closing
        stop heading) cannot be found.
    """
    stops = "|".join(re.escape(stop) for stop in stop_headings)
    if stops:
        # Lookahead: stop in front of a line that begins with a stop heading.
        end = rf"(?=^[ \t]*(?:{stops})(?!\w))"
    else:
        end = r"\Z"
    pattern = rf"{re.escape(heading)}\s*(.*?){end}"
    match = re.search(pattern, text, re.DOTALL | re.IGNORECASE | re.MULTILINE)
    return match.group(1).strip() if match else ""

def compute_rouge_and_bleu(reference, hypothesis):
    """Score *hypothesis* against *reference* with ROUGE-L and sentence BLEU.

    ROUGE-L is computed with stemming enabled and its F-measure is used.
    BLEU is computed on whitespace-split tokens against a single reference
    with NLTK's ``method4`` smoothing.

    Returns:
        ``(rouge_l_f1, bleu)``, each rounded to 4 decimal places.
    """
    rouge_l = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True).score(
        reference, hypothesis
    )['rougeL']

    bleu = sentence_bleu(
        [reference.split()],
        hypothesis.split(),
        smoothing_function=SmoothingFunction().method4,
    )

    return round(rouge_l.fmeasure, 4), round(bleu, 4)

# Lazily created BERTScorer shared by all calls to compute_content_metrics.
_BERT_SCORER = None


def _get_bert_scorer():
    """Return the shared English BERTScorer, creating it on first use."""
    global _BERT_SCORER
    if _BERT_SCORER is None:
        # Imported here so the helper is usable with only the package-level
        # `bert_score` install; the scorer class keeps the model in memory.
        from bert_score import BERTScorer
        _BERT_SCORER = BERTScorer(lang='en')
    return _BERT_SCORER


def compute_content_metrics(reference, hypothesis):
    """Score *hypothesis* against *reference* with BERTScore and SBERT cosine.

    The module-level ``bert_score.score`` function loads its model on every
    call; a single cached ``BERTScorer`` is used instead so the model is
    loaded once for the whole run.

    Args:
        reference: Reference text.
        hypothesis: Candidate text to evaluate.

    Returns:
        ``(bertscore_f1, sbert_cosine)``, each rounded to 4 decimal places.
    """
    # BERTScorer.score takes candidates first, references second.
    _, _, F1 = _get_bert_scorer().score([hypothesis], [reference])
    ref_emb = sbert_model.encode(reference, convert_to_tensor=True)
    hyp_emb = sbert_model.encode(hypothesis, convert_to_tensor=True)
    cosine_sim = util.pytorch_cos_sim(ref_emb, hyp_emb).item()
    return round(F1[0].item(), 4), round(cosine_sim, 4)


# Score every generated review's "Summary" against the matching paper's
# "Abstract" with lexical (ROUGE-L, BLEU) and embedding-based metrics.
review_dir = "output"
paper_dir = "papers"
results = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the resulting table stable between runs.
for filename in sorted(os.listdir(review_dir)):
    if not (filename.startswith("review_") and filename.endswith(".pdf")):
        continue

    # Strip only the leading "review_" and the trailing ".pdf"; str.replace
    # would also delete those substrings if they occurred inside the ID.
    paper_id = filename[len("review_"):-len(".pdf")]
    review_path = os.path.join(review_dir, filename)
    paper_path = os.path.join(paper_dir, f"{paper_id}.pdf")

    if not os.path.exists(paper_path):
        print(f"Missing paper for {paper_id}, skipping...")
        continue

    review_text = extract_text_from_pdf(review_path)
    paper_text = extract_text_from_pdf(paper_path)

    paper_abstract = extract_section(paper_text, "Abstract", ["1", "I", "Introduction"])
    review_summary = extract_section(review_text, "Summary", ["Key Contributions", "Technical Soundness", "Strengths", "Weaknesses"])

    # Both texts are needed for a meaningful score; skip the paper otherwise.
    if not paper_abstract or not review_summary:
        print(f"Could not extract content for {paper_id}, skipping...")
        continue

    rouge, bleu = compute_rouge_and_bleu(paper_abstract, review_summary)
    bert, cosine = compute_content_metrics(paper_abstract, review_summary)

    results.append({
        "Paper ID": paper_id,
        "ROUGE-L F1": rouge,
        "BLEU": bleu,
        "BERTScore F1": bert,
        "SBERT Cosine": cosine
    })

df = pd.DataFrame(results)
display(df)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Missing paper for 2401.04334v, skipping...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho

Unnamed: 0,Paper ID,ROUGE-L F1,BLEU,BERTScore F1,SBERT Cosine
0,2406.11036v1,0.0606,0.0028,0.8168,0.4119
1,2402.13457v2,0.0,0.0,0.7775,0.1358
2,2309.06180v1,0.0,0.0,0.7748,0.0572
3,2305.14992v2,0.0505,0.0074,0.8041,0.4033
4,2402.03300v3,0.0098,0.0,0.7722,0.1725
5,2309.00071v2,0.0103,0.002,0.7776,0.0526
6,2312.07104v2,0.0302,0.002,0.8107,0.4455
7,2406.11931v1,0.0506,0.0042,0.8269,0.6027
8,2402.00157v4,0.012,0.0,0.7786,0.3587


In [None]:
!pip install fpdf

import os
from pathlib import Path
from PyPDF2 import PdfReader
import openai
from fpdf import FPDF


# Read the API key from the OPENAI_API_KEY environment variable so the real
# key never has to be written into the notebook; the 'your-key' placeholder
# is kept only as a fallback for when the variable is not set.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", 'your-key'))

def extract_text_from_pdf(path):
    """Return the concatenated page text of the PDF at *path*, stripped.

    A page whose ``extract_text()`` result is falsy contributes an empty
    string; leading and trailing whitespace of the combined text is removed.
    """
    pages = PdfReader(path).pages
    return "".join(page.extract_text() or "" for page in pages).strip()

def get_baseline_prompt(paper_text):
    """Build the user prompt for the baseline (unstructured) review.

    Only the first 7000 characters of *paper_text* are embedded in the prompt.
    """
    excerpt = paper_text[:7000]
    return f"Please provide a short academic review of the following paper:\n\n{excerpt}"

def sanitize_text(text):
    """Return *text* reduced to plain ASCII.

    Common typographic characters (curly quotes, en/em dashes, ellipsis,
    bullet) are first mapped to ASCII look-alikes; every remaining non-ASCII
    character is then dropped.

    The previous implementation called ``re.sub`` although this cell never
    imports ``re``, so it only worked when an earlier cell had been run.
    The ASCII filtering is now done with ``str.encode``/``decode``, which
    needs no import and removes exactly the same characters.
    """
    replacements = {
        '“': '"', '”': '"',
        '‘': "'", '’': "'",
        '–': '-', '—': '-',
        '…': '...',
        '•': '*'
    }
    # All keys are single characters, so one translate() pass replaces them.
    text = text.translate(str.maketrans(replacements))
    # errors="ignore" silently drops anything outside the ASCII range.
    return text.encode("ascii", "ignore").decode("ascii")

def save_text_as_pdf(text, output_path):
    """Write *text* to a PDF file at *output_path*.

    The text is passed through ``sanitize_text`` first, then split on
    newlines; each line is written as its own multi-line cell in 11pt Arial
    on a document with automatic page breaks (15-unit bottom margin).
    """
    doc = FPDF()
    doc.set_auto_page_break(auto=True, margin=15)
    doc.add_page()
    doc.set_font("Arial", size=11)

    cleaned = sanitize_text(text)
    for row in cleaned.split("\n"):
        doc.multi_cell(0, 10, row)

    doc.output(output_path)

def generate_baseline_reviews(papers_dir, output_dir):
    """Generate a plain "baseline" review PDF for every paper in *papers_dir*.

    For each ``*.pdf`` in *papers_dir* the paper text is extracted, sent to
    the chat model with a minimal prompt, and the reply is saved as
    ``baseline_review_<paper stem>.pdf`` inside *output_dir*.

    Args:
        papers_dir: Directory containing the paper PDFs.
        output_dir: Directory for the generated review PDFs. It is created
            if missing, including any missing parent directories.

    A failure on one paper is reported with ``print`` and does not stop the
    remaining papers from being processed.
    """
    papers_dir = Path(papers_dir)
    output_dir = Path(output_dir)
    # parents=True: a nested output path must not fail with FileNotFoundError.
    output_dir.mkdir(parents=True, exist_ok=True)

    # sorted(): glob order is unspecified; sort for a reproducible run order.
    for paper_file in sorted(papers_dir.glob("*.pdf")):
        paper_id = paper_file.stem
        print(f"Processing {paper_id}...")

        try:
            paper_text = extract_text_from_pdf(paper_file)
            prompt = get_baseline_prompt(paper_text)

            response = client.chat.completions.create(
                model="gpt-4.1",
                messages=[
                    {"role": "system", "content": "You are a helpful academic reviewer."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.7,
                max_tokens=3000
            )

            baseline_review = response.choices[0].message.content
            output_path = output_dir / f"baseline_review_{paper_id}.pdf"
            save_text_as_pdf(baseline_review, output_path)
            print(f"Saved: {output_path}")

        except Exception as e:
            # Deliberate best-effort boundary: report and move on to the next paper.
            print(f"Failed for {paper_id}: {e}")

# Run the baseline generation: read PDFs from "papers", write reviews to "baseline_outputs".
generate_baseline_reviews("papers", "baseline_outputs")

Processing 2406.11036v1...
✅ Saved: baseline_outputs/baseline_review_2406.11036v1.pdf
Processing 2402.13457v2...
✅ Saved: baseline_outputs/baseline_review_2402.13457v2.pdf
Processing 2312.07104v2...
✅ Saved: baseline_outputs/baseline_review_2312.07104v2.pdf
Processing 2402.03300v3...
✅ Saved: baseline_outputs/baseline_review_2402.03300v3.pdf
Processing 2401.04334v1...
✅ Saved: baseline_outputs/baseline_review_2401.04334v1.pdf
Processing 2406.11931v1...
✅ Saved: baseline_outputs/baseline_review_2406.11931v1.pdf
Processing 2402.00157v4...
✅ Saved: baseline_outputs/baseline_review_2402.00157v4.pdf
Processing 2305.14992v2...
✅ Saved: baseline_outputs/baseline_review_2305.14992v2.pdf
Processing 2309.00071v2...
✅ Saved: baseline_outputs/baseline_review_2309.00071v2.pdf
Processing 2309.06180v1...
✅ Saved: baseline_outputs/baseline_review_2309.06180v1.pdf


In [None]:
!pip install --quiet rouge-score nltk PyPDF2 bert-score sentence-transformers

import os
import re
import pandas as pd
from PyPDF2 import PdfReader
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bert_score
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model, loaded once at cell level and reused by
# compute_content_metrics for the cosine-similarity score.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

def extract_text_from_pdf(path):
    """Return the text of every page of the PDF at *path*, concatenated.

    A page whose ``extract_text()`` result is falsy contributes an empty
    string, so the result is always a ``str``.
    """
    page_texts = []
    for page in PdfReader(path).pages:
        page_texts.append(page.extract_text() or "")
    return "".join(page_texts)

def extract_section(text, heading, stop_headings):
    """Return the body of the section opened by *heading*.

    The section starts after the first case-insensitive occurrence of
    *heading* (an optional trailing ":" or "." is skipped) and ends at the
    first following line that begins with either

    * one of *stop_headings* (case-insensitive, not followed by another
      word character), or
    * a capitalised word (an uppercase letter followed by lowercase
      letters), which is treated as the start of the next heading.

    Previously the whole pattern ran under ``re.IGNORECASE``, which turned
    the capitalised-word rule into "any two letters" and let single-character
    stops such as "I" match ordinary letters, so sections were cut at the
    first line break. Case-insensitivity is now scoped to the heading and the
    named stop headings only, and both are regex-escaped.

    Args:
        text: Full document text to search.
        heading: Literal heading that opens the section.
        stop_headings: Literal headings that close the section.

    Returns:
        The stripped section body. If no section can be located, a notice is
        printed and the first 1000 characters of *text* (stripped) are
        returned as a fallback.
    """
    stop_alternatives = []
    named_stops = "|".join(re.escape(stop) for stop in stop_headings)
    if named_stops:
        stop_alternatives.append(rf"(?i:{named_stops})(?!\w)")
    # Kept case-sensitive on purpose: only a capitalised word counts.
    stop_alternatives.append(r"[A-Z][a-z]+")

    pattern = (
        rf"(?i:{re.escape(heading)})[:.]?\s*(.*?)"
        rf"\n(?:{'|'.join(stop_alternatives)})"
    )
    match = re.search(pattern, text, re.DOTALL)
    if match:
        return match.group(1).strip()

    print(f"Heading '{heading}' not found. Using fallback paragraph.")
    return text[:1000].strip()

def compute_rouge_and_bleu(reference, hypothesis):
    """Score *hypothesis* against *reference* with ROUGE-L and sentence BLEU.

    ROUGE-L is computed with stemming enabled and its F-measure is used.
    BLEU is computed on whitespace-split tokens against a single reference
    with NLTK's ``method4`` smoothing.

    Returns:
        ``(rouge_l_f1, bleu)``, each rounded to 4 decimal places.
    """
    rouge_l = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True).score(
        reference, hypothesis
    )['rougeL']

    bleu = sentence_bleu(
        [reference.split()],
        hypothesis.split(),
        smoothing_function=SmoothingFunction().method4,
    )

    return round(rouge_l.fmeasure, 4), round(bleu, 4)

# Lazily created BERTScorer shared by all calls to compute_content_metrics.
_BERT_SCORER = None


def _get_bert_scorer():
    """Return the shared English BERTScorer, creating it on first use."""
    global _BERT_SCORER
    if _BERT_SCORER is None:
        # Imported here so the helper is usable with only the package-level
        # `bert_score` install; the scorer class keeps the model in memory.
        from bert_score import BERTScorer
        _BERT_SCORER = BERTScorer(lang='en')
    return _BERT_SCORER


def compute_content_metrics(reference, hypothesis):
    """Score *hypothesis* against *reference* with BERTScore and SBERT cosine.

    The module-level ``bert_score.score`` function loads its model on every
    call; a single cached ``BERTScorer`` is used instead so the model is
    loaded once for the whole run.

    Args:
        reference: Reference text.
        hypothesis: Candidate text to evaluate.

    Returns:
        ``(bertscore_f1, sbert_cosine)``, each rounded to 4 decimal places.
    """
    # BERTScorer.score takes candidates first, references second.
    _, _, F1 = _get_bert_scorer().score([hypothesis], [reference])
    ref_emb = sbert_model.encode(reference, convert_to_tensor=True)
    hyp_emb = sbert_model.encode(hypothesis, convert_to_tensor=True)
    cosine_sim = util.pytorch_cos_sim(ref_emb, hyp_emb).item()
    return round(F1[0].item(), 4), round(cosine_sim, 4)

# Compare the pipeline ("main") review and the baseline review of each paper
# against the paper's abstract, using the same four metrics for both.
# NOTE(review): earlier cells read the pipeline reviews from "output";
# confirm that "outputs" is the intended directory here.
main_dir = "outputs"
baseline_dir = "baseline_outputs"
paper_dir = "papers"

results = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the resulting table stable between runs.
for filename in sorted(os.listdir(main_dir)):
    if not (filename.startswith("review_") and filename.endswith(".pdf")):
        continue

    # Strip only the leading "review_" and the trailing ".pdf"; str.replace
    # would also delete those substrings if they occurred inside the ID.
    paper_id = filename[len("review_"):-len(".pdf")]

    main_review_path = os.path.join(main_dir, filename)
    baseline_review_path = os.path.join(baseline_dir, f"baseline_review_{paper_id}.pdf")
    paper_path = os.path.join(paper_dir, f"{paper_id}.pdf")

    print(f"\nProcessing: {paper_id}")

    if not os.path.exists(baseline_review_path):
        print(f"Missing baseline review for {paper_id}, skipping...")
        continue
    if not os.path.exists(paper_path):
        print(f"Missing original paper for {paper_id}, skipping...")
        continue

    try:
        paper_text = extract_text_from_pdf(paper_path)
        main_review = extract_text_from_pdf(main_review_path)
        baseline_review = extract_text_from_pdf(baseline_review_path)

        paper_abstract = extract_section(paper_text, "Abstract", ["1", "I", "Introduction"])
        main_summary = extract_section(main_review, "Summary", ["Key Contributions", "Technical Soundness", "Strengths", "Weaknesses"])
        baseline_summary = extract_section(baseline_review, "Summary", ["Key Contributions", "Technical Soundness", "Strengths", "Weaknesses"])

        # All three texts are required; skip the paper if any is empty.
        if not paper_abstract or not main_summary or not baseline_summary:
            print(f"Missing abstract or summaries for {paper_id}, skipping...")
            continue

        r_main, b_main = compute_rouge_and_bleu(paper_abstract, main_summary)
        bert_main, cos_main = compute_content_metrics(paper_abstract, main_summary)

        r_base, b_base = compute_rouge_and_bleu(paper_abstract, baseline_summary)
        bert_base, cos_base = compute_content_metrics(paper_abstract, baseline_summary)

        results.append({
            "Paper ID": paper_id,
            "ROUGE-L (Main)": r_main,
            "BLEU (Main)": b_main,
            "BERTScore F1 (Main)": bert_main,
            "SBERT Cosine (Main)": cos_main,
            "ROUGE-L (Baseline)": r_base,
            "BLEU (Baseline)": b_base,
            "BERTScore F1 (Baseline)": bert_base,
            "SBERT Cosine (Baseline)": cos_base
        })

    except Exception as e:
        # Best-effort per paper: report the failure and continue with the next one.
        print(f"Error with {paper_id}: {e}")

df = pd.DataFrame(results)
print("\nFinal Comparison Table")
display(df)


Processing: 2406.11036v1
⚠️ Heading 'Summary' not found. Using fallback paragraph.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing: 2401.04334v
⚠️ Missing baseline review for 2401.04334v, skipping...

Processing: 2402.13457v2


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing: 2309.06180v1


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing: 2305.14992v2


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing: 2402.03300v3
⚠️ Heading 'Summary' not found. Using fallback paragraph.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing: 2309.00071v2
⚠️ Heading 'Summary' not found. Using fallback paragraph.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing: 2312.07104v2
⚠️ Heading 'Summary' not found. Using fallback paragraph.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing: 2406.11931v1


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing: 2402.00157v4
⚠️ Heading 'Summary' not found. Using fallback paragraph.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



✅ Final Comparison Table


Unnamed: 0,Paper ID,ROUGE-L (Main),BLEU (Main),BERTScore F1 (Main),SBERT Cosine (Main),ROUGE-L (Baseline),BLEU (Baseline),BERTScore F1 (Baseline),SBERT Cosine (Baseline)
0,2406.11036v1,0.0,0.0,0.8132,0.178,0.068,0.0059,0.8296,0.3824
1,2402.13457v2,0.2,0.0176,0.8406,0.1373,0.0,0.0,0.7855,0.182
2,2309.06180v1,0.0,0.0,0.8157,0.2107,0.0,0.0,0.8048,0.0979
3,2305.14992v2,0.0,0.0,0.8313,0.1022,0.0,0.0,0.7649,0.138
4,2402.03300v3,0.16,0.0486,0.8625,0.5374,0.0658,0.0038,0.8182,0.4866
5,2309.00071v2,0.08,0.0,0.8227,0.4257,0.0676,0.0105,0.8259,0.3927
6,2312.07104v2,0.0,0.0,0.8315,0.3384,0.0927,0.0118,0.8247,0.6705
7,2406.11931v1,0.6207,0.1553,0.9566,0.9226,0.0,0.0,0.7672,0.098
8,2402.00157v4,0.0952,0.0166,0.8415,0.0298,0.0417,0.0031,0.8134,0.5425
