# AI-Critique (Automated Peer Review)

- A Multi-stage NLP approach to enhance evaluation of research papers

Team Members:
  - Manoghn Kandiraju
  - Sanjiv Motilal Choudhari
  - Snigdha Mohana Adepalli
  - Sai Manichandana Devi Thumati

# Installation of Required Packages

- Installs necessary packages
- Clones the PeerRead repository

In [1]:
# Install the third-party packages used by this notebook.
!pip install transformers sentencepiece gradio PyMuPDF
!pip install sentence-transformers nltk
!pip install huggingface_hub
# Clone the PeerRead dataset repository into the current working directory.
!git clone https://github.com/allenai/PeerRead.git
# NOTE(review): PyMuPDF is already listed in the first pip command; this line repeats it.
!pip install PyMuPDF
!pip install PyPDF2

Collecting gradio
  Downloading gradio-5.25.2-py3-none-any.whl.metadata (16 kB)
Collecting PyMuPDF
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloadin

# Importing Necessary Libraries

- Imports the libraries used throughout the notebook
- These libraries are essential for summarisation, text processing, sentiment analysis (used for bias detection), and plagiarism detection

In [2]:
import re
import fitz  # PyMuPDF
import nltk
import torch
import json
import requests
import PyPDF2
import numpy as np
from nltk.sentiment import SentimentIntensityAnalyzer
from sentence_transformers import SentenceTransformer, util

nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

# Device Configuration and set up

Sets up the device for PyTorch computations and the Hugging Face Inference API settings (model name, endpoint URL, and request headers)

In [5]:
import os

# Run the local models on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hosted critique model and its Hugging Face Inference API endpoint.
HF_MODEL_NAME = "Manoghn/mistral-qlora-critique"
HF_API_URL = f"https://api-inference.huggingface.co/models/{HF_MODEL_NAME}"  # Public model, no request access needed

# Read the access token from the HF_API_TOKEN environment variable so that the
# secret never has to be written into (and shared with) the notebook itself.
# Surrounding whitespace is stripped so a blank value counts as "no token".
HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "").strip()

# Only send an Authorization header when a token is actually configured;
# a blank "Bearer" value carries no credentials.
HEADERS = {"Content-Type": "application/json"}
if HF_API_TOKEN:
    HEADERS["Authorization"] = f"Bearer {HF_API_TOKEN}"

# Load Models for the different feature implementations

Loads the models for the different features of the AI-Critique implementation:
- BART: For text summarization.
- SentimentIntensityAnalyzer: For bias detection.
- SentenceTransformer: For plagiarism detection.
- It also loads and encodes PeerRead paragraphs for plagiarism comparison.

In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import os
# Load BART summarization model
tok = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
sum_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn").to(device)

# Initialize sentiment analyzer and plagiarism model
sia = SentimentIntensityAnalyzer()
plag_model = SentenceTransformer('all-MiniLM-L6-v2')

# Load PeerRead paragraphs and encode them in advance
# Clone PeerRead repo and load paragraphs
!git clone https://github.com/allenai/PeerRead.git /content/PeerRead
# Load PeerRead Paragraphs
peer_paragraphs = []
review_dir = "/content/PeerRead/data/acl_2017/train/reviews/"
import glob
review_files = glob.glob(os.path.join(review_dir, "*.json"))
for file in review_files:
    if len(peer_paragraphs) >= 5000:
        break
    try:
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)
            abstract = data.get("abstract", "").strip()
            if len(abstract.split()) > 20:
                peer_paragraphs.append(abstract)
    except:
        continue
peerread_embeddings = plag_model.encode(peer_paragraphs, convert_to_tensor=True) if peer_paragraphs else None


fatal: destination path '/content/PeerRead' already exists and is not an empty directory.


# Text Extraction from the papers

- Extracts texts from papers
- divides them into sections and paragraphs

In [7]:
def extract_text_from_pdf(filepath):
    """
    Extracts text from all pages of a PDF file.
    Args:
        filepath (str): Path to the PDF file.
    Returns:
        str: Text of every page, joined with newlines in page order.
    """
    # Open the document as a context manager so it is closed as soon as
    # the text has been read, even if a page fails to extract.
    with fitz.open(filepath) as doc:
        return "\n".join(page.get_text() for page in doc)

def split_into_sections(text):
    """
    Splits the full text into numbered sections using regex.

    A heading is any line that starts with a digit and is preceded by the
    start of the text or a newline. Each section runs from its heading to
    the next detected heading (or the end of the text) and includes the
    heading line itself. Text before the first heading is not returned.
    When the same heading occurs more than once, the contents are joined
    with a blank line so that no occurrence overwrites an earlier one.

    Args:
        text (str): Full extracted text.
    Returns:
        dict: Mapping of section title to section content, in order of
        first appearance. Empty when no heading is found.
    """
    pattern = r"(?:^|\n)(\d+\.?\d*.*?)\n"
    # Scan once and reuse the matches for both the titles and the offsets.
    matches = list(re.finditer(pattern, text))
    # Section i spans [boundaries[i], boundaries[i + 1]).
    boundaries = [m.start() for m in matches] + [len(text)]

    sections = {}
    for index, match in enumerate(matches):
        title = match.group(1).strip()
        content = text[boundaries[index]:boundaries[index + 1]].strip()
        if title in sections:
            # Repeated heading: append instead of discarding the earlier text.
            sections[title] = f"{sections[title]}\n\n{content}"
        else:
            sections[title] = content
    return sections

def split_into_paragraphs(section_text):
    """
    Breaks a section into paragraphs separated by blank lines.
    Args:
        section_text (str): Text from a section.
    Returns:
        list: Whitespace-trimmed paragraphs longer than 30 characters.
    """
    kept = []
    for chunk in section_text.split("\n\n"):
        trimmed = chunk.strip()
        # Fragments of 30 characters or fewer are dropped.
        if len(trimmed) <= 30:
            continue
        kept.append(trimmed)
    return kept

# Summary, Critique, Bias and Plagiarism

The most important phase in the implementation of the Automated Peer Reviewer

In [8]:
def call_huggingface_api(prompt, timeout=300):
    """
    Sends a prompt to the Hugging Face model for critique generation.

    Failures are reported as strings rather than raised, so a single bad
    request does not abort the analysis of the whole document.

    Args:
        prompt (str): Prompt text to evaluate.
        timeout (float | tuple): Seconds to wait for the server before
            giving up; passed to ``requests.post``. Without it a stalled
            request would block forever.
    Returns:
        str: The generated text after the last "[/INST]" marker, an
        "Error ..." message, or the raw JSON response when its shape is
        not recognised.
    """
    payload = {
        "inputs": f"<s>[INST] {prompt} [/INST]</s>",
        "options": {"use_cache": True, "wait_for_model": True}
    }
    try:
        response = requests.post(
            url=HF_API_URL,
            headers=HEADERS,
            data=json.dumps(payload),
            timeout=timeout,
        )
        response.raise_for_status()
        result = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException subclass.
        return f"Error: {e}"

    # Successful generations arrive as a non-empty list of dicts.
    if isinstance(result, list) and result:
        first = result[0]
        if isinstance(first, dict) and "generated_text" in first:
            # The response may echo the prompt; keep only the text after it.
            return first["generated_text"].split("[/INST]")[-1].strip()
    if isinstance(result, dict) and "error" in result:
        return f"Error from HF: {result['error']}"
    # Unknown shape: hand back the raw JSON so the caller can inspect it.
    return json.dumps(result)

def generate_summary(text):
    """
    Summarizes a section of text with the BART model.
    Args:
        text (str): Section text.
    Returns:
        str: Decoded summary with special tokens removed.
    """
    # Tokenize as a batch of one; input beyond 1024 tokens is truncated.
    encoded = tok([text], return_tensors="pt", max_length=1024, truncation=True)
    encoded = encoded.to(device)

    # Beam search producing between 80 and 250 tokens.
    generation_options = {
        "max_length": 250,
        "min_length": 80,
        "length_penalty": 2.0,
        "num_beams": 4,
    }
    generated = sum_model.generate(**encoded, **generation_options)

    first_sequence = generated[0]
    return tok.decode(first_sequence, skip_special_tokens=True)

def detect_bias(paragraph):
    """
    Flags a paragraph as potentially biased based on its sentiment.
    Args:
        paragraph (str): Paragraph text.
    Returns:
        tuple: Compound sentiment score, and 'yes' when its magnitude
        exceeds 0.5, otherwise 'no'.
    """
    polarity = sia.polarity_scores(paragraph)
    compound = polarity['compound']
    # Strong sentiment in either direction counts as potential bias.
    if abs(compound) > 0.5:
        flag = 'yes'
    else:
        flag = 'no'
    return compound, flag

def detect_plagiarism(paragraph, threshold=0.35):
    """
    Calculates cosine similarity with PeerRead content.

    The paragraph is embedded with the sentence-transformer model and
    compared against every pre-computed PeerRead embedding; the highest
    similarity is reported.

    Args:
        paragraph (str): Paragraph text.
        threshold (float): Similarity above which the paragraph is
            flagged. Defaults to 0.35.
    Returns:
        tuple: Similarity score, binary flag ('yes'/'no'). When no
        reference embeddings are available the result is (0.0, 'no').
    """
    # The reference corpus is None when no PeerRead paragraphs were loaded;
    # there is nothing to compare against in that case.
    if peerread_embeddings is None or len(peerread_embeddings) == 0:
        return 0.0, 'no'

    para_emb = plag_model.encode(paragraph, convert_to_tensor=True)
    # Row 0 holds the similarities of this paragraph to every reference.
    cos_scores = util.pytorch_cos_sim(para_emb, peerread_embeddings)[0]
    top_score = float(torch.max(cos_scores))
    return top_score, 'yes' if top_score > threshold else 'no'

# Upload and process a pdf

1. Takes the path of an uploaded sample PDF
2. Processes the file using the helper functions defined earlier to:
- Extract text
- Generate summary
- Analyze paragraph for Critique
- Bias and Plagiarism check

In [12]:
def process_pdf(filepath):
    """
    Runs the full analysis pipeline on a PDF.

    Every detected section is summarized. A section whose title contains
    "reference" receives one critique covering the whole section, with the
    bias and plagiarism fields set to 'N/A'. Every other section is split
    into paragraphs, each of which gets a critique, a bias check, and a
    plagiarism check.

    Args:
        filepath (str): Path to uploaded PDF file
    Returns:
        dict: Section title mapped to {'summary': str, 'paragraphs': list}
    """
    analysis = {}
    document_text = extract_text_from_pdf(filepath)

    for heading, body in split_into_sections(document_text).items():
        # Summarize first, then review the section's content.
        summary = generate_summary(body)
        entries = []

        if 'reference' not in heading.lower():
            for chunk in split_into_paragraphs(body):
                review_prompt = (
                    "You are an academic peer reviewer. Provide 3 to 5 constructive review points about the following section.\n"
                    "Focus on clarity, structure, grammar, flow, relevance, and any major issues.\n"
                    "Avoid copying or summarizing the content directly.\n"
                    "Respond in concise bullet-point format.\n\n"
                    f"Section:\n{chunk}\n\nPeer Review Suggestions:"
                )
                feedback = call_huggingface_api(review_prompt)
                sentiment, bias_flag = detect_bias(chunk)
                similarity, plagiarism_flag = detect_plagiarism(chunk)
                entries.append({
                    'text': chunk,
                    'critique': feedback,
                    'bias_score': sentiment,
                    'biased': bias_flag,
                    'plagiarism_score': similarity,
                    'plagiarized': plagiarism_flag
                })
        else:
            # Reference lists are reviewed as a whole, for formatting only.
            reference_prompt = f"Give feedback on the formatting, consistency, and citation style used in this reference section:\n{body}"
            entries.append({
                'text': body,
                'critique': call_huggingface_api(reference_prompt),
                'bias_score': 'N/A',
                'biased': 'N/A',
                'plagiarism_score': 'N/A',
                'plagiarized': 'N/A'
            })

        analysis[heading] = {'summary': summary, 'paragraphs': entries}

    return analysis

# Generate result for the uploaded pdf

- Executes analysis on uploaded PDF and gives results
- Outputs a summary for each section, and a critique, bias score, and plagiarism score for each paragraph

In [13]:
from google.colab import files

# Upload a PDF file
# The first name yielded by iterating `uploaded` is taken as the file to analyse.
uploaded = files.upload()
# NOTE(review): assumes the uploaded file is saved under /content — confirm for non-default working directories.
pdf_path = f"/content/{next(iter(uploaded))}"

# Extract and process the file
# NOTE(review): raw_text is not used again in this cell; process_pdf extracts the text itself.
raw_text = extract_text_from_pdf(pdf_path)
results = process_pdf(pdf_path)

# Print final output
# One header per section, then its summary, then every analysed paragraph with its
# critique, bias result, and plagiarism result.
for section, data in results.items():
    print(f"\n\n=== Section: {section} ===")
    if 'summary' in data:
        print(f"\nSummary:\n{data['summary']}")
    for p in data.get('paragraphs', []):
        print(f"\nParagraph:\n{p['text']}\nCritique: {p['critique']}\nBias Score: {p['bias_score']}, Biased: {p['biased']}, Plagiarism Score: {p['plagiarism_score']}, Plagiarized: {p['plagiarized']}")

Saving Handwritten Equation Solver 1.pdf to Handwritten Equation Solver 1 (2).pdf


KeyboardInterrupt: 