# Process Raw Data

In [None]:
%pip install openreview-py numpy pandas nltk transformers tqdm torch
%pip install torch torchvision torchaudio
%pip install pylats taaled spacy convokit textstat simpletransformers

%pip install spacy
# English models
%python -m spacy download en_core_web_sm
%python -m spacy download en_core_web_trf
# Spanish models (used as fallback)
%python -m spacy download es_core_news_sm
%python -m spacy download es_dep_news_trf

%pip install textblob
%python -m textblob.download_corpora

In [None]:
import io
import os
import re
import sys
import csv
import json
import time
import random
import pickle
from tqdm import tqdm
from datetime import datetime
import pandas as pd
import numpy as np
from collections import Counter

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')
import textstat
from textblob import TextBlob
from taaled import ld
from pylats import lats
from convokit import Corpus, TextParser, PolitenessStrategies, Classifier, Utterance, Speaker, download

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from simpletransformers.ner import NERModel, NERArgs

from ollama import chat

In [None]:
# NEURIPS 2023
# Path of the pickled raw submissions; later cells reuse `file_path`.
file_path = '../data/raw/neurips-2023.pkl'

# NOTE: unpickling can execute arbitrary code, so only load trusted files.
with open(file_path, 'rb') as raw_handle:
    data = pickle.load(raw_handle)

In [None]:
# Destination of the submission-level table.
output_json_path = '../data/processed/neurips-2023.json'

# Load the raw submissions from the pickle at `file_path`.
with open(file_path, 'rb') as raw_handle:
    data = pickle.load(raw_handle)


def _content_value(note, key):
    """Return ``note.content[key]['value']``, or NaN when the field is absent."""
    content = note.content
    if key in content and 'value' in content[key]:
        return content[key]['value']
    return np.nan


# One flat record per submission; missing attributes and fields become NaN.
extracted_data = [
    {
        'number': getattr(note, 'number', np.nan),
        'id': getattr(note, 'id', np.nan),
        'content.paperhash': _content_value(note, 'paperhash'),
        'content.authorids': _content_value(note, 'authorids'),
        'cdate': getattr(note, 'cdate', np.nan),
        'content.title': _content_value(note, 'title'),
        'content.abstract': _content_value(note, 'abstract'),
        # 'content.TLDR': submission.content['TLDR']['value'] if 'TLDR' in submission.content and 'value' in submission.content['TLDR'] else np.nan,
    }
    for note in data
]

# Tabulate the records and write them out as a JSON array of row objects.
df = pd.DataFrame(extracted_data)
df.to_json(output_json_path, orient='records', indent=4)
print(f"DataFrame saved to JSON file at: {output_json_path}")

In [None]:
# Load the JSON file as a pandas DataFrame
df_json = pd.read_json('../data/processed/neurips-2023.json')

# Display the DataFrame as the cell output (no head() is applied)
df_json

In [None]:
# Number of direct replies (reviews, comments, and meta-reviews) per submission.
# Iterating over `data` itself avoids hard-coding the submission count
# (3395 for this dump), and submissions without a 'directReplies' entry
# count as zero replies instead of raising KeyError.
num_of_reviews = [
    len(submission.details.get('directReplies', []))
    for submission in data
]
print('min:', min(num_of_reviews))
print('max:', max(num_of_reviews))
print('mean:', np.mean(num_of_reviews))
print('median:', np.median(num_of_reviews))
print('std:', np.std(num_of_reviews))
print('percentiles:', np.percentile(num_of_reviews, [25, 50, 75, 90, 95, 99]))

In [None]:
# Distinct reply kinds, taken from the last path segment of each reply's
# first invitation. Iterating over `data` removes the hard-coded count of
# 3395 submissions; a missing 'directReplies' entry contributes nothing.
comment_types = {
    reply['invitations'][0].split('/')[-1]
    for submission in data
    for reply in submission.details.get('directReplies', [])
}

comment_types

In [None]:
def _field_value(content, key, default=None):
    """Return ``content[key]['value']``, or *default* when the field is missing."""
    if key in content and 'value' in content[key]:
        return content[key]['value']
    return default


def _leading_int(content, key, separator):
    """Return the integer before the first *separator* in a score field.

    Returns None when the field is missing.
    """
    if key in content and 'value' in content[key]:
        return int(content[key]['value'].split(separator)[0])
    return None


# Build one row per official review, repeating the submission-level fields.
new_rows = []
for submission in data:
    paper = submission.content
    paper_fields = {
        'submission_id': submission.id,
        'submission_number': submission.number,
        'submission_creation_date': getattr(submission, 'cdate', np.nan),
        'submission_authors': _field_value(paper, 'authorids', np.nan),
        'submission_title': _field_value(paper, 'title'),
        'submission_abstract': _field_value(paper, 'abstract'),
    }

    # Submissions without any direct replies contribute no rows.
    if 'directReplies' not in submission.details:
        continue

    for reply in submission.details['directReplies']:
        # Only replies whose first invitation ends in 'Official_Review' are kept.
        if reply['invitations'][0].split('/')[-1] != 'Official_Review':
            continue

        review = reply['content']
        new_rows.append({
            **paper_fields,

            # Last path segment of the first signature identifies the reviewer.
            'reviewer': reply['signatures'][0].split('/')[-1],
            'review_tcdate': reply.get('tcdate'),
            'review_cdate': reply.get('cdate'),
            'review_tmdate': reply.get('tmdate'),
            'review_mdate': reply.get('mdate'),

            'review_summary': _field_value(review, 'summary'),
            'review_strengths': _field_value(review, 'strengths'),
            'review_weaknesses': _field_value(review, 'weaknesses'),
            'review_questions': _field_value(review, 'questions'),
            'review_limitations': _field_value(review, 'limitations'),

            # Rating/confidence put the number before ':'; the other scores before ' '.
            'review_rating': _leading_int(review, 'rating', ':'),
            'review_confidence': _leading_int(review, 'confidence', ':'),
            'review_soundness': _leading_int(review, 'soundness', ' '),
            'review_presentation': _leading_int(review, 'presentation', ' '),
            'review_contribution': _leading_int(review, 'contribution', ' '),
        })

# Tabulate the collected rows.
df_reviews = pd.DataFrame(new_rows)

# Write the review-level table to the processed JSON file.
output_json_path = '../data/processed/neurips-2023.json'
df_reviews.to_json(output_json_path, orient='records', indent=4)

print(f"Updated DataFrame with reviews saved to JSON file at: {output_json_path}")

In [None]:
# Load the JSON file as a pandas DataFrame
df_reviews = pd.read_json('../data/processed/neurips-2023.json')

# Display the DataFrame as the cell output (no head() is applied)
df_reviews

In [None]:
# The free-text review sections that are merged into one 'total_review' column.
review_text_columns = [
    'review_summary',
    'review_strengths',
    'review_weaknesses',
    'review_questions',
    'review_limitations',
]

# Join the available sections with single spaces; missing sections are skipped.
df_reviews['total_review'] = df_reviews[review_text_columns].apply(
    lambda row: ' '.join(row.dropna()), axis=1
)

# Drop the original columns to reduce redundancy
df_reviews = df_reviews.drop(columns=review_text_columns)
# Number of whitespace-separated words in the merged review text
df_reviews['length_words'] = df_reviews['total_review'].apply(lambda x: len(x.split()))


# Save a snapshot next to the other processed files. The previous target was a
# user-specific absolute path named after ICLR 2024, which does not match this
# NeurIPS 2023 data and is not portable to other machines.
new_output_json_path = '../data/processed/neurips-2023-total-review.json'
df_reviews.to_json(new_output_json_path, orient='records', indent=4)

print(f"New DataFrame with 'total_review' column saved to JSON file at: {new_output_json_path}")

In [None]:
# Compiled once at definition time instead of being rebuilt on every call.
_CITATION_RE = re.compile(
    '|'.join([
        r'\[\d+(?:,\s*\d+)*\]',                         # [1], [1, 2, 3]
        r'\([A-Za-z]+ et al\.,\s*\d{4}\)',               # (Smith et al., 2020)
        r'\(\d{4}[a-z]?\)',                              # (2020), (2020a)
        r'\[[A-Za-z]+\d{4}[a-z]?\]',                     # [Smith2020], [Johnson2021a]
        # Matched case-insensitively so the usual spellings "arXiv:" and
        # "DOI:" are counted as well.
        r'\b(?i:doi:|arxiv:|https?://[^\s]+)',            # DOI, arXiv, URLs
    ])
)


def count_citations(text):
    """Count citation-like patterns in a piece of text.

    Args:
        text: The text to scan.

    Returns:
        The number of non-overlapping matches of the citation patterns
        (numeric brackets, author-year forms, DOI/arXiv prefixes, URLs).
        Non-string input (e.g. None or NaN) yields 0.
    """
    if not isinstance(text, str):
        return 0
    return len(_CITATION_RE.findall(text))


# Apply the count_citations function to the 'total_review' column and create a new column 'citation_count'
df_reviews['citation_count'] = df_reviews['total_review'].apply(count_citations)
# Display the updated DataFrame as the cell output
df_reviews

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Tokenizer and sequence-classification model loaded from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained("shahrukhx01/bert-mini-finetune-question-detection")
model = AutoModelForSequenceClassification.from_pretrained("shahrukhx01/bert-mini-finetune-question-detection")
# NOTE: `tokenizer` and `model` are notebook-level names that later cells
# rebind (SPECTER, hedge detection); re-run this cell before calling
# count_questions again after those cells have run.
model.to(device)
# Switch the model to evaluation mode
model.eval()


def count_questions(review_text):
    """Count the sentences of a review that the classifier labels as questions.

    The text is split with ``sent_tokenize`` and each sentence is classified
    with the notebook-level ``tokenizer``/``model`` on ``device``.

    Args:
        review_text: The review text; may be empty or None.

    Returns:
        The number of sentences predicted as label 0 (question). Empty or
        None input returns 0.
    """
    # Initialised before the guard: previously an empty/None review reached
    # the return statement with the counter unbound (UnboundLocalError).
    question_count = 0
    if not review_text:
        return question_count

    for sent in sent_tokenize(review_text):
        inputs = tokenizer(
            sent,
            return_tensors="pt",
            truncation=True,
            max_length=64,
            padding=True
        ).to(device)
        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            outputs = model(**inputs)
            predicted = torch.argmax(outputs.logits, dim=1).item()

        # Label 0 = question
        if predicted == 0:
            question_count += 1

    return question_count


# Classify every review. Only the text column is needed, so it is iterated
# directly with a progress bar.
df_reviews['question_count'] = list(
    map(count_questions, tqdm(df_reviews['total_review'], desc="Processing reviews"))
)
df_reviews

In [None]:
# Pull out the four timestamp columns once.
cdate = df_reviews['review_cdate']
tcdate = df_reviews['review_tcdate']
tmdate = df_reviews['review_tmdate']
mdate = df_reviews['review_mdate']

# Row counts where each pair of timestamps agrees / disagrees.
cdate_tcdate_equal = (cdate == tcdate).sum()
cdate_tcdate_not_equal = (cdate != tcdate).sum()
tmdate_mdate_equal = (tmdate == mdate).sum()
tmdate_mdate_not_equal = (tmdate != mdate).sum()

# Report the four counts.
print(f"cdate and tcdate equal: {cdate_tcdate_equal}")
print(f"cdate and tcdate not equal: {cdate_tcdate_not_equal}")
print(f"tmdate and mdate equal: {tmdate_mdate_equal}")
print(f"tmdate and mdate not equal: {tmdate_mdate_not_equal}")

In [None]:
# Remove the tcdate/tmdate columns and give the remaining cdate/mdate pair
# descriptive names, in a single chained expression.
df_reviews = (
    df_reviews
    .drop(columns=['review_tcdate', 'review_tmdate'])
    .rename(columns={'review_cdate': 'creation_date', 'review_mdate': 'last_modification_date'})
)

# Show the result as the cell output
df_reviews

In [23]:
# Persist the current DataFrame, overwriting the processed reviews JSON file
out = '../data/processed/neurips-2023.json'
df_reviews.to_json(out, orient='records', indent=4)

In [None]:
# Read the JSON file as a pandas DataFrame
df_reviews = pd.read_json('../data/processed/neurips-2023.json')

# Display the DataFrame as the cell output (no head() is applied)
df_reviews

In [None]:
def compute_mattr(review_text):
    """Return the ``mattr`` lexical-diversity value of a review as a string.

    The text is normalised with ``lats.Normalize`` (English parameters) and
    the resulting tokens are passed to ``ld.lexdiv``.

    Args:
        review_text: The review text, or None.

    Returns:
        The value formatted with four decimals, or "" when the text is None
        or the computation fails.
    """
    if review_text is None:
        return ""

    # Newlines become spaces. Replacing them with nothing fused the last word
    # of one line with the first word of the next and distorted the tokens.
    normalized_text = review_text.strip().replace('\n', ' ')
    try:
        cleaned = lats.Normalize(normalized_text, lats.ld_params_en)
        return f"{ld.lexdiv(cleaned.toks).mattr:.4f}"
    except Exception:
        # Best effort: a review that cannot be scored gets an empty value.
        return ""


# Score every review. Only the text column is needed, so it is iterated
# directly with a progress bar.
df_reviews['mattr'] = list(
    map(compute_mattr, tqdm(df_reviews['total_review'], desc="Processing reviews"))
)
df_reviews

In [5]:
# Persist the current DataFrame, overwriting the processed reviews JSON file
out = '../data/processed/neurips-2023.json'
df_reviews.to_json(out, orient='records', indent=4)

In [None]:
def compute_sentiment_polarity(review_text):
    """Return the TextBlob sentiment polarity of a review.

    Args:
        review_text: The review text; non-string values (e.g. None) are
            treated as missing.

    Returns:
        ``TextBlob(...).sentiment.polarity`` for the stripped text, or ""
        when the text is missing or the computation fails.
    """
    # Guard first: calling .strip() on None used to raise before the
    # try/except could apply its fallback.
    if not isinstance(review_text, str):
        return ""

    try:
        return TextBlob(review_text.strip()).sentiment.polarity
    except Exception:
        # Best effort: a review that cannot be scored gets an empty value.
        return ""


# Score every review. Only the text column is needed, so it is iterated
# directly with a progress bar.
df_reviews['sentiment_polarity'] = list(
    map(compute_sentiment_polarity, tqdm(df_reviews['total_review'], desc="Processing reviews"))
)
df_reviews

In [8]:
# Persist the current DataFrame, overwriting the processed reviews JSON file
out = '../data/processed/neurips-2023.json'
df_reviews.to_json(out, orient='records', indent=4)

In [None]:
# Read the JSON file as a pandas DataFrame
df_reviews = pd.read_json('../data/processed/neurips-2023.json')

# Display the DataFrame as the cell output (no head() is applied)
df_reviews

In [None]:
# --- Load SPECTER model ---
# NOTE: this rebinds the notebook-level `tokenizer` and `model` that
# count_questions reads, so that function uses these objects from here on.
model_name = "allenai/specter"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Switch the model to evaluation mode
model.eval()

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def encoding_text(text):
    """Embed *text* with the notebook-level ``tokenizer`` and ``model``.

    The input is truncated to 512 tokens and moved to ``device``.

    Returns:
        The last hidden state at the first token position ([CLS]), keeping
        the leading batch dimension.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding=True,
    )
    encoded = encoded.to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        cls_embedding = hidden_states[:, 0, :]  # [CLS] token

    return cls_embedding


def compute_relevance_score(review_text, title, abstract):
    """Return the cosine similarity between a paper and a review embedding.

    The paper is encoded as title + separator token + abstract, the input
    format shown in the SPECTER model card, instead of a space-joined string.
    A missing (non-string) title or abstract is encoded as empty text; the
    previous f-string embedded the literal words "None"/"nan".

    Args:
        review_text: The full review text.
        title: The submission title, or a missing value.
        abstract: The submission abstract, or a missing value.

    Returns:
        The cosine similarity of the two embeddings as a Python float.
    """
    title_text = title if isinstance(title, str) else ""
    abstract_text = abstract if isinstance(abstract, str) else ""

    # Encode document
    doc_emb = encoding_text(f"{title_text}{tokenizer.sep_token}{abstract_text}")

    # Encode review text
    review_emb = encoding_text(review_text)

    # Compute cosine similarity
    return F.cosine_similarity(doc_emb, review_emb).item()


# Walk the three needed columns in lockstep. `total` is passed explicitly
# because zip() has no length for the progress bar to read.
similarity_inputs = zip(
    df_reviews['total_review'],
    df_reviews['submission_title'],
    df_reviews['submission_abstract'],
)
df_reviews['similarity_score'] = [
    compute_relevance_score(review, title, abstract)
    for review, title, abstract in tqdm(
        similarity_inputs, total=len(df_reviews), desc="Computing similarity scores"
    )
]

# Show the result as the cell output
df_reviews

In [11]:
# Persist the current DataFrame, overwriting the processed reviews JSON file
out = '../data/processed/neurips-2023.json'
df_reviews.to_json(out, orient='records', indent=4)

In [None]:
# Read the JSON file as a pandas DataFrame
df_reviews = pd.read_json('../data/processed/neurips-2023.json')
# Display the DataFrame as the cell output (no head() is applied)
df_reviews

In [None]:
# Calculate the time differences and add new columns
# NOTE(review): both spans end at 'last_modification_date' (the renamed review
# mdate) although the column names say "review_submission" — confirm that the
# last modification is the intended end point.
df_reviews['paper_submission_to_review_submission_time'] = df_reviews['last_modification_date'] - df_reviews['submission_creation_date']
df_reviews['review_creation_to_review_submission_time'] = df_reviews['last_modification_date'] - df_reviews['creation_date']

# Display the updated DataFrame
df_reviews

In [None]:
# Milliseconds in one day; the *_time columns are floor-divided by it.
ms_per_day = 24 * 60 * 60 * 1000

for span in ('paper_submission_to_review_submission', 'review_creation_to_review_submission'):
    df_reviews[f'{span}_days'] = df_reviews[f'{span}_time'] // ms_per_day

# The millisecond columns are no longer needed once the day counts exist.
df_reviews = df_reviews.drop(
    columns=[
        'paper_submission_to_review_submission_time',
        'review_creation_to_review_submission_time',
    ]
)

# Show the result as the cell output
df_reviews

In [6]:
# Persist the current DataFrame, overwriting the processed reviews JSON file
out = '../data/processed/neurips-2023.json'
df_reviews.to_json(out, orient='records', indent=4)

In [None]:
# Read the JSON file as a pandas DataFrame
df_reviews = pd.read_json('../data/processed/neurips-2023.json')
# Display the DataFrame as the cell output (no head() is applied)
df_reviews

In [None]:
# Enable tqdm for pandas apply
tqdm.pandas(desc="Scoring Readability")

# textstat functions to evaluate; each name is also the output column name.
_READABILITY_METRICS = (
    "flesch_reading_ease",
    "flesch_kincaid_grade",
    "gunning_fog",
    "smog_index",
    "automated_readability_index",
)


def readability_scores(text):
    """Compute the textstat readability metrics for one text.

    Args:
        text: The text to score.

    Returns:
        A dict mapping each metric name to its value rounded to four
        decimals. If any metric fails, every value is None.
    """
    try:
        return {
            name: round(getattr(textstat, name)(text), 4)
            for name in _READABILITY_METRICS
        }
    except Exception:
        # `except Exception` replaces a bare `except:` so that
        # KeyboardInterrupt/SystemExit are no longer swallowed while scoring.
        return dict.fromkeys(_READABILITY_METRICS)


# Expand the per-review dicts into one column per metric and append them.
readability_scores_df = df_reviews['total_review'].progress_apply(readability_scores).apply(pd.Series)
df_reviews = pd.concat([df_reviews, readability_scores_df], axis=1)
df_reviews

In [None]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Step 1: Load training corpus
print("📥 Downloading training corpus...")
train_corpus = Corpus(filename=download('wiki-politeness-annotated'))

# Step 2: Convert review data to Utterances with dummy speakers.
# Reviews whose stripped text is empty are skipped, so the corpus can hold
# fewer utterances than the DataFrame has rows.
review_utterances = []
for idx, row in tqdm(df_reviews.iterrows(), desc="🔧 Preparing Utterances", total=len(df_reviews)):
    review_text = row['total_review'].strip()
    if review_text:
        dummy_speaker = Speaker(id=f"reviewer_{idx}")
        review_utterances.append(
            Utterance(id=str(idx), text=review_text, speaker=dummy_speaker, meta={"orig_row": row})
        )

# Step 3: Build test corpus
print("📦 Building test corpus...")
test_corpus = Corpus(utterances=review_utterances)

# Step 4: Parse
print("🧠 Parsing utterances...")
parser = TextParser()
parser.transform(train_corpus)
parser.transform(test_corpus)

# Step 5: Extract politeness strategies
print("✨ Extracting politeness strategies...")
ps = PolitenessStrategies()
ps.transform(train_corpus)
ps.transform(test_corpus)

# Step 6: Train classifier
print("🎓 Training classifier...")
clf = Classifier(obj_type='utterance', pred_feats=['politeness_strategies'],
                 labeller=lambda utt: utt.meta.get("Binary") == 1)

# NOTE(review): this only sets an attribute on the classifier object; it
# presumably does not move any computation to the GPU — confirm against
# ConvoKit before relying on it.
clf.device = device

# Fit on the annotated corpus, then annotate the review corpus.
clf.fit(train_corpus)
clf.transform(test_corpus)

# Step 7: Compute politeness scores and add them to the dataframe
print("📈 Computing politeness scores...")
# Summarise once. Calling clf.summarize() inside the loop rebuilt the whole
# summary table for every utterance.
score_by_utterance_id = clf.summarize(test_corpus)["pred_score"].to_dict()

# Look scores up by row index (the utterance id is str(idx)) so rows that
# were skipped above get "" instead of shifting or breaking the column.
politeness_scores = []
for idx in tqdm(df_reviews.index, desc="🔗 Assigning Scores"):
    score = score_by_utterance_id.get(str(idx))
    politeness_scores.append("" if score is None else round(score, 4))

# Add politeness scores to the dataframe
df_reviews['politeness_score'] = politeness_scores

print("✅ Politeness scores added to the dataframe.")


In [6]:
# Persist the current DataFrame, overwriting the processed reviews JSON file
out = '../data/processed/neurips-2023.json'
df_reviews.to_json(out, orient='records', indent=4)

In [None]:
# Read the JSON file as a pandas DataFrame
df_reviews = pd.read_json('../data/processed/neurips-2023.json')
# Display the DataFrame as the cell output (no head() is applied)
df_reviews

In [None]:
# Define labels used by the HEDGEhog model
labels = ["C", "D", "E", "I", "N"]

# Set up model arguments
model_args = NERArgs()
model_args.labels_list = labels
# Suppress progress output and keep prediction in a single process
model_args.silent = True
model_args.use_multiprocessing = False

# Initialize model
# NOTE: this rebinds the notebook-level `model` name that earlier cells
# assigned to other models (question detection, SPECTER).
model = NERModel(
    model_type="bert",
    model_name="jeniakim/hedgehog",
    args=model_args,
    use_cuda=torch.cuda.is_available()
)

# Function to count each label type
def count_hedge_labels(text):
    """Tally the predicted label of every token in *text*.

    Runs the notebook-level ``model`` on the single text and reads the label
    from each per-token mapping in the first prediction.

    Returns:
        A dict with one entry per name in ``labels``; labels that were never
        predicted map to 0.
    """
    predictions, _ = model.predict([text])

    tally = Counter()
    for token in predictions[0]:
        # Each token entry is a mapping; its first value is the label.
        tally[list(token.values())[0]] += 1

    return {label: tally.get(label, 0) for label in labels}


# Brackets [] raise errors in the hedge function, so they are prefixed with a
# backslash. The escaped text goes into a temporary Series: overwriting
# 'total_review' wrote the backslashes into the saved JSON (and into every
# later consumer of the text) and escaped them again on each re-run.
escaped_reviews = df_reviews['total_review'].apply(
    lambda x: re.sub(r'([\[\]])', r'\\\1', x)
)

# Apply count_hedge_labels to the escaped review text
tqdm.pandas(desc="Counting Hedge Labels")
hedge_counts = escaped_reviews.progress_apply(count_hedge_labels)

# Convert the dictionary output into separate columns
for label in labels:
    df_reviews[f"hedge_{label}"] = hedge_counts.apply(lambda x: x.get(label, 0))

df_reviews

In [30]:
# Persist the current DataFrame, overwriting the processed reviews JSON file
out = '../data/processed/neurips-2023.json'
df_reviews.to_json(out, orient='records', indent=4)

# Slice 1000

In [None]:
# Processed review-level table produced by the cells above
input_file = '../data/processed/neurips-2023.json'

# Load it and display the DataFrame as the cell output
df = pd.read_json(input_file)
df

In [None]:
# Randomly select up to 1000 unique submission_number values.
# A dedicated, seeded generator makes the sample reproducible across runs,
# and capping the size avoids the ValueError random.sample raises when fewer
# than 1000 unique submissions are available.
sample_size = 1000
sample_seed = 42
unique_submission_numbers = df['submission_number'].unique().tolist()
selected_submission_numbers = random.Random(sample_seed).sample(
    unique_submission_numbers,
    min(sample_size, len(unique_submission_numbers)),
)

# Filter rows with the selected submission_number values (all reviews of a
# selected submission are kept)
df_1000 = df[df['submission_number'].isin(selected_submission_numbers)]

# Display the new dataframe
df_1000

In [None]:
# Write the sampled reviews as a JSON array of row objects (no indentation)
df_1000.to_json('../data/processed/neurips-2023-1000-papers.json', orient='records')

# LLM

In [None]:
# Load the sampled reviews.
input_file = '../data/processed/neurips-2023-1000-papers.json'
df = pd.read_json(input_file)

# Columns that hold the LLM's assessment of each review.
llm_fields = [
    "llm_Comprehensiveness",
    "llm_Vagueness",
    "llm_Objectivity",
    "llm_Fairness",
    "llm_Actionability",
    "llm_Constructiveness",
    "llm_Relevance Alignment",
    "llm_Clarity and Readability",
    "llm_Usage of Technical Terms",
    "llm_Factuality",
    "llm_Overall Quality",
    "llm_overall_score_100",
    "llm_Sentiment Polarity",
    "llm_Politeness",
]

# Create any assessment column that the loaded file does not have yet.
missing_fields = [name for name in llm_fields if name not in df.columns]
for missing_field in missing_fields:
    df[missing_field] = pd.NA

# Captures the JSON object between the <review_assessment> tags of a response.
pattern = re.compile(r"<review_assessment>\s*(\{.*?\})\s*</review_assessment>", re.DOTALL)

# Define prompt template.
# Placeholders: {title}, {abstract}, {review_text}; literal braces of the JSON
# skeleton are doubled for str.format. Each criterion appears exactly once in
# the skeleton ("Relevance Alignment" used to be listed twice, asking the
# model for a JSON object with a duplicate key).
template = """# REVIEW-QUALITY JUDGE

## 0 — ROLE

You are **ReviewInspector-LLM**, a rigorous, impartial meta-reviewer.
Your goal is to assess the quality of a single peer-review against a predefined set of criteria and to provide precise, structured evaluations.

## 1 — INPUTS

Title: {title}
Abstract: {abstract}
Review: {review_text}

## 2 — EVALUATION CRITERIA

Return **only** the scale value or label at right (no rationale text).

| #  | Criterion                    | Allowed scale / label                       | Description                                                                |
| -- | ---------------------------- | ------------------------------------------- | -------------------------------------------------------------------------- |
| 1  | **Comprehensiveness**        | integer **0-5**                             | Extent to which the review covers all key aspects of the paper.            |
| 2  | **Usage of Technical Terms** | integer **0-5**                             | Appropriateness and frequency of domain-specific vocabulary.               |
| 3  | **Factuality**               | **factual / partially factual / unfactual** | Accuracy of the statements made in the review.                             |
| 4  | **Sentiment Polarity**       | **negative / neutral / positive**           | Overall sentiment conveyed by the reviewer.                                |
| 5  | **Politeness**               | **polite / neutral / impolite**             | Tone and manner of the review language.                                    |
| 6  | **Vagueness**                | **none / low / moderate / high / extreme**  | Degree of ambiguity or lack of specificity in the review.                  |
| 7  | **Objectivity**              | integer **0-5**                             | Presence of unbiased, evidence-based commentary.                           |
| 8  | **Fairness**                 | integer **0-5**                             | Perceived impartiality and balance in judgments.                           |
| 9  | **Actionability**            | integer **0-5**                             | Helpfulness of the review in suggesting clear next steps.                  |
| 10 | **Constructiveness**         | integer **0-5**                             | Degree to which the review offers improvements rather than just criticism. |
| 11 | **Relevance Alignment**      | integer **0-5**                             | How well the review relates to the content and scope of the paper.         |
| 12 | **Clarity and Readability**  | integer **0-5**                             | Ease of understanding the review, including grammar and structure.         |
| 13 | **Overall Quality**          | integer **0-100**                           | Holistic evaluation of the review's usefulness and professionalism.        |

## 3 — SCORING GUIDELINES

For 0-5 scales:

* 5 = Outstanding
* 4 = Strong
* 3 = Adequate
* 2 = Weak
* 1 = Very weak
* 0 = Absent/irrelevant

## 4 — ANALYSIS & COMPUTATION (silent)

1. Read and understand the review in the context of the paper title and abstract.
2. Extract quantitative and qualitative signals (e.g., term usage, factual consistency, tone, clarity).
3. Map observations to the corresponding scoring scales.

## 5 — OUTPUT FORMAT (strict)  
Return **exactly one** JSON block wrapped in the tag below — **no comments or extra text**.

```json
<review_assessment>
{{
  "paper_title": "{title}",
  "criteria": {{
    "Comprehensiveness":       ...,
    "Usage of Technical Terms":   ...,
    "Factuality":    ...,
    "Sentiment Polarity":      ...,
    "Politeness":  ...,
    "Vagueness":          ...,
    "Objectivity":             ...,
    "Fairness":         ...,
    "Actionability":        ...,
    "Constructiveness":    ...,
    "Relevance Alignment":    ...,
    "Clarity and Readability":    ...,
    "Overall Quality":     ...
  }},
  "overall_score_100": ...
}}
</review_assessment>
```
"""

# Display the loaded sample as the cell output
df

In [None]:
# Generation settings for the local LLM
temperature = 0
seed = 42
llm_name = "qwen3:8b"  # llama3:8b, qwen3:8b
max_attempts = 5

# Process each row
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Scoring with LLM"):
    # Skip if all llm fields are already filled
    if all(pd.notna(row.get(field, pd.NA)) for field in llm_fields):
        continue

    prompt = template.format(
        title=row['submission_title'],
        abstract=row['submission_abstract'],
        review_text=row['total_review']
    )

    # NOTE(review): with temperature 0 and a fixed seed a retry presumably
    # reproduces the same response, so retries mainly help with transient
    # call failures — confirm whether malformed outputs need another strategy.
    for attempt in range(max_attempts):
        try:
            response = chat(llm_name, messages=[{'role': 'user', 'content': prompt}], options={'temperature': temperature, 'seed': seed})
            content = response['message']['content']
            match = pattern.search(content)
            if not match:
                raise ValueError("No JSON block found")

            parsed = json.loads(match.group(1))
            print(parsed["overall_score_100"])
            for key, val in parsed["criteria"].items():
                column = f"llm_{key}"
                # Only known assessment columns are written. An unexpected
                # key would otherwise make df.at create a stray column.
                if column not in llm_fields:
                    print(f"⚠️ Ignoring unexpected criterion {key!r} at row {idx}")
                    continue
                df.at[idx, column] = val
            df.at[idx, "llm_overall_score_100"] = parsed["overall_score_100"]

            break

        except Exception as e:
            # Best effort: report the failure and try again.
            print(f"❌ Error at row {idx}, attempt {attempt + 1}: {e}")
    else:
        # Reached only when no attempt succeeded (the loop never hit `break`).
        print(f"⚠️ Giving up on row {idx} after {max_attempts} attempts")

In [None]:
# Write the scored sample to a file named after the model.
# NOTE(review): `llm_name` is embedded verbatim; a name containing ':' is
# presumably not a valid file name on every platform — confirm if portability matters.
df.to_json(f'../data/processed/neurips-2023-1000-{llm_name}.json', orient='records')