In [None]:
import json
import csv
from datetime import datetime

# Load the dataset. The encoding is passed explicitly (as every other open() in this
# notebook does) so the read does not depend on the platform's default locale encoding.
with open('../data/raw/f1000research.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Display the first record to inspect the structure of the raw data.
data[0]

In [1]:
# Every enrichment cell reads the processed CSV and writes it back in place,
# so the input and output paths are deliberately the same file.
input_file = output_file = "../data/processed/f1000research.csv"

In [None]:
import json
import csv
from datetime import datetime

# Date layouts tried in order. The first entry of each tuple is the layout this cell
# originally expected; the second is a fallback for the other year width, so a
# two-digit vs four-digit year no longer silently turns days_to_submit into None.
PAPER_DATE_FORMATS = ("%d %b %y", "%d %b %Y")
REVIEW_DATE_FORMATS = ("%d %b %Y", "%d %b %y")

# Column order of the processed CSV. Kept explicit so the header can still be
# written when no review rows were produced.
REVIEW_COLUMNS = [
    "reviewer",
    "review_date",
    "review_suggestion",
    "length_words",
    "title",
    "abstract",
    "days_to_submit",
    "review_text",
]


def _get_text(mapping, key, default=""):
    """Return ``mapping[key]``; a missing key or an explicit ``None`` yields ``default``."""
    value = mapping.get(key, default)
    return default if value is None else value


def _one_line(text):
    """Replace line breaks in ``text`` with spaces and strip surrounding whitespace."""
    return text.replace("\n", " ").replace("\r", " ").strip()


def _parse_date(text, formats):
    """Parse ``text`` with the first layout in ``formats`` that matches, else return ``None``."""
    for fmt in formats:
        try:
            return datetime.strptime(text, fmt)
        except (TypeError, ValueError):
            continue
    return None


rows = []

for entry in data:
    if len(entry) != 1:
        continue  # skip multi-version papers

    paper_obj = entry[0]
    paper_info = paper_obj.get("paper") or {}
    reviews = paper_obj.get("reviews") or []
    paper_date = _parse_date(_get_text(paper_obj, "date").strip(), PAPER_DATE_FORMATS)

    title = _one_line(_get_text(paper_info, "title"))
    abstract = _one_line(_get_text(paper_info, "abstract"))

    for review in reviews:
        review_date_str = _get_text(review, "date").strip()
        review_text = _one_line(_get_text(review, "report"))
        review_date = _parse_date(review_date_str, REVIEW_DATE_FORMATS)

        # The delay is only defined when both dates could be parsed.
        if review_date is not None and paper_date is not None:
            days_to_submit = (review_date - paper_date).days
        else:
            days_to_submit = None

        rows.append({
            "reviewer": _get_text(review, "name", "Anonymous").strip(),
            "review_date": review_date_str,
            "review_suggestion": _get_text(review, "suggestion").strip(),
            "length_words": len(review_text.split()),
            "title": title,
            "abstract": abstract,
            "days_to_submit": days_to_submit,
            "review_text": review_text,
        })

# Save the cleaned reviews to a new CSV file
with open(output_file, mode='w', newline='', encoding='utf-8', errors='ignore') as f:
    writer = csv.DictWriter(f, fieldnames=REVIEW_COLUMNS, quoting=csv.QUOTE_ALL)
    writer.writeheader()
    writer.writerows(rows)

print(f"✅ Cleaned and saved {len(rows)} reviews to {output_file}")


In [None]:
!pip install taaled pylats spacy
# English models
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_trf

# Spanish models (used as fallback)
!python -m spacy download es_core_news_sm
!python -m spacy download es_dep_news_trf


In [None]:
import csv
from taaled import ld
from pylats import lats
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

# --- Load and prepare ---
# The header is taken from the CSV itself rather than from the first data row, so a
# file that contains only a header (or nothing at all) no longer raises IndexError.
with open(input_file, mode='r', encoding='utf-8', errors='ignore', newline='') as f:
    csv_reader = csv.DictReader(f)
    fieldnames = list(csv_reader.fieldnames or [])
    reader = list(csv_reader)

if "mattr" not in fieldnames:
    fieldnames.append("mattr")
if "mattr_reason" in fieldnames:
    fieldnames.remove("mattr_reason")

params = lats.ld_params_en  # Cache once

def compute_mattr(row):
    """Add a ``"mattr"`` entry to ``row`` and return the same dict.

    The review text is normalised with ``lats.Normalize`` and the ``mattr``
    attribute of ``ld.lexdiv`` is stored as a string with four decimals.
    If anything in that computation raises, ``"mattr"`` is set to ``""``.
    Any ``"mattr_reason"`` key is removed from ``row``.
    """
    text = row.get("review_text", "").strip()
    try:
        tokens = lats.Normalize(text, params).toks
        mattr_value = f"{ld.lexdiv(tokens).mattr:.4f}"
    except Exception:
        # Best effort: leave the cell empty when the score cannot be computed.
        mattr_value = ""
    row["mattr"] = mattr_value
    row.pop("mattr_reason", None)
    return row

# --- Parallel execution ---
# executor.map preserves input order, so output_rows lines up with reader.
with ThreadPoolExecutor(max_workers=8) as executor:
    output_rows = list(tqdm(executor.map(compute_mattr, reader), total=len(reader), desc="Parallel MATTR"))

# --- Save back to CSV ---
with open(input_file, mode='w', newline='', encoding='utf-8', errors='ignore') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
    writer.writeheader()
    writer.writerows(output_rows)

# Report the file that was actually written instead of a hard-coded, unrelated name.
print(f"✅ Parallel MATTR values saved to {input_file}")


In [None]:
!pip install transformers torch nltk

###########################
# Apple silicon support
# Uninstall current PyTorch version (if any)
# !pip uninstall torch -y

# Install PyTorch with MPS (Metal Performance Shaders) support
# !pip install torch==2.1.2 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
###########################

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
import csv
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
import nltk

# Ensure NLTK punkt tokenizer is available
nltk.download('punkt', quiet=True)

# Load tokenizer and model
QUESTION_MODEL_NAME = "shahrukhx01/bert-mini-finetune-question-detection"
tokenizer = AutoTokenizer.from_pretrained(QUESTION_MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(QUESTION_MODEL_NAME)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Load review rows. The header comes from the CSV itself so an empty file does not
# raise IndexError, and the file is closed before the (slow) inference loop starts.
with open(input_file, mode='r', encoding='utf-8', errors='ignore', newline='') as f:
    csv_reader = csv.DictReader(f)
    fieldnames = list(csv_reader.fieldnames or [])
    reader = list(csv_reader)

if "question_count" not in fieldnames:
    fieldnames.append("question_count")

output_rows = []
failed_rows = 0  # reviews whose count could not be computed (cell left empty)

for row in tqdm(reader, desc="Detecting Questions"):
    review_text = row.get("review_text") or ""
    question_count = 0

    try:
        # One forward pass per sentence; gradients are never needed here.
        with torch.no_grad():
            for sent in sent_tokenize(review_text):
                inputs = tokenizer(
                    sent,
                    return_tensors="pt",
                    truncation=True,
                    max_length=64,
                    padding=True
                ).to(device)

                predicted = torch.argmax(model(**inputs).logits, dim=1).item()

                # Label 0 = question
                if predicted == 0:
                    question_count += 1
    except Exception:
        # Best effort: keep the row, leave the cell empty, and count the failure
        # so it is reported below instead of disappearing silently.
        question_count = ""
        failed_rows += 1

    row["question_count"] = question_count
    output_rows.append(row)

# Save updated CSV
with open(input_file, mode='w', newline='', encoding='utf-8', errors='ignore') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
    writer.writeheader()
    writer.writerows(output_rows)

if failed_rows:
    print(f"⚠️ Question detection failed for {failed_rows} of {len(reader)} reviews")
print(f"✅ Questions counted and saved in {input_file}")


In [None]:
import csv
import re
from tqdm import tqdm

# --- Citation counting logic ---
def count_citations(text):
    """Return the number of non-overlapping citation-like matches in ``text``.

    The alternatives are tried in the order listed below at each position.
    """
    numeric_brackets = r'\[\d+(?:,\s*\d+)*\]'            # [1], [1, 2, 3]
    author_et_al = r'\([A-Za-z]+ et al\.,\s*\d{4}\)'      # (Smith et al., 2020)
    bare_year = r'\(\d{4}[a-z]?\)'                        # (2020), (2020a)
    author_year_key = r'\[[A-Za-z]+\d{4}[a-z]?\]'         # [Smith2020], [Johnson2021a]
    identifiers = r'\b(?:doi:|arxiv:|https?://[^\s]+)'    # DOI, arXiv, URLs

    combined = '|'.join(
        (numeric_brackets, author_et_al, bare_year, author_year_key, identifiers)
    )
    return sum(1 for _ in re.finditer(combined, text))

# --- Load CSV and apply ---
# The header comes from the CSV itself so an empty file does not raise IndexError.
with open(input_file, mode='r', encoding='utf-8', errors='ignore', newline='') as f:
    csv_reader = csv.DictReader(f)
    fieldnames = list(csv_reader.fieldnames or [])
    reader = list(csv_reader)

# Update for citation_count
if "citation_count" not in fieldnames:
    fieldnames.append("citation_count")
if "has_citation" in fieldnames:
    fieldnames.remove("has_citation")  # Remove old 'has_citation' if needed

output_rows = []
for row in tqdm(reader, desc="Counting Citations"):
    row["citation_count"] = count_citations(row.get("review_text") or "")
    # The column was dropped from the header above, so it must be dropped from each
    # row as well: DictWriter raises ValueError for keys that are not in fieldnames.
    row.pop("has_citation", None)
    output_rows.append(row)

# --- Save updated CSV ---
with open(input_file, mode='w', newline='', encoding='utf-8', errors='ignore') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
    writer.writeheader()
    writer.writerows(output_rows)

print(f"✅ Citation counts added to {input_file}")


In [None]:
import csv

# Summarise how many reviews contain at least one detected citation.
total = 0
with_citations = 0

with open(output_file, mode='r', encoding='utf-8', errors='ignore', newline='') as f:
    for row in csv.DictReader(f):
        total += 1
        try:
            # A review counts as "with citations" when its count is greater than zero
            # (the previous test, == "0", selected exactly the opposite set).
            has_citation = float(row.get("citation_count") or 0) > 0
        except ValueError:
            has_citation = False  # unparsable cell: treat as no citation
        if has_citation:
            with_citations += 1

# Guard against an empty file, which would otherwise raise ZeroDivisionError.
percentage = (with_citations / total * 100) if total else 0.0

print(f"📄 Total reviews: {total}")
print(f"🔍 Reviews with citations: {with_citations}")
print(f"📊 Percentage: {percentage:.2f}%")

In [None]:
!pip install textblob
!python -m textblob.download_corpora

In [None]:
import csv
from textblob import TextBlob
from tqdm import tqdm

# Read the file. The header comes from the CSV itself so an empty file does not
# raise IndexError, and the file is closed before processing starts.
with open(input_file, mode='r', encoding='utf-8', errors='ignore', newline='') as f:
    csv_reader = csv.DictReader(f)
    fieldnames = list(csv_reader.fieldnames or [])
    reader = list(csv_reader)

# Add new column if not already there
if "sentiment_polarity" not in fieldnames:
    fieldnames.append("sentiment_polarity")

output_rows = []
failed_rows = 0  # reviews whose polarity could not be computed (cell left empty)

for row in tqdm(reader, desc="Analyzing Sentiment"):
    review_text = (row.get("review_text") or "").strip()
    try:
        sentiment = TextBlob(review_text).sentiment.polarity
    except Exception:
        # Best effort: keep the row, leave the cell empty, and count the failure.
        sentiment = ""
        failed_rows += 1

    row["sentiment_polarity"] = sentiment
    output_rows.append(row)

# Write updated CSV
with open(input_file, mode='w', newline='', encoding='utf-8', errors='ignore') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
    writer.writeheader()
    writer.writerows(output_rows)

if failed_rows:
    print(f"⚠️ Sentiment analysis failed for {failed_rows} of {len(reader)} reviews")
print(f"✅ Sentiment polarity added to {input_file}")

In [None]:
!pip install convokit
!python -m spacy download en_core_web_sm

In [None]:
import csv
from tqdm import tqdm
from convokit import Corpus, download, TextParser, PolitenessStrategies, Classifier, Utterance, Speaker

# Step 1: Load training corpus
print("📥 Downloading training corpus...")
train_corpus = Corpus(filename=download('wiki-politeness-annotated'))

# Step 2: Load review data and convert to Utterances with dummy speakers
# The header comes from the CSV itself so an empty file does not raise IndexError.
with open(input_file, mode='r', encoding='utf-8', errors='ignore', newline='') as f:
    csv_reader = csv.DictReader(f)
    fieldnames = list(csv_reader.fieldnames or [])
    reader = list(csv_reader)

if "politeness_score" not in fieldnames:
    fieldnames.append("politeness_score")

review_utterances = []
# The progress total follows the data instead of a hard-coded row count.
for idx, row in tqdm(enumerate(reader), desc="🔧 Preparing Utterances", total=len(reader)):
    review_text = (row.get("review_text") or "").strip()
    if review_text:
        # The utterance id is the row position, which is how the score is mapped
        # back onto its CSV row in step 8.
        review_utterances.append(
            Utterance(id=str(idx), text=review_text, speaker=Speaker(id=f"reviewer_{idx}"))
        )

# Step 3: Build test corpus
print("📦 Building test corpus...")
test_corpus = Corpus(utterances=review_utterances)

# Step 4: Parse
print("🧠 Parsing utterances...")
parser = TextParser()
parser.transform(train_corpus)
parser.transform(test_corpus)

# Step 5: Extract politeness strategies
print("✨ Extracting politeness strategies...")
ps = PolitenessStrategies()
ps.transform(train_corpus)
ps.transform(test_corpus)

# Step 6: Train classifier
print("🎓 Training classifier...")
clf = Classifier(obj_type='utterance', pred_feats=['politeness_strategies'],
                 labeller=lambda utt: utt.meta.get("Binary") == 1)
clf.fit(train_corpus)
clf.transform(test_corpus)

# Step 7: Summarize results
print("📈 Summarizing scores...")
results = clf.summarize(test_corpus)

# Step 8: Merge back to CSV rows
# Iterate over every CSV row, not only over the utterances: rows whose review text
# is empty never became utterances and used to vanish from the rewritten file.
print("🧾 Merging scores into CSV...")
output_rows = []
for idx, row in tqdm(enumerate(reader), desc="🔗 Assigning Scores", total=len(reader)):
    try:
        row["politeness_score"] = round(results.loc[str(idx), "pred_score"], 4)
    except KeyError:
        # No prediction for this row (e.g. empty review text): leave the cell empty.
        row["politeness_score"] = ""
    output_rows.append(row)

# Step 9: Save
print(f"💾 Saving to {input_file}...")
with open(input_file, mode='w', newline='', encoding='utf-8', errors='ignore') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
    writer.writeheader()
    writer.writerows(output_rows)

print("✅ All done! Politeness scores are now in your CSV.")


In [None]:
import csv
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import torch.nn.functional as F

# Load SPECTER model
model_name = "allenai/specter"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def _first_token_embedding(text):
    """Encode ``text`` (truncated to 512 tokens) and return the first-token ([CLS]) hidden state."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        return model(**inputs).last_hidden_state[:, 0, :]


# The header comes from the CSV itself so an empty file does not raise IndexError,
# and the file is closed before the (slow) embedding loop starts.
with open(input_file, mode='r', encoding='utf-8', errors='ignore', newline='') as f:
    csv_reader = csv.DictReader(f)
    fieldnames = list(csv_reader.fieldnames or [])
    reader = list(csv_reader)

if "similarity_score" not in fieldnames:
    fieldnames.append("similarity_score")

output_rows = []
failed_rows = 0  # reviews whose score could not be computed (cell left empty)

for row in tqdm(reader, desc="Computing Relevance Score"):
    try:
        review_text = row.get("review_text", "").strip()
        title = row.get("title", "").strip()
        abstract = row.get("abstract", "").strip()

        # The paper is represented by its title and abstract joined with a space.
        doc_emb = _first_token_embedding(f"{title} {abstract}")
        review_emb = _first_token_embedding(review_text)

        # Cosine similarity
        row["similarity_score"] = F.cosine_similarity(doc_emb, review_emb).item()
    except Exception:
        # Best effort: keep the row, leave the cell empty, and count the failure.
        row["similarity_score"] = ""
        failed_rows += 1

    output_rows.append(row)

# Save updated CSV
with open(input_file, mode='w', newline='', encoding='utf-8', errors='ignore') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
    writer.writeheader()
    writer.writerows(output_rows)

if failed_rows:
    print(f"⚠️ Similarity scoring failed for {failed_rows} of {len(reader)} reviews")
print(f"✅ Relevance scores added using title + abstract in {input_file}")


In [None]:
import pandas as pd
import textstat
import os
from tqdm import tqdm


# Register the pandas progress_apply integration first, labelled for this stage.
tqdm.pandas(desc="Scoring Readability")

# Read the processed reviews into a DataFrame.
df = pd.read_csv(input_file)

# Define the readability scoring function
def readability_scores(text):
    """Return a dict of five textstat readability metrics for ``text``.

    Keys: ``flesch_reading_ease``, ``flesch_kincaid_grade``, ``gunning_fog``,
    ``smog_index`` and ``automated_readability_index``.

    Every value is ``None`` when ``text`` is not a string (for example a NaN
    cell read by pandas) or when textstat raises while scoring it.
    KeyboardInterrupt and SystemExit are not swallowed, so a long run can
    still be interrupted.
    """
    metric_names = (
        "flesch_reading_ease",
        "flesch_kincaid_grade",
        "gunning_fog",
        "smog_index",
        "automated_readability_index",
    )
    if not isinstance(text, str):
        return dict.fromkeys(metric_names)
    try:
        return {name: getattr(textstat, name)(text) for name in metric_names}
    except Exception:
        return dict.fromkeys(metric_names)

# Apply function with progress bar
readability_results = df["review_text"].progress_apply(readability_scores)
readability_df = pd.DataFrame(readability_results.tolist(), index=df.index)

# Merge new columns. Columns left over from a previous run of this cell are dropped
# first; otherwise re-running it would append a second copy of every metric.
df = pd.concat(
    [df.drop(columns=readability_df.columns, errors="ignore"), readability_df],
    axis=1,
)

# Save to file
df.to_csv(output_file, index=False)
print(f"Saved enriched file to: {output_file}")


In [None]:
from simpletransformers.ner import NERModel, NERArgs
import pandas as pd
from tqdm import tqdm
import torch
from collections import Counter

# Tag set passed to the HEDGEhog model as its label list.
labels = list("CDEIN")

# Model arguments: fixed label list, no progress output, no multiprocessing.
model_args = NERArgs()
model_args.use_multiprocessing = False
model_args.silent = True
model_args.labels_list = labels

# Build the token classifier, on CUDA when torch reports it as available.
cuda_available = torch.cuda.is_available()
model = NERModel(
    model_type="bert",
    model_name="jeniakim/hedgehog",
    args=model_args,
    use_cuda=cuda_available,
)

# Read the processed reviews into a DataFrame.
df = pd.read_csv(input_file)

# Function to count each label type
def count_hedge_labels(text):
    """Count how many tokens of ``text`` the model assigns to each label in ``labels``.

    Returns a dict with one entry per label (zero when the label is not predicted).
    Non-string input (for example a NaN cell read by pandas) and blank text yield
    all zeros without calling the model.
    """
    if not isinstance(text, str) or not text.strip():
        return {label: 0 for label in labels}

    predictions, _ = model.predict([text])
    # predictions holds one entry per input text; each entry is a list of
    # single-item {token: label} dicts.
    tokens = predictions[0] if predictions else []
    counts = Counter(next(iter(token.values())) for token in tokens)
    return {label: counts.get(label, 0) for label in labels}

# Apply across review_text
tqdm.pandas(desc="Counting Hedge Labels")
hedge_counts = df["review_text"].progress_apply(count_hedge_labels)

# Convert counts into separate columns and join with df
hedge_df = pd.DataFrame(hedge_counts.tolist())
hedge_df.columns = [f"hedge_{label}" for label in hedge_df.columns]

# Columns left over from a previous run of this cell are dropped first; otherwise
# re-running it would append a second copy of every hedge_* column.
df = pd.concat(
    [
        df.drop(columns=hedge_df.columns, errors="ignore").reset_index(drop=True),
        hedge_df.reset_index(drop=True),
    ],
    axis=1,
)

# Save updated CSV
df.to_csv(output_file, index=False)
print("✅ Hedge label counts saved to:", output_file)


In [2]:
import pandas as pd
import json
import re
import csv
import time
from tqdm import tqdm
from ollama import chat

# Load data
df = pd.read_csv(input_file)
llm_fields = [
    "llm_comprehensiveness", "llm_technical_terms", "llm_factuality",
    "llm_sentiment_polarity", "llm_politeness", "llm_vagueness",
    "llm_objectivity", "llm_fairness", "llm_actionability",
    "llm_constructiveness", "llm_relevance_alignment",
    "llm_clarity_readability", "llm_overall_score_100"
]

# Check for missing fields and add them if not present
for field in llm_fields:
    if field not in df.columns:
        df[field] = pd.NA

# An llm_* column that is still entirely empty is read back from the CSV as float64.
# These columns receive both integers and text labels (e.g. "factual", "polite"),
# so they are held as object dtype to make every later assignment valid.
df = df.astype({field: object for field in llm_fields})

# Pattern to extract JSON block
pattern = re.compile(r"<review_assessment>\s*(\{.*?\})\s*</review_assessment>", re.DOTALL)

# Prompt template for the LLM judge.
# It is filled in with str.format(title=..., abstract=..., review_text=...), so the
# single-brace placeholders are substituted and the doubled braces ({{ and }}) come
# out as literal JSON braces in the final prompt.
# The model is asked to wrap its JSON answer in <review_assessment> tags.
template = """# REVIEW-QUALITY JUDGE

## 0 — ROLE

You are **ReviewInspector-LLM**, a rigorous, impartial meta-reviewer.
Your goal is to assess the quality of a single peer-review against a predefined set of criteria and to provide precise, structured evaluations.

## 1 — INPUTS

Title: {title}
Abstract: {abstract}
Review: {review_text}

## 2 — EVALUATION CRITERIA

Return **only** the scale value or label at right (no rationale text).

| #  | Criterion                    | Allowed scale / label                       | Description                                                                |
| -- | ---------------------------- | ------------------------------------------- | -------------------------------------------------------------------------- |
| 1  | **Comprehensiveness**        | integer **0-5**                             | Extent to which the review covers all key aspects of the paper.            |
| 2  | **Usage of Technical Terms** | integer **0-5**                             | Appropriateness and frequency of domain-specific vocabulary.               |
| 3  | **Factuality**               | **factual / partially factual / unfactual** | Accuracy of the statements made in the review.                             |
| 4  | **Sentiment Polarity**       | **negative / neutral / positive**           | Overall sentiment conveyed by the reviewer.                                |
| 5  | **Politeness**               | **polite / neutral / impolite**             | Tone and manner of the review language.                                    |
| 6  | **Vagueness**                | **none / low / moderate / high / extreme**  | Degree of ambiguity or lack of specificity in the review.                  |
| 7  | **Objectivity**              | integer **0-5**                             | Presence of unbiased, evidence-based commentary.                           |
| 8  | **Fairness**                 | integer **0-5**                             | Perceived impartiality and balance in judgments.                           |
| 9  | **Actionability**            | integer **0-5**                             | Helpfulness of the review in suggesting clear next steps.                  |
| 10 | **Constructiveness**         | integer **0-5**                             | Degree to which the review offers improvements rather than just criticism. |
| 11 | **Relevance Alignment**      | integer **0-5**                             | How well the review relates to the content and scope of the paper.         |
| 12 | **Clarity and Readability**  | integer **0-5**                             | Ease of understanding the review, including grammar and structure.         |
| 13 | **Overall Quality**          | integer **0-100**                           | Holistic evaluation of the review's usefulness and professionalism.        |

## 3 — SCORING GUIDELINES

For 0-5 scales:

* 5 = Outstanding
* 4 = Strong
* 3 = Adequate
* 2 = Weak
* 1 = Very weak
* 0 = Absent/irrelevant

## 4 — ANALYSIS & COMPUTATION (silent)

1. Read and understand the review in the context of the paper title and abstract.
2. Extract quantitative and qualitative signals (e.g., term usage, factual consistency, tone, clarity).
3. Map observations to the corresponding scoring scales.

## 5 — OUTPUT FORMAT (strict)  
Return **exactly one** JSON block wrapped in the tag below — **no comments or extra text**.

```json
<review_assessment>
{{
  "paper_title": "{title}",
  "criteria": {{
    "comprehensiveness":       ...,
    "technical_terms":         ...,
    "factuality":              ...,
    "sentiment_polarity":      ...,
    "politeness":              ...,
    "vagueness":               ...,
    "objectivity":             ...,
    "fairness":                ...,
    "actionability":           ...,
    "constructiveness":        ...,
    "relevance_alignment":     ...,
    "clarity_readability":     ...,
    "overall_quality":         ...
  }},
  "overall_score_100": ...
}}
</review_assessment>
```
"""

In [3]:
MAX_ATTEMPTS = 1  # LLM calls per review before it is left for a later run


def _prompt_value(value):
    """Render a DataFrame cell for the prompt; missing (NaN/NA) cells become ''."""
    return "" if pd.isna(value) else str(value)


for idx, row in tqdm(df.iterrows(), total=len(df), desc="Scoring with LLM"):
    # Skip if all llm fields are already filled (this makes the cell resumable).
    if all(pd.notna(row.get(field, pd.NA)) for field in llm_fields):
        continue

    # Missing cells are sent as empty text instead of the literal string "nan".
    prompt = template.format(
        title=_prompt_value(row['title']),
        abstract=_prompt_value(row['abstract']),
        review_text=_prompt_value(row['review_text'])
    )

    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = chat("llama3", messages=[{'role': 'user', 'content': prompt}], options={"temperature": 0.0, "seed": 42})
            content = response['message']['content']
            match = pattern.search(content)
            if not match:
                raise ValueError("No JSON block found")

            parsed = json.loads(match.group(1))

            # Collect every value before touching df, so a payload that is missing
            # "criteria" or "overall_score_100" cannot leave a half-filled row behind.
            updates = {f"llm_{key}": val for key, val in parsed["criteria"].items()}
            updates["llm_overall_score_100"] = parsed["overall_score_100"]

            for column, value in updates.items():
                df.at[idx, column] = value

            # Save after every successful row so an interrupted run loses nothing.
            df.to_csv(input_file, index=False, quoting=csv.QUOTE_ALL)
            break

        except Exception as e:
            # Best effort: report the failure and move on; the row stays unfilled
            # and is picked up again the next time this cell runs.
            print(f"❌ Error at row {idx}, attempt {attempt}: {e}")


Scoring with LLM:   1%|          | 78/10174 [08:22<16:10:02,  5.76s/it]

❌ Error at row 77, attempt 1: Expecting value: line 6 column 32 (char 230)


Scoring with LLM:   1%|          | 78/10174 [08:23<18:05:41,  6.45s/it]


KeyboardInterrupt: 