In [None]:
import json
import csv
from datetime import datetime

# Load the dataset
with open('../data/raw/semantic-web-journal.json', 'r') as f:
    data = json.load(f)

data[0]

In [2]:
output_file = "../data/processed/semantic-web-journal-analysis.csv"
input_file = output_file

In [None]:
import json
import csv
from datetime import datetime

rows = []

for paper in data:
    paper_id = paper.get("id", "").strip()
    if paper_id.upper() == "UNK" or not paper_id:
        continue

    paper_date_str = paper.get("date", "")
    try:
        paper_date = datetime.strptime(paper_date_str, "%m/%d/%Y")
    except Exception:
        paper_date = None

    for review in paper.get("reviews", []):
        reviewer = review.get("reviewer", "Anonymous").strip()
        review_date_str = review.get("date", "").strip()

        # Clean review text
        review_text = review.get("comment", "")
        review_text = review_text.replace("\n", " ").replace("\r", " ").strip()
        review_suggestion = review.get("suggestion", "")

        length_words = len(review_text.split())

        try:
            review_date = datetime.strptime(review_date_str, "%d/%b/%Y")
            days_to_submit = (review_date - paper_date).days if paper_date else None
        except Exception:
            days_to_submit = None

        rows.append({
            "paper_id": paper_id,
            "reviewer": reviewer,
            "review_date": review_date_str,
            "review_suggestion": review_suggestion,
            "length_words": length_words,
            "days_to_submit": days_to_submit,
            "review_text": review_text
        })

# Save to CSV with proper quoting
with open(output_file, mode='w', newline='', encoding='utf-8', errors='ignore') as f:
    writer = csv.DictWriter(f, fieldnames=rows[0].keys(), quoting=csv.QUOTE_ALL)
    writer.writeheader()
    writer.writerows(rows)

print(f"✅ Cleaned and saved {len(rows)} reviews with full text to review_analysis.csv")



In [None]:
!pip install taaled pylats spacy
# English models
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_trf

# Spanish models (used as fallback)
!python -m spacy download es_core_news_sm
!python -m spacy download es_dep_news_trf


In [None]:
import csv
from taaled import ld
from pylats import lats
from tqdm import tqdm

output_rows = []

# Read rows
with open(input_file, mode='r', encoding='utf-8', errors='ignore') as f:
    reader = list(csv.DictReader(f))
    fieldnames = list(reader[0].keys())

    # Ensure 'mattr' column exists
    if "mattr" not in fieldnames:
        fieldnames.append("mattr")
    # Drop 'mattr_reason' if it exists
    if "mattr_reason" in fieldnames:
        fieldnames.remove("mattr_reason")

    for row in tqdm(reader, desc="Computing MATTR"):
        review_text = row.get("review_text", "").strip()
        mattr_value = ""

        try:
            cleaned = lats.Normalize(review_text, lats.ld_params_en)
            tokens = cleaned.toks
            mattr_value = f"{ld.lexdiv(tokens).mattr:.4f}"
        except Exception as e:
            mattr_value = ""

        row["mattr"] = mattr_value
        # Remove 'mattr_reason' if it exists in the row
        row.pop("mattr_reason", None)
        output_rows.append(row)

# Write updated file
with open(input_file, mode='w', newline='', encoding='utf-8', errors='ignore') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
    writer.writeheader()
    writer.writerows(output_rows)

print("✅ Clean MATTR values saved to review_analysis.csv")


In [None]:
!pip install transformers torch nltk

###########################
# Apple silicon support
# Uninstall current PyTorch version (if any)
# !pip uninstall torch -y

# Install PyTorch with MPS (Metal Performance Shaders) support
# !pip install torch==2.1.2 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
###########################

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
import csv
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
import nltk

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("shahrukhx01/bert-mini-finetune-question-detection")
model = AutoModelForSequenceClassification.from_pretrained("shahrukhx01/bert-mini-finetune-question-detection")
model.eval()

output_rows = []

# Load review rows
with open(input_file, mode='r', encoding='utf-8', errors='ignore') as f:
    reader = list(csv.DictReader(f))
    fieldnames = list(reader[0].keys())
    if "question_count" not in fieldnames:
        fieldnames.append("question_count")

    for row in tqdm(reader, desc="Detecting Questions"):
        review_text = row.get("review_text", "")
        question_count = 0

        try:
            sentences = sent_tokenize(review_text)
            for sent in sentences:
                inputs = tokenizer(
                    sent,
                    return_tensors="pt",
                    truncation=True,
                    max_length=64,
                    padding=True
                )
                with torch.no_grad():
                    outputs = model(**inputs)
                    predicted = torch.argmax(outputs.logits, dim=1).item()

                    # Label 0 = question
                    if predicted == 0:
                        question_count += 1
        except Exception as e:
            question_count = ""

        row["question_count"] = question_count
        output_rows.append(row)

# Save updated CSV
with open(input_file, mode='w', newline='', encoding='utf-8', errors='ignore') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
    writer.writeheader()
    writer.writerows(output_rows)

print("✅ Questions counted and saved in review_analysis.csv")


In [None]:
import csv
import re
from tqdm import tqdm

# --- Citation counting logic ---
def count_citations(text):
    citation_patterns = [
        r'\[\d+(?:,\s*\d+)*\]',                         # [1], [1, 2, 3]
        r'\([A-Za-z]+ et al\.,\s*\d{4}\)',               # (Smith et al., 2020)
        r'\(\d{4}[a-z]?\)',                              # (2020), (2020a)
        r'\[[A-Za-z]+\d{4}[a-z]?\]',                     # [Smith2020], [Johnson2021a]
        r'\b(?:doi:|arxiv:|https?://[^\s]+)',             # DOI, arXiv, URLs
    ]
    pattern = '|'.join(citation_patterns)
    matches = re.findall(pattern, text)
    return len(matches)

# --- Load CSV and apply ---
output_rows = []

with open(input_file, mode='r', encoding='utf-8', errors='ignore') as f:
    reader = list(csv.DictReader(f))
    fieldnames = list(reader[0].keys())

    # Update for citation_count
    if "citation_count" not in fieldnames:
        fieldnames.append("citation_count")
    if "has_citation" in fieldnames:
        fieldnames.remove("has_citation")  # Remove old 'has_citation' if needed

    for row in tqdm(reader, desc="Counting Citations"):
        review_text = row.get("review_text", "")
        citation_count = count_citations(review_text)
        row["citation_count"] = citation_count
        output_rows.append(row)

# --- Save updated CSV ---
with open(input_file, mode='w', newline='', encoding='utf-8', errors='ignore') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
    writer.writeheader()
    writer.writerows(output_rows)

print("✅ Citation counts added to review_analysis.csv")


In [None]:
import csv

with open(output_file, mode='r', encoding='utf-8', errors='ignore') as f:
    reader = csv.DictReader(f)
    total = 0
    with_citations = 0

    for row in reader:
        total += 1
        if row.get("citation_count") == "2":
            with_citations += 1

print(f"📄 Total reviews: {total}")
print(f"🔍 Reviews with citations: {with_citations}")
print(f"📊 Percentage: {(with_citations / total * 100):.2f}%")

In [None]:
!pip install textblob
!python -m textblob.download_corpora

In [None]:
import csv
from textblob import TextBlob
from tqdm import tqdm

output_rows = []

# Read and process the file
with open(input_file, mode='r', encoding='utf-8', errors='ignore') as f:
    reader = list(csv.DictReader(f))
    fieldnames = list(reader[0].keys())

    # Add new column if not already there
    if "sentiment_polarity" not in fieldnames:
        fieldnames.append("sentiment_polarity")

    for row in tqdm(reader, desc="Analyzing Sentiment"):
        review_text = row.get("review_text", "").strip()
        try:
            blob = TextBlob(review_text)
            sentiment = blob.sentiment.polarity
        except Exception:
            sentiment = ""

        row["sentiment_polarity"] = sentiment
        output_rows.append(row)

# Write updated CSV
with open(input_file, mode='w', newline='', encoding='utf-8', errors='ignore') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
    writer.writeheader()
    writer.writerows(output_rows)

print("✅ Sentiment polarity added to review_analysis.csv")

In [None]:
!pip install convokit
!python -m spacy download en_core_web_sm

In [None]:
import csv
from tqdm import tqdm
from convokit import Corpus, download, TextParser, PolitenessStrategies, Classifier, Utterance, Speaker

# Step 1: Load training corpus
print("📥 Downloading training corpus...")
train_corpus = Corpus(filename=download('wiki-politeness-annotated'))

# Step 2: Load review data and convert to Utterances with dummy speakers
review_utterances = []

with open(input_file, mode='r', encoding='utf-8', errors='ignore') as f:
    reader = list(csv.DictReader(f))
    for idx, row in tqdm(enumerate(reader), desc="🔧 Preparing Utterances", total=1805):  # Adjust total if needed
        review_text = row.get("review_text", "").strip()
        if review_text:
            dummy_speaker = Speaker(id=f"reviewer_{idx}")
            review_utterances.append(
                Utterance(id=str(idx), text=review_text, speaker=dummy_speaker, meta={"orig_row": row})
            )

# Step 3: Build test corpus
print("📦 Building test corpus...")
test_corpus = Corpus(utterances=review_utterances)

# Step 4: Parse
print("🧠 Parsing utterances...")
parser = TextParser()
parser.transform(train_corpus)
parser.transform(test_corpus)

# Step 5: Extract politeness strategies
print("✨ Extracting politeness strategies...")
ps = PolitenessStrategies()
ps.transform(train_corpus)
ps.transform(test_corpus)

# Step 6: Train classifier
print("🎓 Training classifier...")
clf = Classifier(obj_type='utterance', pred_feats=['politeness_strategies'],
                 labeller=lambda utt: utt.meta.get("Binary") == 1)
clf.fit(train_corpus)
clf.transform(test_corpus)

# Step 7: Summarize results
print("📈 Summarizing scores...")
results = clf.summarize(test_corpus)

# Step 8: Merge back to CSV rows
print("🧾 Merging scores into CSV...")
output_rows = []
fieldnames = list(reader[0].keys())
if "politeness_score" not in fieldnames:
    fieldnames.append("politeness_score")

for utt in tqdm(test_corpus.iter_utterances(), desc="🔗 Assigning Scores"):
    row = utt.meta["orig_row"]
    try:
        score = results.loc[utt.id, "pred_score"]
        row["politeness_score"] = round(score, 4)
    except KeyError:
        row["politeness_score"] = ""
    output_rows.append(row)

# Step 9: Save
print("💾 Saving to review_analysis.csv...")
with open(input_file, mode='w', newline='', encoding='utf-8', errors='ignore') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
    writer.writeheader()
    writer.writerows(output_rows)

print("✅ All done! Politeness scores are now in your CSV.")


In [None]:
import csv
import json
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import torch.nn.functional as F

# --- Load SPECTER model ---
model_name = "allenai/specter"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

output_rows = []

with open(input_file, mode='r', encoding='utf-8', errors='ignore') as f:
    reader = list(csv.DictReader(f))
    fieldnames = list(reader[0].keys())

    if "similarity_score" not in fieldnames:
        fieldnames.append("similarity_score")

    for row in tqdm(reader, desc="Computing Relevance Score"):
        review_text = row.get("review_text", "")
        paper_id = row.get("paper_id", "").strip()

        try:
            # Find matching entry in data
            matched_entry = next((entry for entry in data if str(entry.get("id", "")).strip() == paper_id), None)

            if matched_entry:
                title = matched_entry.get("title", "")
                abstract = matched_entry.get("abstract", "")
                doc_text = f"{title} {abstract}"

                # Encode document
                doc_inputs = tokenizer(doc_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
                doc_inputs = {k: v.to(device) for k, v in doc_inputs.items()}
                with torch.no_grad():
                    doc_emb = model(**doc_inputs).last_hidden_state[:, 0, :]  # [CLS]

                # Encode review text
                review_inputs = tokenizer(review_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
                review_inputs = {k: v.to(device) for k, v in review_inputs.items()}
                with torch.no_grad():
                    review_emb = model(**review_inputs).last_hidden_state[:, 0, :]  # [CLS]

                # Cosine similarity
                similarity_score = F.cosine_similarity(doc_emb, review_emb).item()
                row["similarity_score"] = similarity_score

            else:
                row["similarity_score"] = ""

        except Exception as e:
            row["similarity_score"] = ""

        output_rows.append(row)

# --- Save updated CSV ---
with open(input_file, mode='w', newline='', encoding='utf-8', errors='ignore') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
    writer.writeheader()
    writer.writerows(output_rows)

print("✅ Relevance scores added to review_analysis.csv")


In [None]:
import csv
import json
from tqdm import tqdm


output_rows = []

with open(input_file, mode='r', encoding='utf-8', errors='ignore') as f:
    reader = list(csv.DictReader(f))
    original_fieldnames = list(reader[0].keys())

    # Insert title and abstract at positions 5 and 6
    new_fieldnames = original_fieldnames[:5] + ["title", "abstract"] + original_fieldnames[5:]

    for row in tqdm(reader, desc="Adding Title and Abstract (Escaping Newlines)"):
        paper_id = row.get("paper_id", "").strip()

        # Find matching entry
        matched_entry = next((entry for entry in data if str(entry.get("id", "")).strip() == paper_id), None)

        if matched_entry:
            title = matched_entry.get("title", "")
            abstract = matched_entry.get("abstract", "")
        else:
            title = ""
            abstract = ""

        # Escape real newlines in title and abstract
        title = title.replace("\r\n", "\\n").replace("\n", "\\n")
        abstract = abstract.replace("\r\n", "\\n").replace("\n", "\\n")

        # Build new row
        new_row = {}
        for idx, field in enumerate(new_fieldnames):
            if field == "title":
                new_row[field] = title
            elif field == "abstract":
                new_row[field] = abstract
            else:
                # Map original fields
                original_field_idx = idx if idx < 5 else idx - 2  # Adjust because we inserted 2 fields
                if original_field_idx < len(original_fieldnames):
                    original_field = original_fieldnames[original_field_idx]
                    new_row[field] = row.get(original_field, "")

        output_rows.append(new_row)

# --- Save updated CSV ---
with open(input_file, mode='w', newline='', encoding='utf-8', errors='ignore') as f:
    writer = csv.DictWriter(f, fieldnames=new_fieldnames, quoting=csv.QUOTE_ALL)
    writer.writeheader()
    writer.writerows(output_rows)

print("✅ Title and Abstract (with clean \\n) added to review_analysis.csv")


In [None]:
import csv
from tqdm import tqdm
from datetime import datetime

# Helper: parse dates consistently
def parse_date(date_str):
    try:
        return datetime.strptime(date_str, "%d/%b/%Y")
    except Exception:
        return None

output_rows = []

with open(input_file, mode='r', encoding='utf-8', errors='ignore') as f:
    reader = list(csv.DictReader(f))
    rows = list(reader)
    fieldnames = list(rows[0].keys())

    if "num_days_before_deadline" not in fieldnames:
        fieldnames.append("num_days_before_deadline")

    # First: find latest review_date per paper_id
    latest_review_dates = {}

    for row in rows:
        paper_id = row["paper_id"]
        review_date = parse_date(row["review_date"])

        if paper_id and review_date:
            if paper_id not in latest_review_dates:
                latest_review_dates[paper_id] = review_date
            else:
                if review_date > latest_review_dates[paper_id]:
                    latest_review_dates[paper_id] = review_date

    # Second: compute days before deadline for each review
    for row in tqdm(rows, desc="Computing num_days_before_deadline"):
        paper_id = row["paper_id"]
        review_date = parse_date(row["review_date"])
        deadline_date = latest_review_dates.get(paper_id)

        if review_date and deadline_date:
            days_before_deadline = (deadline_date - review_date).days
            row["num_days_before_deadline"] = days_before_deadline
        else:
            row["num_days_before_deadline"] = ""

        output_rows.append(row)

# --- Save updated CSV ---
with open(input_file, mode='w', newline='', encoding='utf-8', errors='ignore') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
    writer.writeheader()
    writer.writerows(output_rows)

print("✅ num_days_before_deadline added to review_analysis.csv")


In [None]:
import pandas as pd
import textstat
import os
from tqdm import tqdm


# Load CSV
df = pd.read_csv(input_file)

# Enable tqdm for pandas apply
tqdm.pandas(desc="Scoring Readability")

# Define the readability scoring function
def readability_scores(text):
    try:
        return {
            "flesch_reading_ease": textstat.flesch_reading_ease(text),
            "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
            "gunning_fog": textstat.gunning_fog(text),
            "smog_index": textstat.smog_index(text),
            "automated_readability_index": textstat.automated_readability_index(text),
        }
    except:
        return {
            "flesch_reading_ease": None,
            "flesch_kincaid_grade": None,
            "gunning_fog": None,
            "smog_index": None,
            "automated_readability_index": None,
        }

# Apply function with progress bar
readability_results = df["review_text"].progress_apply(readability_scores)
readability_df = pd.DataFrame(readability_results.tolist())

# Merge new columns
df = pd.concat([df, readability_df], axis=1)

# Save to file
df.to_csv(output_file, index=False)
print(f"Saved enriched file to: {output_file}")


In [None]:
from simpletransformers.ner import NERModel, NERArgs
import pandas as pd
from tqdm import tqdm
import torch
from collections import Counter

# Define labels used by the HEDGEhog model
labels = ["C", "D", "E", "I", "N"]

# Set up model arguments
model_args = NERArgs()
model_args.labels_list = labels
model_args.silent = True
model_args.use_multiprocessing = False

# Initialize model
model = NERModel(
    model_type="bert",
    model_name="jeniakim/hedgehog",
    args=model_args,
    use_cuda=torch.cuda.is_available()
)

# Load the CSV
df = pd.read_csv(input_file)

# Function to count each label type
def count_hedge_labels(text):
    predictions, _ = model.predict([text])
    token_labels = [list(token.values())[0] for token in predictions[0]]
    counts = Counter(token_labels)
    return {label: counts.get(label, 0) for label in labels}

# Apply across review_text
tqdm.pandas(desc="Counting Hedge Labels")
hedge_counts = df["review_text"].progress_apply(count_hedge_labels)

# Convert counts into separate columns and join with df
hedge_df = pd.DataFrame(hedge_counts.tolist())
hedge_df.columns = [f"hedge_{label}" for label in hedge_df.columns]

df = pd.concat([df.reset_index(drop=True), hedge_df.reset_index(drop=True)], axis=1)

# Save updated CSV
df.to_csv(output_file, index=False)
print("✅ Hedge label counts saved to:", output_file)


In [12]:
import pandas as pd
import json
import re
import csv
import time
from tqdm import tqdm
from ollama import chat

# Load data
df = pd.read_csv(input_file)
llm_fields = [
    "llm_length_effort", "llm_lexical_diversity", "llm_questions_raised",
    "llm_citation_usage", "llm_sentiment_polarity", "llm_politeness", "llm_hedging",
    "llm_specificity", "llm_domain_terms", "llm_relevance_alignment",
    "llm_readability", "llm_overall_quality", "llm_overall_score_100"
]

# Check for missing fields and add them if not present
for field in llm_fields:
    if field not in df.columns:
        df[field] = pd.NA

# Pattern to extract JSON block
pattern = re.compile(r"<review_assessment>\s*(\{.*?\})\s*</review_assessment>", re.DOTALL)

# Define prompt template
template = """# REVIEW-QUALITY JUDGE

## 0 — ROLE  
You are **ReviewInspector-LLM**, a rigorous, impartial meta-reviewer.  
Your goal is to grade the quality of a single peer-review against a rich set of predefined criteria and to provide concise, actionable feedback.

## 1 — INPUTS  
Title : {title}  
Abstract : {abstract}  
Review: {review_text}

## 2 — EVALUATION CRITERIA  
Return **only** the scale value or label at right (no rationale text).

| # | Criterion | Allowed scale / label |
|---|-----------|-----------------------|
| 1 | **Length & Effort**             | integer **0-5** |
| 2 | **Lexical Diversity (TTR)**     | integer **0-5** |
| 3 | **Number of Questions Raised**  | non-negative **integer** |
| 4 | **Citation Usage**              | **yes / no** |
| 5 | **Sentiment Polarity**          | **negative / neutral / positive** |
| 6 | **Politeness**                  | **polite / neutral / impolite** |
| 7 | **Hedging / Uncertainty**       | **No Hedging / Minimal / Moderate / Heavy / Extreme** |
| 8 | **Specificity**                 | **very specific / somewhat specific / neutral / broad / very broad** |
| 9 | **Use of Domain-Specific Terms**| integer **0-5** |
|10 | **Relevance Alignment**         | integer **0-5** |
|11 | **Readability**                 | integer **0-5** |
|12 | **Overall Quality**             | integer **0-100** |

## 3 — SCORING GUIDELINES  
For 0-5 scales: 5 = Outstanding, 4 = Strong, 3 = Adequate, 2 = Weak, 1 = Very weak, 0 = Absent/irrelevant.

## 4 — ANALYSIS & COMPUTATION (silent)  
1. Read and comprehend the review text.  
2. Compute raw metrics (word count, TTR, sentiment, FK grade, counts, etc.) or otherwise quantify qualitative aspects.  
3. Map raw metrics to the scales above.

## 5 — OUTPUT FORMAT (strict)  
Return **exactly one** JSON block wrapped in the tag below — **no comments or extra text**.

```json
<review_assessment>
{{
  "paper_title": "{title}",
  "criteria": {{
    "length_effort":       ...,
    "lexical_diversity":   ...,
    "questions_raised":    ...,
    "citation_usage":      ...,
    "sentiment_polarity":  ...,
    "politeness":          ...,
    "hedging":             ...,
    "specificity":         ...,
    "domain_terms":        ...,
    "relevance_alignment": ...,
    "readability":         ...,
    "overall_quality":     ...
  }},
  "overall_score_100": ...
}}
</review_assessment>
```
"""

# Process each row
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Scoring with LLM"):
    # Skip if all llm fields are already filled
    if all(pd.notna(row.get(field, pd.NA)) for field in llm_fields):
        continue

    prompt = template.format(
        title=row['title'],
        abstract=row['abstract'],
        review_text=row['review_text']
    )

    for attempt in range(15):
        try:
            response = chat("llama3.2", messages=[{'role': 'user', 'content': prompt}])
            content = response['message']['content']
            match = pattern.search(content)
            if not match:
                raise ValueError("No JSON block found")

            parsed = json.loads(match.group(1))
            for key, val in parsed["criteria"].items():
                df.at[idx, f"llm_{key}"] = val
            df.at[idx, "llm_overall_score_100"] = parsed["overall_score_100"]

            # Save after every successful row
            df.to_csv(input_file, index=False, quoting=csv.QUOTE_ALL)
            break

        except Exception as e:
            print(f"❌ Error at row {idx}, attempt {attempt + 1}: {e}")
            # time.sleep(1)


Scoring with LLM:   0%|          | 0/1805 [00:00<?, ?it/s]

❌ Error at row 35, attempt 1: No JSON block found
❌ Error at row 35, attempt 2: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 35, attempt 3: No JSON block found
❌ Error at row 35, attempt 4: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 35, attempt 5: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 35, attempt 6: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 35, attempt 7: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 35, attempt 8: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 35, attempt 9: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 35, attempt 10: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 35, attempt 11: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 35, attempt 12: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 35, attempt 13: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 35, attempt 14: Invalid \escape: line 2 column 30 (char 31)


Scoring with LLM:   2%|▏         | 36/1805 [00:37<30:18,  1.03s/it]

❌ Error at row 35, attempt 15: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 36, attempt 1: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 36, attempt 2: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 36, attempt 3: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 36, attempt 4: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 36, attempt 5: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 36, attempt 6: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 36, attempt 7: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 36, attempt 8: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 36, attempt 9: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 36, attempt 10: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 36, attempt 11: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 36, attempt 12: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 36, attempt 13: Inv

Scoring with LLM:   2%|▏         | 37/1805 [01:20<1:17:38,  2.63s/it]

❌ Error at row 36, attempt 15: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 37, attempt 1: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 37, attempt 2: No JSON block found
❌ Error at row 37, attempt 3: No JSON block found
❌ Error at row 37, attempt 4: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 37, attempt 5: No JSON block found
❌ Error at row 37, attempt 6: No JSON block found
❌ Error at row 37, attempt 7: No JSON block found
❌ Error at row 37, attempt 8: No JSON block found
❌ Error at row 37, attempt 9: No JSON block found
❌ Error at row 37, attempt 10: No JSON block found
❌ Error at row 37, attempt 11: No JSON block found
❌ Error at row 37, attempt 12: No JSON block found
❌ Error at row 37, attempt 13: No JSON block found
❌ Error at row 37, attempt 14: No JSON block found


Scoring with LLM:   2%|▏         | 38/1805 [01:37<1:40:21,  3.41s/it]

❌ Error at row 37, attempt 15: No JSON block found
❌ Error at row 38, attempt 1: No JSON block found
❌ Error at row 38, attempt 2: No JSON block found
❌ Error at row 38, attempt 3: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 38, attempt 4: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 38, attempt 5: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 38, attempt 6: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 38, attempt 7: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 38, attempt 8: No JSON block found
❌ Error at row 38, attempt 9: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 38, attempt 10: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 38, attempt 11: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 38, attempt 12: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 38, attempt 13: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 38, attempt 14: Invalid \escape: line 2 

Scoring with LLM:   2%|▏         | 39/1805 [02:00<2:20:10,  4.76s/it]

❌ Error at row 38, attempt 15: Invalid \escape: line 2 column 30 (char 31)
❌ Error at row 595, attempt 1: No JSON block found
❌ Error at row 595, attempt 2: No JSON block found
❌ Error at row 595, attempt 3: No JSON block found
❌ Error at row 595, attempt 4: No JSON block found
❌ Error at row 595, attempt 5: No JSON block found


Scoring with LLM:  33%|███▎      | 596/1805 [02:08<01:58, 10.18it/s] 

❌ Error at row 678, attempt 1: No JSON block found
❌ Error at row 678, attempt 2: No JSON block found
❌ Error at row 678, attempt 3: No JSON block found
❌ Error at row 678, attempt 4: No JSON block found
❌ Error at row 678, attempt 5: No JSON block found
❌ Error at row 678, attempt 6: No JSON block found
❌ Error at row 678, attempt 7: No JSON block found
❌ Error at row 678, attempt 8: No JSON block found
❌ Error at row 678, attempt 9: No JSON block found
❌ Error at row 678, attempt 10: No JSON block found
❌ Error at row 678, attempt 11: No JSON block found
❌ Error at row 678, attempt 12: No JSON block found
❌ Error at row 678, attempt 13: No JSON block found
❌ Error at row 678, attempt 14: No JSON block found


Scoring with LLM:  38%|███▊      | 679/1805 [02:25<02:11,  8.53it/s]

❌ Error at row 678, attempt 15: No JSON block found
❌ Error at row 855, attempt 1: No JSON block found
❌ Error at row 855, attempt 2: No JSON block found
❌ Error at row 855, attempt 3: No JSON block found
❌ Error at row 855, attempt 4: No JSON block found
❌ Error at row 855, attempt 5: No JSON block found
❌ Error at row 855, attempt 6: No JSON block found
❌ Error at row 855, attempt 7: No JSON block found
❌ Error at row 855, attempt 8: No JSON block found
❌ Error at row 855, attempt 9: No JSON block found
❌ Error at row 855, attempt 10: No JSON block found
❌ Error at row 855, attempt 11: No JSON block found
❌ Error at row 855, attempt 12: No JSON block found
❌ Error at row 855, attempt 13: No JSON block found
❌ Error at row 855, attempt 14: No JSON block found


Scoring with LLM:  47%|████▋     | 856/1805 [02:39<01:38,  9.63it/s]

❌ Error at row 855, attempt 15: No JSON block found
❌ Error at row 1146, attempt 1: No JSON block found
❌ Error at row 1146, attempt 2: No JSON block found
❌ Error at row 1146, attempt 3: No JSON block found
❌ Error at row 1146, attempt 4: No JSON block found
❌ Error at row 1146, attempt 5: No JSON block found
❌ Error at row 1146, attempt 6: No JSON block found
❌ Error at row 1146, attempt 7: No JSON block found
❌ Error at row 1146, attempt 8: No JSON block found
❌ Error at row 1146, attempt 9: No JSON block found
❌ Error at row 1146, attempt 10: No JSON block found
❌ Error at row 1146, attempt 11: No JSON block found
❌ Error at row 1146, attempt 12: No JSON block found
❌ Error at row 1146, attempt 13: No JSON block found
❌ Error at row 1146, attempt 14: No JSON block found


Scoring with LLM:  64%|██████▎   | 1147/1805 [02:56<00:55, 11.90it/s]

❌ Error at row 1146, attempt 15: No JSON block found
❌ Error at row 1234, attempt 1: No JSON block found
❌ Error at row 1234, attempt 2: No JSON block found
❌ Error at row 1234, attempt 3: No JSON block found
❌ Error at row 1234, attempt 4: No JSON block found
❌ Error at row 1234, attempt 5: No JSON block found
❌ Error at row 1234, attempt 6: No JSON block found
❌ Error at row 1234, attempt 7: No JSON block found
❌ Error at row 1234, attempt 8: No JSON block found
❌ Error at row 1234, attempt 9: No JSON block found
❌ Error at row 1234, attempt 10: No JSON block found
❌ Error at row 1234, attempt 11: No JSON block found
❌ Error at row 1234, attempt 12: No JSON block found
❌ Error at row 1234, attempt 13: No JSON block found
❌ Error at row 1234, attempt 14: No JSON block found


Scoring with LLM:  68%|██████▊   | 1235/1805 [03:08<00:52, 10.85it/s]

❌ Error at row 1234, attempt 15: No JSON block found
❌ Error at row 1300, attempt 1: No JSON block found
❌ Error at row 1300, attempt 2: No JSON block found
❌ Error at row 1300, attempt 3: No JSON block found
❌ Error at row 1300, attempt 4: No JSON block found
❌ Error at row 1300, attempt 5: No JSON block found
❌ Error at row 1300, attempt 6: No JSON block found
❌ Error at row 1300, attempt 7: No JSON block found
❌ Error at row 1300, attempt 8: No JSON block found
❌ Error at row 1300, attempt 9: No JSON block found
❌ Error at row 1300, attempt 10: No JSON block found
❌ Error at row 1300, attempt 11: No JSON block found
❌ Error at row 1300, attempt 12: No JSON block found
❌ Error at row 1300, attempt 13: No JSON block found
❌ Error at row 1300, attempt 14: No JSON block found


Scoring with LLM:  72%|███████▏  | 1301/1805 [03:25<00:59,  8.53it/s]

❌ Error at row 1300, attempt 15: No JSON block found
❌ Error at row 1552, attempt 1: No JSON block found
❌ Error at row 1552, attempt 2: No JSON block found
❌ Error at row 1552, attempt 3: No JSON block found
❌ Error at row 1552, attempt 4: No JSON block found
❌ Error at row 1552, attempt 5: No JSON block found
❌ Error at row 1552, attempt 6: No JSON block found
❌ Error at row 1552, attempt 7: No JSON block found
❌ Error at row 1552, attempt 8: No JSON block found
❌ Error at row 1552, attempt 9: No JSON block found
❌ Error at row 1552, attempt 10: No JSON block found
❌ Error at row 1552, attempt 11: No JSON block found
❌ Error at row 1552, attempt 12: No JSON block found
❌ Error at row 1552, attempt 13: No JSON block found
❌ Error at row 1552, attempt 14: No JSON block found


Scoring with LLM:  86%|████████▌ | 1553/1805 [03:42<00:23, 10.63it/s]

❌ Error at row 1552, attempt 15: No JSON block found
❌ Error at row 1626, attempt 1: Expecting value: line 4 column 28 (char 155)


Scoring with LLM:  86%|████████▌ | 1553/1805 [03:56<00:23, 10.63it/s]

❌ Error at row 1626, attempt 2: Expecting value: line 4 column 28 (char 155)
❌ Error at row 1626, attempt 3: No JSON block found
❌ Error at row 1626, attempt 4: No JSON block found
❌ Error at row 1626, attempt 5: Expecting value: line 4 column 28 (char 155)
❌ Error at row 1626, attempt 6: No JSON block found
❌ Error at row 1626, attempt 7: Expecting value: line 4 column 28 (char 155)
❌ Error at row 1626, attempt 8: Expecting value: line 4 column 28 (char 155)
❌ Error at row 1626, attempt 9: Expecting value: line 3 column 30 (char 141)
❌ Error at row 1626, attempt 10: No JSON block found
❌ Error at row 1626, attempt 11: No JSON block found
❌ Error at row 1626, attempt 12: Expecting value: line 4 column 32 (char 163)
❌ Error at row 1626, attempt 13: Expecting value: line 4 column 28 (char 155)
❌ Error at row 1626, attempt 14: No JSON block found


Scoring with LLM: 100%|██████████| 1805/1805 [05:07<00:00,  5.87it/s]

❌ Error at row 1626, attempt 15: Expecting value: line 4 column 28 (char 155)





In [3]:
import pandas as pd

# Load the CSV file
df = pd.read_csv(input_file)

# Define the expected LLM columns (excluding 'paper_title')
llm_columns = [
    "llm_length_effort",
    "llm_lexical_diversity",
    "llm_questions_raised",
    "llm_citation_usage",
    "llm_sentiment_polarity",
    "llm_politeness",
    "llm_hedging",
    "llm_specificity",
    "llm_domain_terms",
    "llm_relevance_alignment",
    "llm_readability",
    "llm_overall_quality",
    "llm_overall_score_100"
]

# Find rows with any missing LLM fields
incomplete_rows = df[df[llm_columns].isnull().any(axis=1)]

# Print count
print(f"❗ Incomplete rows: {len(incomplete_rows)} out of {len(df)}")


❗ Incomplete rows: 11 out of 1805
