# Data Cleaning & Data Labelling

In [None]:
from pathlib import Path
import json
import re
import unicodedata
from typing import Any, Dict, List, Optional, Tuple

from rapidfuzz import fuzz
import bibtexparser

# --- Run configuration ------------------------------------------------------
# Directory whose immediate sub-folders are scanned by the labelling loop.
base_dir = Path("23127238_output")
# The scan begins at the first folder whose name sorts at or after this value.
start_folder = "2304-14607"
# The scan stops once this many folders have produced at least one label.
papers_to_check = 1500
# Destination of the JSON file holding the collected labels.
output_file = Path("labels", "auto_label.json")

# Tokens dropped by normalize_for_fuzzy() before titles are compared.
stop_words = set(
    "a an the and or of for in on with "
    "to from by at into over under after before".split()
)


In [None]:
# Patterns used to mine identifiers out of free-text BibTeX fields.

# arXiv URL: group 1 is the identifier following /abs/ or /pdf/.  The optional
# "archive/" prefix keeps old-style identifiers such as hep-th/9901001 (or
# math.GT/0309136) in one piece; without it the capture would stop at the
# first slash and yield only the archive name.
ARXIV_URL_RE = re.compile(
    r"arxiv\.org/(?:abs|pdf)/((?:[a-z\-]+(?:\.[a-z]{2})?/)?[^\s#/}]+)",
    re.I,
)
# Bare old-style arXiv identifier: archive name, optional two-letter subject
# class, a slash and seven digits.
ARXIV_OLD_RE = re.compile(r"[a-z\-]+(?:\.[a-z]{2})?/\d{7}", re.I)
# DOI given as a doi.org link: group 1 is everything up to whitespace or "}".
DOI_URL_RE = re.compile(r"doi\.org/([^\s}]+)", re.I)
# Bare DOI ("10.<registrant>/<suffix>") appearing anywhere in the text.
DOI_RE = re.compile(r"\b10\.\d{4,9}/\S+\b", re.I)
# First argument (the target) of a LaTeX \href{target}{label} command.
HREF_RE = re.compile(r"\\href\{([^}]+)\}")

def normalize_arxiv_id(value: Any) -> Optional[str]:
    """Return a canonical, lower-case form of an arXiv identifier.

    New-style identifiers (``2304.14607``, ``2304-14607``, ``230414607``) are
    returned as ``"2304-14607"``.  Old-style identifiers
    (``hep-th/9901001``) are returned lower-cased.  For both shapes an
    ``arXiv:`` prefix (any case), a trailing version suffix (``v2``) and a
    trailing ``.pdf`` are removed, because identifiers lifted from URLs
    commonly carry them and would otherwise never compare equal to the
    canonical form.

    Anything that does not look like an arXiv identifier is returned
    stripped and lower-cased but otherwise untouched, so arbitrary record
    keys still round-trip through this function.

    Args:
        value: Raw identifier; converted with ``str()``.

    Returns:
        The normalised identifier, or ``None`` when ``value`` is falsy or
        nothing is left after removing the prefix.
    """
    if not value:
        return None

    raw = str(value).strip().lower()
    raw = raw.replace("arxiv:", "").strip()

    # New-style id, optionally followed by a version and/or ".pdf".
    new_style = re.fullmatch(r"(\d{4})[.\-]?(\d{4,5})(?:v\d+)?(?:\.pdf)?", raw)
    if new_style:
        return f"{new_style.group(1)}-{new_style.group(2)}"

    # Old-style id (archive[.SC]/NNNNNNN); only the decoration is dropped.
    old_style = re.fullmatch(
        r"([a-z\-]+(?:\.[a-z]{2})?/\d{7})(?:v\d+)?(?:\.pdf)?", raw
    )
    if old_style:
        return old_style.group(1)

    return raw or None

def normalize_doi(value: Any) -> Optional[str]:
    """Return a bare, lower-case DOI suitable for use as a lookup key.

    Leading resolver prefixes (``https://doi.org/``, ``http://dx.doi.org/``,
    scheme-less ``doi.org/``) and a leading ``doi:`` label are removed.
    Trailing ``.``, ``,`` and ``;`` are dropped as well, since DOIs pulled
    out of running text often end with sentence punctuation.

    Args:
        value: Raw DOI or DOI URL; converted with ``str()``.

    Returns:
        The normalised DOI, or ``None`` when ``value`` is falsy or nothing
        remains after cleaning.
    """
    if not value:
        return None

    doi = str(value).strip().lower()
    doi = re.sub(r"^(?:https?://)?(?:dx\.)?doi\.org/", "", doi)
    doi = re.sub(r"^doi:\s*", "", doi)
    doi = doi.rstrip(".,;").strip()

    return doi or None

def normalize_plain_text(text: str) -> str:
    """Collapse ``text`` to a string of ASCII lower-case letters and digits.

    Accents are removed with ``strip_accents``, the result is case-folded,
    and every character outside ``a-z``/``0-9`` (spaces included) is deleted.
    """
    folded = strip_accents(text).casefold()
    return re.sub(r"[^a-z0-9]", "", folded)

def strip_accents(text: str) -> str:
    """Return ``text`` reduced to its ASCII characters.

    The text is NFKD-decomposed first, so an accented letter contributes its
    base letter; any code point that is still non-ASCII is then discarded.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    ascii_bytes = decomposed.encode("ascii", "ignore")
    return ascii_bytes.decode("ascii")

def normalize_for_fuzzy(text: str) -> str:
    """Reduce ``text`` to space-separated content tokens for fuzzy matching.

    The text is case-folded, stripped of accents, and every character other
    than ``a-z``, ``0-9`` and whitespace is replaced by a space.  Tokens that
    are stop words (``stop_words``) or consist only of digits are dropped.

    The surviving tokens are joined with a single space.  Joining them
    without a separator would hand the token-based scorers used downstream
    (``token_sort_ratio`` / ``token_set_ratio``) one long pseudo-word, so
    they could not tell where one word ends and the next begins.

    Args:
        text: Raw title or note text.

    Returns:
        The normalised token string; empty when no token survives.
    """
    text = strip_accents(text.casefold())
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    tokens = [
        token
        for token in text.split()
        if token not in stop_words and not token.isdigit()
    ]

    return " ".join(tokens)

def extract_year(value: Any) -> Optional[str]:
    """Return the first ``19xx``/``20xx`` digit run found in ``value``.

    ``value`` is converted with ``str()`` before searching.  ``None`` is
    returned for falsy input or when no such four-digit run exists.
    """
    if not value:
        return None

    found = re.search(r"(19|20)\d{2}", str(value))
    if found is None:
        return None
    return found.group(0)

def extract_ids_from_text(text: str) -> Tuple[Optional[str], Optional[str]]:
    """Pull the first arXiv identifier and the first DOI out of ``text``.

    The whole text is searched first, then each ``\\href{...}`` target found
    in it.  For the arXiv id an arxiv.org URL is preferred over a bare
    old-style identifier; for the DOI a doi.org URL is preferred over a bare
    DOI.  The values are returned exactly as matched (not normalised).

    Returns:
        ``(arxiv_id, doi)``; either element is ``None`` when nothing matched.
    """
    if not text:
        return None, None

    found_arxiv: Optional[str] = None
    found_doi: Optional[str] = None

    for chunk in [text, *HREF_RE.findall(text)]:
        if not found_arxiv:
            url_hit = ARXIV_URL_RE.search(chunk)
            if url_hit:
                found_arxiv = url_hit.group(1)
            else:
                bare_hit = ARXIV_OLD_RE.search(chunk)
                if bare_hit:
                    found_arxiv = bare_hit.group(0)

        if not found_doi:
            doi_url_hit = DOI_URL_RE.search(chunk)
            if doi_url_hit:
                found_doi = doi_url_hit.group(1)
            else:
                doi_hit = DOI_RE.search(chunk)
                if doi_hit:
                    found_doi = doi_hit.group(0)

        # Both identifiers located: no need to look at further chunks.
        if found_arxiv and found_doi:
            break

    return found_arxiv, found_doi


def load_json_data(path: Path) -> Tuple[Dict[str, Any], Optional[str]]:
    """Read a JSON object from ``path`` without raising on bad input.

    The file is decoded as UTF-8 (a leading BOM is tolerated).

    Returns:
        ``(data, None)`` when the file holds a JSON object, otherwise
        ``({}, reason)`` where ``reason`` is one of ``"json_missing"``,
        ``"json_empty_file"``, ``"json_not_dict"`` or
        ``"json_decode_error:<detail>"``.
    """
    if not path.exists():
        return {}, "json_missing"
    if path.stat().st_size == 0:
        return {}, "json_empty_file"

    try:
        payload = json.loads(path.read_text(encoding="utf-8-sig"))
    except UnicodeDecodeError as exc:
        return {}, f"json_decode_error:{exc.reason}"
    except json.JSONDecodeError as exc:
        return {}, f"json_decode_error:{exc.msg}"

    if isinstance(payload, dict):
        return payload, None
    return {}, "json_not_dict"


def parse_bib_entries(bib_files: List[Path]) -> List[Dict[str, Any]]:
    """Parse every file in ``bib_files`` and return all entries in one list.

    Each file is read as UTF-8 with undecodable bytes ignored and parsed with
    a fresh ``BibTexParser`` (common strings enabled, non-standard entry
    types kept).  A file whose parsing raises is reported on stdout and
    skipped.  Every returned entry gains a ``"__bib_file"`` key holding the
    name of the file it came from.
    """
    collected: List[Dict[str, Any]] = []

    for bib_path in bib_files:
        raw_text = bib_path.read_text(encoding="utf-8", errors="ignore")

        bib_parser = bibtexparser.bparser.BibTexParser(common_strings=True)
        bib_parser.ignore_nonstandard_types = False

        # Best effort: any parser failure skips this file only.
        # KeyboardInterrupt is not an Exception subclass, so it still
        # propagates to the caller.
        try:
            database = bibtexparser.loads(raw_text, parser=bib_parser)
        except Exception:
            print(f"Parse error in {bib_file_name(bib_path)}, skipping file") if False else print(f"Parse error in {bib_path.name}, skipping file")
            continue

        for item in database.entries:
            item["__bib_file"] = bib_path.name
            collected.append(item)

    return collected


def build_json_records(data: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
    """Index the entries of ``data`` by normalised id and by DOI.

    Values of ``data`` that are not dicts are ignored.  For every remaining
    item a record ``{"arxiv_id", "raw_id", "info"}`` is created and stored
    under its normalised id (the first record wins on duplicates).  When the
    item carries a ``"doi"``/``"DOI"`` value, the same record is also stored
    under the normalised DOI unless that key is already taken.
    """
    lookup: Dict[str, Dict[str, Any]] = {}

    for source_key, payload in data.items():
        if not isinstance(payload, dict):
            continue

        canonical = normalize_arxiv_id(source_key) or str(source_key)
        entry = {"arxiv_id": canonical, "raw_id": source_key, "info": payload}
        if canonical not in lookup:
            lookup[canonical] = entry

        doi_key = normalize_doi(payload.get("doi") or payload.get("DOI"))
        if doi_key and doi_key not in lookup:
            lookup[doi_key] = entry

    return lookup


def resolve_canonical_id(candidate: Optional[str], records: Dict[str, Dict[str, Any]]) -> Optional[str]:
    """Map a lookup key (normalised id or DOI alias) to its record's id.

    Returns the ``"arxiv_id"`` of the record stored under ``candidate``, or
    ``None`` when ``candidate`` is falsy or no record is stored under it.
    """
    if not candidate:
        return None

    hit = records.get(candidate)
    if not hit:
        return None
    return hit["arxiv_id"]


def match_entry(entry: Dict[str, Any], records: Dict[str, Dict[str, Any]], available: set) -> Tuple[Optional[str], Optional[str]]:
    """Find the record that a BibTeX entry refers to.

    Strategies are tried in order and the first hit wins:

    1. the entry's ``eprint`` field;
    2. an arXiv id or DOI embedded in ``note``, ``url``, ``howpublished``,
       ``repository`` or ``doi``;
    3. an exact title match after ``normalize_plain_text``;
    4. the best fuzzy title/note match with a score of at least 75, rejected
       when both sides carry a year and the years differ by more than one.

    Only ids contained in ``available`` are ever returned; ``available`` is
    not modified here.

    Args:
        entry: Parsed BibTeX entry (field name -> value).
        records: Lookup table as produced by ``build_json_records``.
        available: Canonical ids that are still unassigned.

    Returns:
        ``(arxiv_id, method)`` where ``method`` names the strategy that
        matched, or ``(None, None)`` when nothing matched.
    """

    def get_info_title(info_dict):
        return info_dict.get("paper_title") or info_dict.get("title") or ""

    bib_title = entry.get("title")
    bib_note = entry.get("note")
    bib_year = extract_year(entry.get("year"))

    # 1) Explicit eprint field.
    eprint_norm = normalize_arxiv_id(entry.get("eprint"))
    if eprint_norm:
        canon = resolve_canonical_id(eprint_norm, records)
        if canon and canon in available:
            return canon, "eprint"

    # 2) Identifiers embedded in free-text fields.
    for tag in ("note", "url", "howpublished", "repository", "doi"):
        text_field = entry.get(tag)
        if not text_field:
            continue
        arxiv_found, doi_found = extract_ids_from_text(text_field)
        lookups = (
            (normalize_arxiv_id(arxiv_found), "arxiv"),
            (normalize_doi(doi_found), "doi"),
        )
        for candidate, label in lookups:
            canon = resolve_canonical_id(candidate, records)
            if canon and canon in available:
                return canon, f"{label}_from_{tag}"

    # 3) Exact title match on the fully collapsed title.
    if bib_title:
        bib_title_norm = normalize_plain_text(bib_title)
        # An empty key (e.g. a title with no ASCII letters or digits) would
        # "equal" every other title that also collapses to nothing.
        if bib_title_norm:
            for arxiv_id in available:
                json_title = get_info_title(records[arxiv_id]["info"])
                if json_title and bib_title_norm == normalize_plain_text(json_title):
                    return arxiv_id, "title_exact"

    # 4) Fuzzy match of title/note against every available record title.
    # The bib-side strings are normalised once, outside the record loop.
    # Empty normalised strings are skipped on both sides: they carry no
    # evidence, and two empty strings may be scored as identical.
    candidate_norms = [
        norm
        for norm in (normalize_for_fuzzy(text) for text in (bib_title, bib_note) if text)
        if norm
    ]

    best_score = 0
    best_id: Optional[str] = None
    best_json_year: Optional[str] = None

    if candidate_norms:
        for arxiv_id in available:
            info = records[arxiv_id]["info"]
            json_norm = normalize_for_fuzzy(get_info_title(info))
            if not json_norm:
                continue

            for norm in candidate_norms:
                score = max(
                    fuzz.token_sort_ratio(norm, json_norm),
                    fuzz.token_set_ratio(norm, json_norm),
                    fuzz.partial_ratio(norm, json_norm),
                )
                if score > best_score:
                    best_score = score
                    best_id = arxiv_id
                    best_json_year = extract_year(
                        info.get("submission_date") or info.get("year")
                    )

    if best_id and best_score >= 75:
        if bib_year and best_json_year:
            # extract_year() only returns four-digit strings, so int() is safe.
            year_diff = abs(int(bib_year) - int(best_json_year))
            if year_diff > 1:
                return None, None
            return best_id, f"fuzzy_{best_score:.0f}_year_diff_{year_diff}"
        return best_id, f"fuzzy_{best_score:.0f}"

    return None, None


In [None]:
# Driver: walk the paper folders in name order, match each folder's BibTeX
# entries against its references.json, and write the collected labels to
# `output_file` as JSON.
all_folders = sorted((p for p in base_dir.iterdir() if p.is_dir()), key=lambda p: p.name)
# NOTE(review): when no folder name is >= start_folder the default 0 makes the
# scan start from the very first folder — confirm that this is intended.
start_index = next((i for i, p in enumerate(all_folders) if p.name >= start_folder), 0)
candidate_folders = all_folders[start_index:]

results: Dict[str, Dict[str, Any]] = {}   # folder name -> {bib key -> matched id}
skipped_missing = 0                        # folders skipped for unusable input
processed = 0                              # folders with usable input
skip_details: List[Tuple[str, str]] = []   # (folder name, skip reason)
no_label_folders: List[str] = []           # usable folders that yielded no label

papers_found_count = 0
# Name of the last folder actually examined.  Tracked separately because the
# loop variable is undefined when there is nothing to scan, and after an
# early `break` it names a folder that was never looked at.
last_scanned_folder: Optional[str] = None

print(f"Starting scan from {start_folder}. Target: {papers_to_check} labeled papers.")

for folder in candidate_folders:
    if papers_found_count >= papers_to_check:
        break
    last_scanned_folder = folder.name

    json_path = folder / "references.json"
    # Sorted so the order in which entries are matched (and ids consumed from
    # `available`) does not depend on the file system's listing order.
    bib_files = sorted(folder.glob("*.bib"))

    data, json_error = load_json_data(json_path)
    if json_error:
        skipped_missing += 1
        skip_details.append((folder.name, json_error))
        continue

    if not bib_files:
        skipped_missing += 1
        skip_details.append((folder.name, "no_bib_files"))
        continue

    entries = parse_bib_entries(bib_files)
    if not entries:
        skipped_missing += 1
        skip_details.append((folder.name, "bib_entries_empty"))
        continue

    records = build_json_records(data)
    if not records:
        skipped_missing += 1
        skip_details.append((folder.name, "references.json is empty"))
        continue

    # Each record id may be assigned to at most one BibTeX entry per folder.
    available = {rec["arxiv_id"] for rec in records.values()}

    labels = {}

    for entry in entries:
        arxiv_id, reason = match_entry(entry, records, available)
        if arxiv_id:
            bib_key = entry.get("ID") or entry.get("id") or entry.get("key") or "unknown"
            labels[bib_key] = arxiv_id
            available.discard(arxiv_id)

    if labels:
        results[folder.name] = labels
        papers_found_count += 1
    else:
        no_label_folders.append(folder.name)

    processed += 1

    # `labels` guards against repeating the same milestone line while the
    # counter sits on a multiple of 50 across unlabelled folders.
    if papers_found_count % 50 == 0 and labels:
        print(f"Progress: Found {papers_found_count}/{papers_to_check} papers (Scanned {processed} folders so far)")

output_file.parent.mkdir(parents=True, exist_ok=True)
output_file.write_text(json.dumps(results, indent=4), encoding="utf-8")

print("-" * 30)
print(f"Scan finished at folder: {last_scanned_folder or 'n/a'}")
print(f"Total folders scanned: {processed + skipped_missing}")
print(f"Papers processed (valid inputs): {processed}")
print(f"Papers skipped (invalid inputs): {skipped_missing}")
print(f"Papers with labels written (TARGET): {len(results)}")
print(f"Output: {output_file.resolve()}")


Starting scan from 2304-14607. Target: 1500 labeled papers.
Progress: Found 50/1500 papers (Scanned 52 folders so far)
Progress: Found 100/1500 papers (Scanned 103 folders so far)
Progress: Found 150/1500 papers (Scanned 159 folders so far)
Progress: Found 200/1500 papers (Scanned 209 folders so far)
Progress: Found 250/1500 papers (Scanned 259 folders so far)
Progress: Found 300/1500 papers (Scanned 312 folders so far)
Progress: Found 350/1500 papers (Scanned 363 folders so far)
Progress: Found 400/1500 papers (Scanned 413 folders so far)
Progress: Found 450/1500 papers (Scanned 467 folders so far)
Progress: Found 500/1500 papers (Scanned 518 folders so far)
Progress: Found 550/1500 papers (Scanned 570 folders so far)
Progress: Found 600/1500 papers (Scanned 621 folders so far)
Progress: Found 650/1500 papers (Scanned 671 folders so far)
Progress: Found 700/1500 papers (Scanned 722 folders so far)
Progress: Found 750/1500 papers (Scanned 774 folders so far)
Progress: Found 800/1500 pa