In [1]:
!pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m63.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.3


In [None]:
import os
import re
from pathlib import Path
from typing import List, Dict
import fitz  # PyMuPDF
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.model_selection import train_test_split, KFold
from lxml import etree
import warnings
warnings.filterwarnings("ignore")

# ==============================================================================
# 1. KONFIGURASI DAN KONSTANTA
# ==============================================================================
INPUT_DIR = Path("/kaggle/input/make-data-count-finding-data-references")
PDF_TRAIN_DIR = INPUT_DIR / "train/PDF"
XML_TRAIN_DIR = INPUT_DIR / "train/XML"
TRAIN_LABELS_PATH = INPUT_DIR / "train_labels.csv"
PDF_TEST_DIR = INPUT_DIR / "test/PDF"
XML_TEST_DIR = INPUT_DIR / "test/XML"

MODEL_NAME = "Qwen/Qwen2-7B-Instruct"  # Ganti ke Qwen2.5-32B-Instruct-AWQ jika tersedia
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 4
VALIDATION_SET_SIZE = 0.2
RANDOM_STATE = 42
N_FOLDS = 5

# Valid dataset DOI prefixes
VALID_DATASET_PREFIXES = ["10.5061/dryad", "10.5281/zenodo", "10.25386/genetics", "10.7937"]

# ==============================================================================
# 2. REGULAR EXPRESSIONS (REGEX)
# ==============================================================================
RE_DOI = re.compile(r"\b(10\.\d{4,9}/[-._;()/:A-Z0-9]+)", re.IGNORECASE)
ACCESSION_PATTERNS = [
    "GSE\d+", "SR[APRX]\d+", "PRJ[NAED][A-Z]?\d+", "EPI(?:_ISL_)?\d+",
    "PXD\d{6}", "SAM[ND]\d+", "ERR\d+", "PDB\s+[A-Z0-9]+", "E-MTAB-\d+",
    "IPR\d{6}", "PF\d{5}", "EMPIAR-\d{5}", "CHEMBL\d+", "CVCL_[A-Z0-9]{4}",
    "ENS[A-Z]{0,6}[GT]\d{11}", "N[MC]_\d+(?:\.\d+)?", "rs\d+",
    "uniprot:(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9][A-Z][A-Z0-9]{2}[0-9])",
]
RE_ACCESSION = re.compile(r"\b(" + "|".join(ACCESSION_PATTERNS) + r")\b", re.IGNORECASE)
RE_REFERENCES_SECTION = re.compile(
    r"^(REFERENCES?|BIBLIOGRAPHY|Literature\s+Cited|Works\s+Cited|\d+\.?\s+(REFERENCES?|Bibliography))(:)?$",
    re.IGNORECASE | re.MULTILINE
)
RE_CITATION_PATTERNS = [
    r'\(\d{4}\)',  # (2020)
    r'\d{4}\.',     # 2020.
    r'doi:',        # doi:
    r'\bet al\b',   # et al
]
RE_CLEAN_LLM_OUTPUT = re.compile(r"^\s*([ABC])\b", re.MULTILINE)

# ==============================================================================
# 3. FUNGSI EKSTRAKSI DATA
# ==============================================================================
def normalize_doi(doi: str) -> str:
    doi = doi.strip().lower()
    if not doi.startswith("https://doi.org/"):
        doi = doi.lstrip("doi:").strip()
        if doi.startswith("10."):
            return f"https://doi.org/{doi}"
    return doi

def is_valid_dataset_doi(doi: str) -> bool:
    return any(doi.lower().startswith(prefix) for prefix in VALID_DATASET_PREFIXES)

def remove_references_section(text: str) -> str:
    lines = text.split('\n')
    cut_index = -1
    for i in range(len(lines) - 1, max(0, int(len(lines) * 0.3)), -1):
        line = lines[i].strip()
        if RE_REFERENCES_SECTION.match(line):
            following_lines = lines[i+1:i+4]
            has_citations = False
            for follow_line in following_lines:
                if follow_line.strip() and any(re.search(pat, follow_line, re.IGNORECASE) for pat in RE_CITATION_PATTERNS):
                    has_citations = True
                    break
            if has_citations or i >= len(lines) - 3:
                cut_index = i
                break
    if cut_index != -1:
        ref_section = '\n'.join(lines[cut_index:])
        if RE_DOI.search(ref_section) or RE_ACCESSION.search(ref_section):
            return text
        return '\n'.join(lines[:cut_index]).strip()
    return text.strip()

def extract_text_from_xml(xml_path: Path) -> str:
    try:
        tree = etree.parse(str(xml_path))
        sections = tree.xpath("//sec[@sec-type='materials|methods' or @sec-type='data' or @sec-type='supplementary-material']//p/text()")
        return "\n".join(sections) if sections else ""
    except Exception:
        return ""

def find_potential_citations(text: str, article_id: str) -> List[Dict[str, str]]:
    citations = []
    paragraphs = text.split("\n\n")
    for para in paragraphs:
        para = para.strip()
        if not para:
            continue
        patterns = {"doi": RE_DOI, "accession": RE_ACCESSION}
        for source, pattern in patterns.items():
            for match in pattern.finditer(para):
                dataset_id = match.group(0)
                if source == "doi":
                    dataset_id = normalize_doi(dataset_id)
                    if not is_valid_dataset_doi(dataset_id) or dataset_id == f"https://doi.org/{article_id}":
                        continue
                citations.append({
                    "article_id": article_id,
                    "text": para,
                    "dataset_id": dataset_id,
                    "source": source
                })
    return citations

def extract_chunks_from_paths(pdf_paths: List[Path], xml_dir: Path) -> pd.DataFrame:
    all_chunks = []
    print(f"Memulai ekstraksi dari {len(pdf_paths)} file...")
    for pdf_path in tqdm(pdf_paths, desc="📄 Mengekstrak File"):
        article_id = pdf_path.stem
        try:
            xml_path = xml_dir / f"{article_id}.xml"
            if xml_path.exists():
                full_text = extract_text_from_xml(xml_path)
            else:
                with fitz.open(pdf_path) as doc:
                    full_text = "\n".join([page.get_text("text") for page in doc])
            cleaned_text = remove_references_section(full_text)
            chunks = find_potential_citations(cleaned_text, article_id)
            all_chunks.extend(chunks)
        except Exception as e:
            print(f"⚠️ Gagal memproses file {article_id}: {e}")
    return pd.DataFrame(all_chunks)

# ==============================================================================
# 4. FUNGSI KLASIFIKASI LLM
# ==============================================================================
def build_prompt_messages(batch_df: pd.DataFrame) -> List[Dict[str, str]]:
    system_message = """You are a meticulous research analyst specializing in data citation. Your task is to analyze text snippets from scientific papers and classify the relationship between the paper and a mentioned dataset ID.

Follow these rules for classification:
- **Primary (A)**: The data was **generated by the authors for this specific study**. Look for phrases like "data are available", "we generated", "our data have been deposited in", "supplemental material", "data for this study", "deposited at", "available at".
- **Secondary (B)**: The data was **reused from an external source** or a previous study. Look for phrases like "data were obtained from", "retrieved from", "we used the dataset from", "downloaded from", "sourced from", "accessed from".
- **None (C)**: The ID refers to something else (e.g., another publication), is mentioned in passing, or there is insufficient context.

Examples:
1. ID: https://doi.org/10.5061/dryad.6m3n9
   Context: "The data we used in this publication can be accessed from Dryad at doi:10.5061/dryad.6m3n9."
   Classification: A
2. ID: pdb 5yfp
   Context: "Structure of the Saccharomyces cerevisiae exocyst holomeric octameric complex... (pdb 5yfp)."
   Classification: B
3. ID: https://doi.org/10.1098/rspb.2016.1151
   Context: "As described in a previous study (doi:10.1098/rspb.2016.1151)."
   Classification: C
4. ID: GSE37569
   Context: "Primary data for Agilent and Affymetrix microarray experiments are available at the NCBI Gene Expression Omnibus (GEO, http://www.ncbi.nlm.nih.gov/geo/) under the accession numbers GSE37569."
   Classification: A
5. ID: E-MTAB-10217
   Context: "The datasets presented in this study can be found in online repositories... E-MTAB-10217."
   Classification: A

For each item below:
1. Read the Context carefully.
2. Analyze the language used (generated or reused).
3. Decide the final classification: A, B, or C.

Output ONLY a list of single letters (A, B, or C), each on a new line.
"""
    user_prompts = []
    for i, row in enumerate(batch_df.itertuples(), 1):
        snippet = ' '.join(str(row.text).split())
        user_prompts.append(f"{i}. ID: {row.dataset_id}\n   Context: \"...{snippet}...\"")
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": "\n\n".join(user_prompts)}
    ]
    return messages

def classify_batch_with_llm(batch_df: pd.DataFrame, model, tokenizer) -> List[str]:
    messages = build_prompt_messages(batch_df)
    prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt_text, return_tensors="pt", truncation=True, max_length=4096).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.4,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )
    decoded_output = tokenizer.decode(outputs[0, inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    labels = RE_CLEAN_LLM_OUTPUT.findall(decoded_output.strip())
    return labels + ["C"] * (len(batch_df) - len(labels))

def run_llm_classification(df_chunks: pd.DataFrame, model, tokenizer):
    results = []
    print(f"\nMemulai klasifikasi dengan LLM untuk {len(df_chunks)} chunk...")
    for i in tqdm(range(0, len(df_chunks), BATCH_SIZE), desc="🤖 Mengklasifikasi"):
        batch_df = df_chunks.iloc[i:i+BATCH_SIZE].reset_index(drop=True)
        try:
            labels = classify_batch_with_llm(batch_df, model, tokenizer)
            for j, label_code in enumerate(labels):
                if label_code in ["A", "B"]:
                    row = batch_df.iloc[j]
                    results.append({
                        "article_id": row.article_id,
                        "dataset_id": row.dataset_id,
                        "type": "Primary" if label_code == "A" else "Secondary"
                    })
        except Exception as e:
            print(f"❌ Error pada batch {i//BATCH_SIZE}: {e}")
            continue
    df_results = pd.DataFrame(results)
    return df_results.drop_duplicates(subset=['article_id', 'dataset_id']).reset_index(drop=True)

# ==============================================================================
# 5. FUNGSI EVALUASI
# ==============================================================================
def calculate_f1_score(true_labels: pd.DataFrame, pred_labels: pd.DataFrame):
    print("\n--- HASIL EVALUASI ---")
    if pred_labels.empty:
        print("Tidak ada prediksi valid (A/B) yang dihasilkan. F1-Score adalah 0.")
        return 0
    true_set = set(map(tuple, true_labels[['article_id', 'dataset_id', 'type']].astype(str).apply(lambda x: x.str.lower()).values))
    pred_set = set(map(tuple, pred_labels[['article_id', 'dataset_id', 'type']].astype(str).apply(lambda x: x.str.lower()).values))
    tp = len(true_set.intersection(pred_set))
    fp = len(pred_set - true_set)
    fn = len(true_set - pred_set)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    print(f"📊 True Positives (TP) : {tp}")
    print(f"📊 False Positives (FP): {fp}")
    print(f"📊 False Negatives (FN): {fn}")
    print("-" * 25)
    print(f"🎯 Precision: {precision:.4f}")
    print(f"🔍 Recall   : {recall:.4f}")
    print(f"⭐ F1-Score : {f1:.4f}")
    print("-" * 25)
    return f1

# ==============================================================================
# 6. BLOK EKSEKUSI UTAMA (MAIN)
# ==============================================================================
if __name__ == "__main__":
    # --- Langkah 1: Muat data pelatihan ---
    print("Membaca data pelatihan...")
    df_labels_all = pd.read_csv(TRAIN_LABELS_PATH)
    all_article_ids = df_labels_all['article_id'].unique()
    
    # --- Langkah 2: Cross-validation ---
    print(f"\nMelakukan {N_FOLDS}-fold cross-validation...")
    kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    f1_scores = []
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(all_article_ids)):
        print(f"\nFold {fold + 1}/{N_FOLDS}")
        train_ids = all_article_ids[train_idx]
        val_ids = all_article_ids[val_idx]
        true_val_labels = df_labels_all[df_labels_all['article_id'].isin(val_ids)].copy()
        val_pdf_paths = [p for p in PDF_TRAIN_DIR.glob("*.pdf") if p.stem in val_ids]
        print(f"✅ Data dibagi: {len(train_ids)} untuk latihan, {len(val_ids)} untuk validasi.")
        
        # --- Langkah 3: Ekstrak chunks untuk validasi ---
        df_val_chunks = extract_chunks_from_paths(val_pdf_paths, XML_TRAIN_DIR)
        if df_val_chunks.empty:
            print("⚠️ Tidak ada chunk yang diekstrak dari set validasi. Lewati fold.")
            continue
        
        # --- Langkah 4: Muat model LLM ---
        print(f"\nMemuat model {MODEL_NAME}...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            device_map="auto",
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16,
        )
        model.eval()
        print("✅ Model berhasil dimuat.")
        
        # --- Langkah 5: Klasifikasi dan evaluasi ---
        pred_val_labels = run_llm_classification(df_val_chunks, model, tokenizer)
        f1 = calculate_f1_score(true_val_labels, pred_val_labels)
        f1_scores.append(f1)
    
    print(f"\nRata-rata F1-Score dari {N_FOLDS} folds: {sum(f1_scores) / len(f1_scores):.4f}")
    
    # --- Langkah 6: Proses test set untuk submission ---
    print("\nMemproses test set untuk submission...")
    test_pdf_paths = list(PDF_TEST_DIR.glob("*.pdf"))
    df_test_chunks = extract_chunks_from_paths(test_pdf_paths, XML_TEST_DIR)
    if not df_test_chunks.empty:
        pred_test_labels = run_llm_classification(df_test_chunks, model, tokenizer)
        if not pred_test_labels.empty:
            submission = pred_test_labels[['article_id', 'dataset_id', 'type']].copy()
            submission = submission.sort_values(by=["article_id", "dataset_id", "type"], ascending=False).drop_duplicates(subset=['article_id', 'dataset_id'], keep="first")
            submission.insert(0, 'row_id', range(len(submission)))
            submission.to_csv('submission.csv', index=False)
            print("✅ File submission.csv telah dibuat.")
            print("Distribusi tipe:", submission['type'].value_counts())
        else:
            print("⚠️ Tidak ada prediksi valid untuk test set.")
    else:
        print("⚠️ Tidak ada chunk yang diekstrak dari test set.")

Membaca data pelatihan...

Melakukan 5-fold cross-validation...

Fold 1/5
✅ Data dibagi: 418 untuk latihan, 105 untuk validasi.
Memulai ekstraksi dari 105 file...


📄 Mengekstrak File: 100%|██████████| 105/105 [00:06<00:00, 15.26it/s]



Memuat model Qwen/Qwen2-7B-Instruct...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

2025-07-07 02:29:40.108824: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751855380.321205      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751855380.383868      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

✅ Model berhasil dimuat.

Memulai klasifikasi dengan LLM untuk 26 chunk...


🤖 Mengklasifikasi:  43%|████▎     | 3/7 [01:58<02:39, 39.91s/it]