In [13]:
import os
import re
from pathlib import Path
from tqdm import tqdm
import fitz  # PyMuPDF


pdf_dir = "/kaggle/input/make-data-count-finding-data-references/train/PDF"
pdf_paths = list(Path(pdf_dir).glob("*.pdf"))


RE_DOI = re.compile(r"10\.\d{4,9}/[\w.()/:;-]+", re.I)
RE_ACCESSION = re.compile(r"GSE\d+|SR[APRX]\d+|PRJ[NAED]\d+|EPI(?:_ISL_)?\d+|PXD\d{6}|SAM[ND]\d+|ERR\d+", re.I)

# Strip the bibliography from an article's text.
# A heading line is: optional indentation, optional section number
# ("7. "), one of the known titles, optional ":" or ".", and nothing else.
_REFERENCE_HEADING = re.compile(
    r"^\s*(?:\d+\.?\s+)?(?:REFERENCES|BIBLIOGRAPHY|Literature Cited|Works Cited)\s*[:.]?\s*$",
    re.I,
)


def remove_references(text: str) -> str:
    """Return `text` with everything from the reference-list heading onward removed.

    The text is split on newlines and scanned from the last line backwards,
    so the last heading in the document wins. Lines in the first 30% of the
    document (boundary line included) are never considered, which keeps an
    early mention such as a table of contents from truncating the article.

    A line only counts as a heading when the title stands alone on it. A
    wrapped body line that merely starts with the word ("references to the
    deposited data ...") no longer cuts the text, while indented or numbered
    headings are now recognised.

    Args:
        text: Full article text with newline-separated lines.

    Returns:
        The text before the heading line, or `text` unchanged when no
        heading is found.
    """
    lines = text.split("\n")
    earliest = max(0, int(len(lines) * 0.3))
    for i in range(len(lines) - 1, earliest, -1):
        if _REFERENCE_HEADING.match(lines[i]):
            return "\n".join(lines[:i])
    return text

# Extract each PDF's text and cut a context window around every candidate ID
span = 200  # number of characters kept before/after each matched identifier
chunks = []


def _context_chunks(text, pattern, article_id, source):
    """Yield one record per match of `pattern` in `text`, with up to `span` characters of context on each side."""
    for match in pattern.finditer(text):
        window = text[max(0, match.start() - span): match.end() + span]
        yield {"article_id": article_id, "text": window, "dataset_id": match.group(), "source": source}


for pdf_path in tqdm(pdf_paths):
    article_id = pdf_path.stem
    # The context manager closes the document even when text extraction
    # raises part-way through a file.
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text() for page in doc)

    text = remove_references(text)

    # DOI candidates first, then accession IDs (same order as the output CSV rows)
    chunks.extend(_context_chunks(text, RE_DOI, article_id, "doi"))
    chunks.extend(_context_chunks(text, RE_ACCESSION, article_id, "accession"))

# Save the intermediate result so the LLM step can load it later
import pandas as pd

df_chunks = pd.DataFrame(chunks)
df_chunks.to_csv("chunks_for_llm.csv", index=False)

print(df_chunks.head())


 13%|█▎        | 66/524 [00:05<00:46,  9.84it/s]

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: uns

100%|██████████| 524/524 [00:53<00:00,  9.80it/s]


                     article_id  \
0  10.1371_journal.pone.0191086   
1  10.1371_journal.pone.0191086   
2  10.1371_journal.pone.0191086   
3  10.1371_journal.pone.0191086   
4  10.1371_journal.pone.0191086   

                                                text  \
0  sh and other aquatic organisms, particularly f...   
1  inlola MA, Reygondeau G, Wabnitz\nCCC, Troell ...   
2   Global trends in food fish production from 19...   
3  reshwater\naquaculture and mariculture. Data f...   
4   1 Echinodermata) (S1 Table). Although sea-\nw...   

                          dataset_id source  
0       10.1371/journal.pone.0191086    doi  
1                   10.1371/journal.    doi  
2  10.1371/journal.pone.0191086.g001    doi  
3       10.1371/journal.pone.0191086    doi  
4       10.1371/journal.pone.0191086    doi  


In [None]:
from openai import OpenAI
import pandas as pd
import time
from tqdm import tqdm

# 🔑 The API key is read from the OPENROUTER_API_KEY environment variable so
# that it never lives in the notebook source or its saved outputs.
# SECURITY: a key that was ever pasted into this cell has been exposed to
# anyone with access to the notebook and must be revoked and rotated.
import os

_api_key = os.environ.get("OPENROUTER_API_KEY")
if not _api_key:
    raise RuntimeError(
        "Set the OPENROUTER_API_KEY environment variable before running this cell."
    )

client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=_api_key,
)

# Extra HTTP headers attached to every chat-completion request
# (passed as `extra_headers`); both are currently left blank.
HEADERS = {
    "HTTP-Referer": "",
    "X-Title": "",
}

# Model identifier sent with every request.
MODEL_NAME = "mistralai/mixtral-8x7b-instruct"


# System message sent with every classification request. The model is asked
# to answer with a single letter, which the caller maps to a label
# (A -> Primary, B -> Secondary, anything else is skipped).
# NOTE(review): the wording only talks about DOIs, but rows whose source is
# "accession" in chunks_for_llm.csv are classified with this same prompt —
# confirm that is intended.
SYSTEM_PROMPT = """You are a scientific assistant.
Given a DOI and a relevant academic text snippet, classify the dataset associated with the DOI as:
A) Primary — generated by this study
B) Secondary — reused from other studies
C) None — not dataset-related or just cited

Respond with exactly one letter: A, B, or C.
"""

def create_prompt(text, dataset_id):
    """Build the user message: the identifier on a "DOI:" line, a blank line, then the snippet under "Text:"."""
    template = "DOI: {}\n\nText:\n{}"
    return template.format(dataset_id, text)

def classify_with_llm(text, dataset_id, retries=3):
    """Ask the model to classify one identifier/snippet pair.

    Args:
        text: Context snippet surrounding the identifier.
        dataset_id: The matched identifier string.
        retries: Maximum number of request attempts.

    Returns:
        "A", "B" or "C". The reply is normalised (whitespace stripped,
        upper-cased, first character only), so variants such as "a" or "A)"
        still count. An empty, missing or unrecognised reply is reported as
        "C", and so is the case where every attempt raised.

    Note:
        A request failure is printed and retried; it is never raised. A
        persistent failure (e.g. a bad API key) is therefore indistinguishable
        from a genuine "C" in the return value — watch the printed errors.
    """
    # The request payload does not change between attempts, so build it once.
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": create_prompt(text, dataset_id)}
    ]

    for attempt in range(retries):
        try:
            completion = client.chat.completions.create(
                model=MODEL_NAME,
                messages=messages,
                extra_headers=HEADERS,
                extra_body={},
                temperature=0.1,
                max_tokens=1
            )

            # `content` may be None; treat that like an empty reply instead of
            # letting .strip() raise and burn the remaining attempts.
            content = completion.choices[0].message.content or ""
            answer = content.strip().upper()[:1]
            # Tuple membership on purpose: `"" in "ABC"` would be True.
            return answer if answer in ("A", "B", "C") else "C"
        except Exception as e:
            print("Error:", e)
            # Back off before the next attempt, but do not stall after the last one.
            if attempt < retries - 1:
                time.sleep(5)
    return "C"

# 🔄 Load the candidate chunks written by the extraction step
df_chunks = pd.read_csv("chunks_for_llm.csv")

# 🔁 Process in batches
BATCH_SIZE = 15  # requests per batch; kept below 20 per minute (original author's limit)
BATCH_PAUSE_SECONDS = 60  # pause between consecutive batches
LABELS = {"A": "Primary", "B": "Secondary"}  # any other reply (e.g. "C") is skipped
RESULT_COLUMNS = ["article_id", "dataset_id", "type"]
results = []

for i in tqdm(range(0, len(df_chunks), BATCH_SIZE)):
    batch = df_chunks.iloc[i:i+BATCH_SIZE]

    for _, row in batch.iterrows():
        label = LABELS.get(classify_with_llm(row['text'], row['dataset_id']))
        if label is None:
            continue  # Skip "None"

        results.append({
            "article_id": row['article_id'],
            "dataset_id": row['dataset_id'],
            "type": label
        })

    # Checkpoint after every batch: the full run takes hours, and an
    # interrupted session would otherwise lose every prediction made so far.
    pd.DataFrame(results, columns=RESULT_COLUMNS).to_csv("llm_predictions.csv", index=False)

    # Pause only when another batch follows; sleeping after the final batch
    # just delays the end of the run.
    if i + BATCH_SIZE < len(df_chunks):
        time.sleep(BATCH_PAUSE_SECONDS)

# 💾 Save the results (explicit columns keep the header even when nothing was labelled)
df_results = pd.DataFrame(results, columns=RESULT_COLUMNS)
df_results.to_csv("llm_predictions.csv", index=False)
print(df_results.head())


  1%|▏         | 4/278 [04:39<5:22:14, 70.57s/it]

In [None]:
# Build the submission file from the in-memory `results` list.
# The columns are named explicitly so that an empty `results` still produces
# a frame with the expected header instead of raising KeyError on the column
# selection below.
# NOTE(review): drop_duplicates() only removes fully identical rows. If the
# same (article_id, dataset_id) pair was labelled both Primary and Secondary
# from different snippets, both rows are kept — decide how to resolve that.
submission = (
    pd.DataFrame(results, columns=["article_id", "dataset_id", "type"])
    .drop_duplicates()
    .reset_index(drop=True)
)
submission['row_id'] = range(len(submission))
submission = submission[["row_id", "article_id", "dataset_id", "type"]]
submission.to_csv("submission.csv", index=False)

print("✅ Saved submission.csv")
print(submission.head())
