Paper PDF -> JSON Pipeline & CoT Generation

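This notebook contains four main parts:
1. Extract text from PDF files and split it into JSON paragraphs;
2. Call a large model (OpenAI-like interface) to generate a chain-of-thought (CoT) for each paragraph and save the results;
3. Train the model with LLaMA-Factory;
4. Load the Perovskite-R1 model and chat with it through a Gradio interface.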

Instructions and Precautions:
- Please install dependencies according to your actual environment (pdfminer.six, natsort, openai, or the SDK for the large model you are using).
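- To avoid rate limits or unexpected charges, it is recommended to test on a small number of samples first (for example, run the pipeline on a folder containing only one or two PDFs). Note that START_INDEX only skips the paragraphs before that index; every paragraph from START_INDEX onward is still sent to the model.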
- If PDF extraction quality is not high, it is recommended to use OCR tools (such as Tesseract) or manually proofread key sections.

Install dependencies

In [None]:
!pip install pdfminer.six natsort openai

!pip install gradio transformers accelerate

!git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
%cd LLaMA-Factory
!pip install -e "LLaMA-Factory.[torch,metrics]"

Configuration

Enter your API Key, base_url, model name, and data path in this section.

In [None]:
API_KEY = "" # API key for the OpenAI-compatible service; the client is not created while this is empty
BASE_URL = "" # Base URL of the OpenAI-compatible endpoint that serves the model
MODEL_NAME = "" # Name of the model to call for CoT generation

# File paths (adjust according to your directory layout)
# Folder that is scanned for *.pdf files
INPUT_PDF_FOLDER = "user/paper_dataset_perovskite/pdf"
# Folder that receives one JSON file per text chunk
OUTPUT_JSON_FOLDER = "user/paper_dataset_perovskite/json_split"
# Single JSON file holding all chunks after merging
MERGED_JSON_FILE = "user/paper_dataset_perovskite/paper_split.json"
# Failure report; process_pdfs uses only the file name of this path and
# writes the report inside the output JSON folder
FAIL_PATH = "user/paper_dataset_perovskite/fail.txt"

# CoT output path
# NOTE(review): this lives under "user/dataset_perovskite", not
# "user/paper_dataset_perovskite" like the paths above -- confirm this is intended.
COT_OUTPUT_FILE = "user/dataset_perovskite/new/paper_split_cot_new.json"

# Other settings
# Maximum number of whitespace-separated words per chunk
MAX_LENGTH = 2500
# Seconds to sleep after each model call
SLEEP_TIME = 1
# Index of the first merged paragraph to process; earlier ones are skipped
START_INDEX = 0

Part One: Extracting Text from PDF and Splitting into JSON Paragraphs

In [None]:
import os
import json
import shutil
from glob import glob
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
from natsort import natsorted

def extract_pdf_content(pdf_path):
    """Extract the text of a PDF file with pdfminer.

    Pages that fail to process are reported and skipped, so the result may
    contain only part of the document.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text, or an empty string when the file cannot be
        opened or page iteration fails.
    """
    resource_manager = PDFResourceManager()
    text_buffer = StringIO()
    layout_params = LAParams()
    converter = TextConverter(rsrcmgr=resource_manager, outfp=text_buffer, laparams=layout_params)
    try:
        with open(pdf_path, 'rb') as pdf_file:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            # An empty page set with maxpages=0 asks pdfminer for every page.
            pages = PDFPage.get_pages(
                pdf_file,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                try:
                    page_interpreter.process_page(page)
                except Exception as e:
                    # One bad page must not discard the rest of the document.
                    print(f"Error processing page in {pdf_path}: {e}", flush=True)
    except Exception as e:
        print(f"Error opening or processing {pdf_path}: {e}", flush=True)
        return ""
    finally:
        # Read the buffer before closing it, then release both resources
        # on a best-effort basis.
        try:
            extracted = text_buffer.getvalue()
        except Exception:
            extracted = ""
        for resource in (converter, text_buffer):
            try:
                resource.close()
            except Exception:
                pass
    return extracted

def split_content_by_words(content, max_length=MAX_LENGTH):
    """Split text into overlapping chunks of at most ``max_length`` words.

    The text is split on whitespace. Consecutive chunks start
    ``int(max_length * 0.8)`` words apart, so each chunk repeats roughly the
    last 20% of the previous one.

    Args:
        content: Text to split.
        max_length: Maximum number of words per chunk; must be positive.

    Returns:
        A list of chunks, each a single-space-joined string of words. The
        list is empty when ``content`` contains no words.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    words = content.split()
    # int(max_length * 0.8) is 0 for max_length == 1, which would never
    # advance the window; always move forward by at least one word.
    step = max(1, int(max_length * 0.8))

    content_parts = []
    for start in range(0, len(words), step):
        content_parts.append(" ".join(words[start:start + max_length]))
        # Once a chunk reaches the end of the text, any later window would
        # only repeat words already contained in this chunk.
        if start + max_length >= len(words):
            break
    return content_parts

def process_pdfs(input_folder, output_folder, fail_path):
    """Convert every PDF in a folder into per-chunk JSON files.

    Each PDF is extracted with ``extract_pdf_content``, split with
    ``split_content_by_words`` and written as
    ``<pdf name>_part<N>.json`` files of the form ``{"content": ...}``.
    A PDF counts as processed only when text was extracted and every chunk
    was saved; otherwise it is counted as skipped and listed in the failure
    report.

    Args:
        input_folder: Folder scanned (non-recursively) for ``.pdf`` files.
        output_folder: Folder that receives the JSON files; created if
            missing.
        fail_path: Path whose file name is used for the failure report. The
            report itself is written inside ``output_folder``; the
            directory part of ``fail_path`` is not used.
    """
    os.makedirs(output_folder, exist_ok=True)
    s = 0  # PDFs fully processed
    w = 0  # PDFs skipped or only partially saved
    failed_files = []

    for filename in natsorted(os.listdir(input_folder)):
        if not filename.lower().endswith('.pdf'):
            continue

        pdf_path = os.path.join(input_folder, filename)
        print(f"Processing {pdf_path}...")

        content = extract_pdf_content(pdf_path)
        # Extraction can return only whitespace (for example form feeds);
        # treat that the same as no content.
        if not content or not content.strip():
            print(f"No content extracted from {pdf_path}, skipping.")
            w += 1
            failed_files.append(pdf_path)
            continue

        content_parts = split_content_by_words(content)
        base_filename = os.path.splitext(filename)[0]

        all_parts_saved = True
        for part_number, part in enumerate(content_parts, start=1):
            json_filename = f"{base_filename}_part{part_number}.json"
            json_path = os.path.join(output_folder, json_filename)

            try:
                with open(json_path, 'w', encoding='utf-8') as json_file:
                    json.dump({"content": part}, json_file, ensure_ascii=False, indent=4)
                print(f"Saved JSON part {part_number} to {json_path}", flush=True)
            except Exception as e:
                print(f"Error saving JSON file {json_path}: {e}", flush=True)
                all_parts_saved = False

        # Count each PDF exactly once: a failed chunk makes the whole file a
        # failure instead of inflating both counters.
        if all_parts_saved:
            s += 1
        else:
            w += 1
            failed_files.append(pdf_path)

    try:
        with open(os.path.join(output_folder, os.path.basename(fail_path)), "w", encoding='utf-8') as f:
            if failed_files:
                f.write("Failed files:\n")
                for file in failed_files:
                    f.write(f"- {file}\n")
                f.write(f"\nTotal failed files: {len(failed_files)}")
            else:
                f.write("No files failed.\n")
    except Exception as e:
        print(f"Error writing fail file: {e}", flush=True)

    print(f"{s} files successfully processed.")
    print(f"{w} files skipped.")

def merge_json_files(json_folder, output_file):
    """Merge every ``*.json`` file of a folder into one JSON list.

    Files are read in natural sort order. A file holding a list contributes
    its items; any other value is added as a single item. Files that are not
    valid JSON are reported and left out.

    Args:
        json_folder: Folder containing the JSON files to merge.
        output_file: Path of the merged JSON file to write.
    """
    pattern = os.path.join(json_folder, "*.json")
    combined = []
    for path in natsorted(glob(pattern)):
        with open(path, "r", encoding='utf-8') as handle:
            try:
                payload = json.load(handle)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON file {path}: {e}", flush=True)
                continue
        if isinstance(payload, list):
            combined.extend(payload)
        else:
            combined.append(payload)

    try:
        with open(output_file, "w", encoding='utf-8') as sink:
            json.dump(combined, sink, ensure_ascii=False, indent=4)
        print(f"All JSON files merged into {output_file}", flush=True)
    except Exception as e:
        print(f"Error saving merged JSON file {output_file}: {e}", flush=True)

def delete_folder(folder_path):
    """Remove a folder and everything inside it, reporting the outcome.

    Nothing happens when the path does not exist. Errors raised while
    deleting are printed rather than propagated.

    Args:
        folder_path: Path of the folder to delete.
    """
    if not os.path.exists(folder_path):
        return
    try:
        shutil.rmtree(folder_path)
    except Exception as e:
        print(f"Error deleting folder {folder_path}: {e}", flush=True)
    else:
        print(f"Folder {folder_path} deleted successfully.", flush=True)

Run Example: Process PDF and Merge into a Single JSON File

In [None]:
# Step 1: write one JSON file per text chunk into OUTPUT_JSON_FOLDER.
process_pdfs(INPUT_PDF_FOLDER, OUTPUT_JSON_FOLDER, FAIL_PATH)
# Step 2: collect all chunk files into the single MERGED_JSON_FILE.
merge_json_files(OUTPUT_JSON_FOLDER, MERGED_JSON_FILE)

Part Two: Using Large Language Models to Generate Chain-of-Thought (CoT) for Each Paragraph

In [None]:
import time
# The SDK is imported defensively so the rest of the notebook still runs
# when it is missing or fails to import; `client` then stays None.
try:
    from openai import OpenAI
except Exception:
    OpenAI = None

client = None
if OpenAI is not None and API_KEY:
    # Pass None instead of an empty BASE_URL so the SDK falls back to its
    # default endpoint rather than trying to use "" as the base URL.
    client = OpenAI(api_key=API_KEY, base_url=BASE_URL or None)
else:
    print("Warning: OpenAI client not initialized. Please check the API_KEY / environment and dependencies.")

def generate_cot_from_merged(merged_json_path, output_path, start_index=START_INDEX, model_name=MODEL_NAME, client_obj=None, sleep_time=SLEEP_TIME, temperature=0.8):
    """Generate a chain-of-thought for each merged paragraph and save them.

    Every item of the merged JSON list from ``start_index`` onward is sent
    to the chat model. A failed request does not stop the run: its error
    text is stored as that item's ``cot`` value. All results are written to
    ``output_path`` once the loop has finished.

    Args:
        merged_json_path: Path of the merged JSON list; each item is read
            through its ``content`` key.
        output_path: Path of the JSON file that receives the list of
            ``{"id": index, "cot": text}`` records.
        start_index: Items with a smaller index are skipped.
        model_name: Model name passed to the chat completion call.
        client_obj: OpenAI-compatible client; required.
        sleep_time: Seconds to sleep after each item.
        temperature: Sampling temperature passed to the model. The previous
            hard-coded value of 8.0 is outside the 0-2 range accepted by the
            OpenAI chat API, so requests were rejected.

    Raises:
        RuntimeError: If ``client_obj`` is None.
    """
    if client_obj is None:
        raise RuntimeError("Model client is not initialized. Please set API_KEY and ensure SDK is installed.")

    with open(merged_json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # NOTE(review): the expert-persona text is sent in the user message and
    # the short instruction in the system message; confirm this is intended.
    prompt_user = (
        "You are an expert in the field of perovskite. Your task is to generate chain of thought for the construction of dataset.\n"
        "Provide step-by-step reasoning, key considerations, typical experimental parameters (where relevant), and suggestions for dataset construction."
    )
    prompt_system = "Read and think carefully about the fragment. Generate the suitable output."

    cot_set = []
    for idx, item in enumerate(data):
        if idx < start_index:
            continue

        excerpt = item.get('content', '')
        print("The Number:", idx, flush=True)

        try:
            completion = client_obj.chat.completions.create(
                model=model_name,
                messages=[
                    {"role": "system", "content": prompt_system},
                    {"role": "user", "content": prompt_user + "\n\nFragment:\n" + excerpt},
                ],
                temperature=temperature,
            )
            output = completion.choices[0].message.content
            print(output, "\n", flush=True)
        except Exception as e:
            # Keep going: record the failure in place of the CoT text.
            output = f"Error: {str(e)}"
            print(f"Unexpected error at index {idx}: {str(e)}", flush=True)

        cot_set.append({"id": idx, "cot": output})
        time.sleep(sleep_time)

    # os.makedirs("") raises, so only create a directory when the output
    # path actually has a directory component.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(cot_set, file, ensure_ascii=False, indent=2)

    print("\ndone")

Run example: Generate CoT for the merged JSON

In [None]:
# Raises RuntimeError when `client` is None, i.e. when the client cell above did not create one.
generate_cot_from_merged(MERGED_JSON_FILE, COT_OUTPUT_FILE, start_index=START_INDEX, model_name=MODEL_NAME, client_obj=client, sleep_time=SLEEP_TIME)

Part Three: Training model

Set reasonable parameters in the YAML file and start training the model. You can refer to this website to set up the appropriate YAML file: https://llamafactory.readthedocs.io/en/latest/getting_started/sft.html

In [None]:
# The YAML path is relative to the LLaMA-Factory repository root, which the
# install cell made the working directory with `%cd LLaMA-Factory`.
!llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml

Part Four: Interact with the Perovskite-R1 model. Load the model in the Notebook, and launch the Gradio chat interface

In [None]:
import os
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import re

# Identifier passed to from_pretrained for both the tokenizer and the model.
MODEL_PATH = "JH976/Perovskite-R1" # or a locally trained model

print("Loading model...")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    # Load the weights as bfloat16 and let the library decide device placement.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        dtype=torch.bfloat16,
        device_map="auto"
    )
    # Switch to evaluation mode; the model is only used for generation below.
    model.eval()
    print("Model loaded successfully.")
except Exception as e:
    # Report the failure, then re-raise so the cell stops instead of
    # continuing without a model.
    print(f"Model loading failed: {e}")
    raise

# %% 
def _content_to_text(content):
    """Return the plain text of one history entry's content."""
    if isinstance(content, str):
        return content
    # NOTE(review): some Gradio versions appear to deliver content as a list
    # of parts (strings or {"type": "text", "text": ...} dicts) -- confirm
    # against the installed version. Non-text parts are ignored.
    if isinstance(content, (list, tuple)):
        texts = []
        for part in content:
            if isinstance(part, str):
                texts.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                texts.append(part["text"])
        return "\n".join(texts)
    return ""


def _history_to_messages(history):
    """Convert Gradio chat history into role/content message dicts."""
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            # "messages" format: one dict per message.
            role = turn.get("role")
            text = _content_to_text(turn.get("content"))
            if role in ("user", "assistant") and text:
                messages.append({"role": role, "content": text})
            continue

        # "tuples" format: one (user, assistant) pair per exchange; either
        # side may be None and must not reach the chat template.
        user_turn, bot_turn = turn
        for role, content in (("user", user_turn), ("assistant", bot_turn)):
            text = _content_to_text(content)
            if text:
                messages.append({"role": role, "content": text})
    return messages


def predict(message, history):
    """Generate the model's reply to ``message`` given the chat history.

    Args:
        message: The new user message.
        history: Previous conversation as supplied by Gradio, either as
            ``(user, assistant)`` pairs or as role/content dicts.

    Returns:
        The decoded reply with any ``<think>...</think>`` section removed
        and surrounding whitespace stripped.
    """
    messages = _history_to_messages(history)
    messages.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    outputs = model.generate(
        input_ids,
        max_new_tokens=8192,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.7,
        top_p=0.8
    )

    # The output repeats the prompt tokens first; keep only the new ones.
    response_ids = outputs[0][input_ids.shape[-1]:]
    response_text = tokenizer.decode(response_ids, skip_special_tokens=True)

    # Hide the reasoning section from the chat window.
    clean_response = re.sub(r'<think>.*?</think>\s*\n*', '', response_text, flags=re.DOTALL)
    return clean_response.strip()

# %%
# Build the chat UI; predict() is called with each new message and the history.
print("Starting Gradio...")
demo = gr.ChatInterface(
    fn=predict,
    title="Perovskite-R1",
    description="Enter your question to have a conversation with the LLM.",
    theme="soft",
    examples=[["Hello"], ["Do you know about perovskite?"]],
)

# %%
# Launch Gradio; share=True can generate a public access link
demo.launch(share=True)