In [1]:
# !pandoc "data/方程式手册.epub" -o data/output.tex
# 1208 ~ 10818 lines are valid

In [2]:
start, end = 1208, 10818
model = "qwen/qwen-2.5-7b-instruct"
source_file = "data/output.tex"
output_file = "data/equations.txt"
prompt = """You are an automated text-processing bot. Your only function is to extract all chemical equations from the user's text and return them inside a single JSON object. Follow all rules exactly.

### JSON Output Structure
- Your entire response MUST be a single, valid JSON object. Do not include any other text or markdown.
- The JSON object must have one key: `"equations"`.
- The value of `"equations"` must be a JSON array of strings.
- Each string in the array must be a correctly formatted chemical equation.
- If no chemical equations are found in the input text, the value for `"equations"` MUST be an empty array `[]`.

### Equation Formatting Rules
- Reaction arrow MUST BE ` == `.
- Separate molecules with ` + `.
- Convert subscripts like `H{2}O` to `H2O`.
- Format charges compactly like `Fe3+` and `SO4 2-`.
- Charges MUST be apart from the molecule by a space. `SO4 2-` is correct, `SO4^2-` or `SO4(2-)` are incorrect.
- Remove all `{}` brackets and `↑` symbols from the final string.

### Examples

**Example 1: Multiple Equations Found**
- **Input:** `{Si+2Cl{2} SiCl{4} } ... {SiO{2} +2CSi+2CO↑}`
- **Correct Output:**
  ```json
  {"equations": ["Si + 2Cl2 == SiCl4", "SiO2 + 2C == Si + 2CO"]}
"""

In [3]:
import os
import re
from openai import OpenAI

client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY"),
)

with open(source_file, "r", encoding="utf-8") as f:
    lines = f.readlines()

selected_lines = lines[start:end]
text = "".join(selected_lines)

chunks = [chunk.strip() for chunk in re.split(r"\n\s*\n", text) if chunk.strip()]

print(f"Total chunks: {len(chunks)}")
print(f"Chunk preview: {chunks[0]}")


Total chunks: 2841
Chunk preview: \section{A部}\label{text00006.html_chapter}


In [4]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm
from threading import Lock
import json


def process_chunk(i_chunk: tuple[int, str]) -> list[str]:
    i, chunk = i_chunk

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": chunk},
            ],
            temperature=0,
            response_format={"type": "json_object"},
        )

        json_string: str = response.choices[0].message.content  # type: ignore
        data = json.loads(json_string)
        equations = data.get("equations", [])
        if not equations:
            return []

        return equations

    except Exception as e:
        print(f"An error occurred processing chunk {i}: {e}")
        return []


max_workers = 30
# Use Consumer-Producer pattern might be more performant
# Current approach is for better visual effect of an increasing file in editor
file_lock = Lock()

# Clear/create the output file
with open(output_file, "w", encoding="utf-8") as f:
    pass

equation_count = 0

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [
        executor.submit(process_chunk, (i, chunk)) for i, chunk in enumerate(chunks)
    ]
    with tqdm(total=len(futures), desc="Processing chunks") as pbar:
        for future in as_completed(futures):
            result = future.result()
            if not result:
                pbar.update(1)
                continue
            with file_lock:
                with open("data/equations.txt", "a", encoding="utf-8") as f:
                    f.write("\n".join(result) + "\n")
                equation_count += 1
            pbar.update(1)

print(f"\nCompleted! Saved {equation_count} equations to data/equations.txt")

Processing chunks:   0%|          | 0/2841 [00:00<?, ?it/s]

An error occurred processing chunk 28: Request timed out.
An error occurred processing chunk 27: Request timed out.
An error occurred processing chunk 3: Request timed out.
An error occurred processing chunk 6: Request timed out.
An error occurred processing chunk 47: Request timed out.
An error occurred processing chunk 45: Request timed out.
An error occurred processing chunk 46: Request timed out.
An error occurred processing chunk 120: 'list' object has no attribute 'get'
An error occurred processing chunk 145: 'list' object has no attribute 'get'
An error occurred processing chunk 138: Request timed out.
An error occurred processing chunk 247: 'list' object has no attribute 'get'
An error occurred processing chunk 232: Request timed out.
An error occurred processing chunk 585: 'list' object has no attribute 'get'
An error occurred processing chunk 698: 'list' object has no attribute 'get'
An error occurred processing chunk 750: 'list' object has no attribute 'get'
An error occurre