In [1]:
# !pandoc "data/方程式手册.epub" -o data/output.tex
# 1208 ~ 10818 lines are valid

In [2]:
start, end = 1208, 10818
model = "qwen/qwen-2.5-7b-instruct"
source_file = "data/output.tex"
output_file = "data/equations.txt"
prompt = """You are an automated text-processing bot. Your only function is to extract all chemical equations from the user's text and return them inside a single JSON object. Follow all rules exactly.

### JSON Output Structure
- Your entire response MUST be a single, valid JSON object. Do not include any other text or markdown.
- The JSON object must have one key: `"equations"`.
- The value of `"equations"` must be a JSON array of strings.
- Each string in the array must be a correctly formatted chemical equation.
- If no chemical equations are found in the input text, the value for `"equations"` MUST be an empty array `[]`.

### Equation Formatting Rules
- Reaction arrow MUST BE ` == `.
- Separate molecules with ` + `.
- Convert subscripts like `H{2}O` to `H2O`.
- Format charges compactly like `Fe3+` and `SO4 2-`.
- Charges MUST be apart from the molecule by a space. `SO4 2-` is correct, `SO4^2-` or `SO4(2-)` are incorrect.
- Remove all `{}` brackets and `↑` symbols from the final string.

### Examples

**Example 1: Multiple Equations Found**
- **Input:** `{Si+2Cl{2} SiCl{4} } ... {SiO{2} +2CSi+2CO↑}`
- **Correct Output:**
  ```json
  {"equations": ["Si + 2Cl2 == SiCl4", "SiO2 + 2C == Si + 2CO"]}
"""

In [3]:
import os
import re
from openai import OpenAI

client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY"),
)

with open(source_file, "r", encoding="utf-8") as f:
    lines = f.readlines()

selected_lines = lines[start:end]
text = "".join(selected_lines)

chunks = [chunk.strip() for chunk in re.split(r"\n\s*\n", text) if chunk.strip()]

print(f"Total chunks: {len(chunks)}")
print(f"Chunk preview: {chunks[0]}")


Total chunks: 2841
Chunk preview: \section{A部}\label{text00006.html_chapter}


In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm
from threading import Lock
import json

error_chunks = []

def process_chunk(i_chunk: tuple[int, str]) -> list[str]:
    i, chunk = i_chunk

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": chunk},
            ],
            temperature=0,
            response_format={"type": "json_object"},
        )

        json_string: str = response.choices[0].message.content  # type: ignore
        data = json.loads(json_string)
        equations = data.get("equations", [])
        if not equations:
            return []

        return equations

    except Exception as e:
        print(f"An error occurred processing chunk {i}: {e}")
        error_chunks.append((i, chunk))
        return []


max_workers = 30
# Use Consumer-Producer pattern might be more performant
# Current approach is for better visual effect of an increasing file in editor
file_lock = Lock()

# Clear/create the output file
with open(output_file, "w", encoding="utf-8") as f:
    pass

equation_count = 0

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [
        executor.submit(process_chunk, (i, chunk)) for i, chunk in enumerate(chunks)
    ]
    with tqdm(total=len(futures), desc="Processing chunks") as pbar:
        for future in as_completed(futures):
            result = future.result()
            if not result:
                pbar.update(1)
                continue
            with file_lock:
                with open("data/equations.txt", "a", encoding="utf-8") as f:
                    f.write("\n".join(result) + "\n")
                equation_count += 1
            pbar.update(1)

print(f"\nCompleted! Saved {equation_count} equations to data/equations.txt")

Processing chunks:   0%|          | 0/2841 [00:00<?, ?it/s]

An error occurred processing chunk 28: Request timed out.
An error occurred processing chunk 27: Request timed out.
An error occurred processing chunk 3: Request timed out.
An error occurred processing chunk 6: Request timed out.
An error occurred processing chunk 47: Request timed out.
An error occurred processing chunk 45: Request timed out.
An error occurred processing chunk 46: Request timed out.
An error occurred processing chunk 120: 'list' object has no attribute 'get'
An error occurred processing chunk 145: 'list' object has no attribute 'get'
An error occurred processing chunk 138: Request timed out.
An error occurred processing chunk 247: 'list' object has no attribute 'get'
An error occurred processing chunk 232: Request timed out.
An error occurred processing chunk 585: 'list' object has no attribute 'get'
An error occurred processing chunk 698: 'list' object has no attribute 'get'
An error occurred processing chunk 750: 'list' object has no attribute 'get'
An error occurre

In [None]:
# Post-process error chunks
for i, chunk in error_chunks:
    print(f"Reprocessing chunk {i} due to previous error.")
    result = process_chunk((i, chunk))
    if not result:
        continue
    with file_lock:
        with open("data/equations.txt", "a", encoding="utf-8") as f:
            f.write("\n".join(result) + "\n")
        equation_count += 1
print(f"\nFinal equation count after reprocessing errors: {equation_count}")

In [3]:
# Remove duplicates while preserving order
from chem_parser import Equation, get_equation
import logging

# silent the logging from chem_parser
logging.getLogger("chem_parser").setLevel(logging.ERROR)

with open("data/equations.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()
unique_eq: set[Equation] = set()

for i, line in enumerate(lines):
    line = line.strip()
    try:
        eq = get_equation(line)
        unique_eq.add(eq)
    except Exception as e:
        print(f"Could not parse line: '{line}' at {i}. Error: {e}")

with open("data/equations_unique.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(sorted([str(eq) for eq in unique_eq])) + "\n")

print(f"Final unique equation count: {len(unique_eq)}")

Could not parse line: '3Ag + NO3- + 4H+ ==' at 6. Error: Empty formula in equation
Could not parse line: 'Ag + 2HNO3 ==' at 7. Error: Empty formula in equation
Could not parse line: '4[Ag(NH3)2]+ + HCHO + 4OH- ==' at 36. Error: Empty formula in equation
Could not parse line: '2[Ag(NH3)2]+ + CH3 CHO + 2OH- ==' at 50. Error: Empty formula in equation
Could not parse line: 'Ag+ + 2NH3 · H2 O' at 51. Error: Expected EQUALS but found None
Could not parse line: '2Ag(NH3)2 OH + HCOOC2H5 C2H5 OCOONH4 + 2Ag↓ + 3NH3 + H2 O' at 53. Error: Expected EQUALS but found None
Could not parse line: '2Ag+ + 2OH- ==' at 92. Error: Empty formula in equation
Could not parse line: '2Ag(NH3)2OH + CH2OH(CHOH)4CHO ==' at 107. Error: Empty formula in equation
Could not parse line: 'Ag+ + 2NH3·H2O ==' at 108. Error: Empty formula in equation
Could not parse line: 'Ag2CO3 + 2HNO3 ==' at 136. Error: Empty formula in equation
Could not parse line: 'Al2(SO4)3 + 6NaHCO3 == ?' at 147. Error: Empty formula in equation
Co

In [4]:
with open("data/equations_unique.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()
    balanced_count = 0
    total_count = 0
    unbalanced_equations = []

    for line in lines:
        line = line.strip()
        total_count += 1
        try:
            eq = get_equation(line)
            if eq.is_balanced():
                balanced_count += 1
            else:
                unbalanced_equations.append(line)
        except Exception:
            # Assuming lines that can't be parsed are not counted as balanced
            unbalanced_equations.append(line)

    print(f"Total unique equations: {total_count}")
    print(f"Balanced equations: {balanced_count}")
    print(f"Unbalanced equations: {total_count - balanced_count}")

    # Optional: print unbalanced equations
    print("\nSample of unbalanced equations:")
    for eq_str in unbalanced_equations:
        print(eq_str)

Total unique equations: 1687
Balanced equations: 1204
Unbalanced equations: 483

Sample of unbalanced equations:
(C6H10O5)n + n H2O == C12H22O11
(C6H10O5)n + n H2O == n C6H12O6
(NH4)2SO3 + SO2 + H2O2 == NH4HSO3
-1Br + CH3CH2 + H - 1OH == C2H5 + HBr - 1OH
-1Br + CH3CH2 + NaOH == CH2CH2 + H2O + NaBr
2Ag(NH3)2 OH + 3NH3 + H2O == 2Ag
2Ag+ + Cu == 2Ag + Cu
2Al + 2H2O + 2OH- == 2AlO2- + 3H2
2Al + 2NaOH + 2H2O == 2Al(OH)3 + H2
2Al + 3Cl2 == AlCl3
2Al + 3Hg(NO3)2 == 2Al(NO3)2 + 3Hg
2Al + 6H+ == 2Al3+
2Al(OH)3 == Al2 O3
2Al3+ + 3CO3 2- + 3H2O == Al(OH)3 + 3HCO3 -
2Al3+ + 3SO3 2- + 3H2O == 2Al(OH)3
2AlO2- + CO2 () + 3H2O == 2Al(OH)3 + CO3 2-
2AlO2- + CO2 + 3H2O == 2Al(OH)3 + CO3 2-
2AlO2- + CO2 + 3H2O == Al(OH)3 + HCO3-
2Ba2+ + 4OH- + Al3+ + 2SO4 2- == 2BaSO4 + AlO2- + 2H2O
2Ba2+ + 4OH- + Al3+ + 2SO42- == 2BaSO4 + AlO2- + 2H2O
2Br2 + CH2CH - 1CHCH2 == CH2Br - 1CHBr - 1CHBr - 1CH2Br
2Br2 + CH2CH - 1CHCH2 == CH2BrCHBr - 1CHCH2
2Br2 + CHCH == Br2CH2CH2Br2
2Br2 + CHCH == CBr2CHBr2
2C17H35COO- + Ca2+