# Data Processing for chemical equation datasets

## Data Cleaning with LLM

### File Type Conversion

In [1]:
!pandoc "data/方程式手册.epub" -o data/source.tex
# 1208 ~ 10818 lines are valid

### LLM Cleaning Script

In [None]:
start, end = 1208, 10818
model = "qwen/qwen3-235b-a22b-2507"
# I've gotta say, previously I used qwen/qwen-2.5-7b-instruct, which is a pain in the ass to use
# Bigger models are indeed better :)
source_file = "./data/source.tex"
extracted_file = "./data/extracted_equations.txt"
deduped_file = "./data/deduped_equations.txt"
scrape_prompt = """You are an automated text-processing bot. Your only function is to extract all chemical equations from the user's text and return them inside a single JSON object. Follow all rules exactly.
### Extraction Instructions
- Extract ALL chemical equations from the input text with no missing.
- Complete the extraction even if some equations are malformed. Attempt to correct them according to chemistry knowledge and extra information in the text.
- Try to instantiate equations involving variables or placeholders using context from the input text.

### JSON Output Structure
- Your entire response MUST be a single, valid JSON object. Do not include any other text or markdown.
- The JSON object must have one key: `"equations"`.
- The value of `"equations"` must be a JSON array of strings.
- Each string in the array must be a correctly formatted chemical equation.
- If no chemical equations are found in the input text, the value for `"equations"` MUST be an empty array `[]`.

### Equation Formatting Rules
- Reaction arrow MUST BE ` == `.
- Separate molecules with ` + `.
- Convert subscripts like `H{2}O` to `H2O`.
- Format charges compactly like `Fe3+` and `SO4 2-`.
- Charges MUST be apart from the molecule by a space. `NH4 +` and `SO4 2+` are correct, `NH4+`, `SO4^2-` or `SO4(2-)` are incorrect. Electrons should be represented as `e-`.
- Remove all `{}` brackets, `↑` symbols and `(aq)`/`(s)` marks from the final string.

### Examples

**Example 1: Multiple Equations Found**
- **Input:** `{Si+2Cl{2} SiCl{4} } ... {SiO{2} +2CSi+2CO↑}`
- **Correct Output:**
  ```json
  {"equations": ["Si + 2Cl2 == SiCl4", "SiO2 + 2C == Si + 2CO"]}
"""

In [3]:
for path in (extracted_file, deduped_file):
    with open(path, "w", encoding="utf-8") as f:
        pass

print(f"Prepared files:\n - {extracted_file}\n - {deduped_file}")

Prepared files:
 - ./data/extracted_equations.txt
 - ./data/deduped_equations.txt


In [4]:
import os
import re
from openai import OpenAI

client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY"),
)

with open(source_file, "r", encoding="utf-8") as f:
    equation_strs = f.readlines()

selected_lines = equation_strs[start:end]
text = "".join(selected_lines)

chunks: list[str] = [
    chunk.strip() for chunk in re.split(r"\n\s*\n", text) if chunk.strip()
]

print(f"Total chunks: {len(chunks)}")
print(f"Chunk preview: {chunks[30]}")


Total chunks: 2841
Chunk preview: { {【AgBr】} }
溴化银，不溶于水，浅黄色晶体或粉末，在水中生成时为浅黄色沉淀。见光分解。


In [5]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm
from threading import Lock
import json
import logging

logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger(__name__)

error_chunks: list[str] = []

max_workers = 30
# Use Consumer-Producer pattern might be more performant
# Current approach is for better visual effect of an increasing file in editor
file_lock = Lock()


def process_chunk(
    chunk: str, error_list: list[str], prompt: str = scrape_prompt
) -> list[str]:
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": chunk},
            ],
            temperature=0,
            response_format={"type": "json_object"},
        )

        json_string: str = response.choices[0].message.content  # type: ignore
        data = json.loads(json_string)
        equations = data.get("equations", [])
        if not equations:
            return []

        return equations

    except Exception:
        error_list.append(chunk)
        return []

In [None]:
equation_count = 0

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [executor.submit(process_chunk, chunk, error_chunks) for chunk in chunks]
    with tqdm(total=len(futures), desc="Processing chunks") as pbar:
        for future in as_completed(futures):
            result = future.result()
            if not result:
                pbar.update(1)
                continue
            with file_lock:
                with open(extracted_file, "a", encoding="utf-8") as f:
                    f.write("\n".join(result) + "\n")
                equation_count += len(result)
            pbar.update(1)

print(f"\nCompleted! Saved {equation_count} equations to {extracted_file}")
print(f"Total error chunks: {len(error_chunks)}")


Processing chunks:   0%|          | 0/2841 [00:00<?, ?it/s]


Completed! Saved 3610 equations to ./data/extracted_equations.txt
Total error chunks: 0


In [7]:
# With such powerful model, reprocessing errors seems unnecessary since no errors observed...
# Previous logic:

# Post-process error chunks
# error_chunks_last = []
# I don't bother defining a new function
# Afterall, copy & paste is also a way to code reuse, right?
# with ThreadPoolExecutor(max_workers=max_workers) as executor:
#     futures = [
#         executor.submit(process_chunk, chunk, error_chunks_last)
#         for chunk in error_chunks
#     ]
#     with tqdm(total=len(futures), desc="Reprocessing error chunks") as pbar:
#         for future in as_completed(futures):
#             result = future.result()
#             if not result:
#                 pbar.update(1)
#                 continue
#             with file_lock:
#                 with open(extracted_file, "a", encoding="utf-8") as f:
#                     f.write("\n".join(result) + "\n")
#                 equation_count += len(result)
#             pbar.update(1)

# print(f"\nFinal equation count after reprocessing errors: {equation_count}")
# print(f"Total error chunks after reprocessing: {len(error_chunks_last)}")
# # Manual Fix
# print(error_chunks_last)

### Remove Duplicates/Write to File

In [None]:
# LLM are prone to electron charge formatting errors, so we do a pre-processing pass here
def charge_correction(eqs: list[str]) -> list[str]:
    correction_pattern = {
        "SO42-": "SO4 2-",
        "SO32-": "SO3 2-",
        "NH4+": "NH4 +",
        "HCO3-": "HCO3 -",
        "HSO3-": "HSO3 -",
        "Ag(NH3)2+": "Ag(NH3)2 +",
        "AlO2-": "AlO2 -",
        "CO32-": "CO3 2-",
        "ClO4-": "ClO4 -",
        "NO3-": "NO3 -",
        "HF2-": "HF2 -",
        "NO2-": "NO2 -",
        "H2PO4-": "H2PO4 -",
        "HPO42-": "HPO4 2-",
        "PO43-": "PO4 3-",
        "ClO3-": "ClO3 -",
        "MnO4-": "MnO4 -",
    }
    corrected_eqs = []
    for eq in eqs:
        for mistake, correction in correction_pattern.items():
            if mistake in eq:
                eq = eq.replace(mistake, correction)
        corrected_eqs.append(eq)
    return corrected_eqs

In [None]:
# Remove duplicates while preserving order
from chem_solver import Equation


with open(extracted_file, "r", encoding="utf-8") as f:
    equation_strs = f.readlines()
error_eqs: list[tuple[str, str]] = []


def test_and_deduped(
    equation_strs: list[str], error_eqs: list[tuple[str, str]] = error_eqs
) -> list[str]:
    tested_eqs: list[str] = []
    eq_strs = charge_correction(equation_strs)
    for line in eq_strs:
        line = line.strip()
        try:
            eq = Equation(line)
        except Exception as e:
            error_eqs.append((line, str(e)))
            continue

        # Try to balance
        if not (balanced := eq.balance()):
            error_eqs.append((line, "Could not be balanced"))
            continue
        # Make sure multiple solutions aren't due to identical reactants/products
        elif len(balanced) > 1:
            if eq.reactants.keys() == eq.products.keys():
                error_eqs.append((line, "Identical reactants and products"))
                continue
        for b in balanced:
            tested_eqs.append(str(b))

    return list(dict.fromkeys(tested_eqs))


deduped_eqs = test_and_deduped(equation_strs)
with open(deduped_file, "w", encoding="utf-8") as f:
    f.write("\n".join(deduped_eqs) + "\n")

print(f"Processed equation count: {len(deduped_eqs)}")
print(f"Total equations that could not be parsed: {len(error_eqs)}")

Processed equation count: 2544
Total equations that could not be parsed: 87


In [28]:
error_eqs[:5]

[('Ag+ + 2NH3 + H2O == Ag(NH3)2OH', 'Could not be balanced'),
 ('Ag+ + 2NH3 * H2O == Ag(NH3)2OH', 'Could not be balanced'),
 ('Al3+ + CO3 2- == Al(OH)3 + CO2', 'Could not be balanced'),
 ('Al2(CO3)3 == 2Al(OH)3 + 3CO2', 'Could not be balanced'),
 ('Al(OH)3 + OH- == Al(OH)4-', 'Could not be balanced')]

In [None]:
# Fix some common errors in error_eqs with LLM
error_eqs_last = []
fix_prompt = """You are an expert chemist and data correction bot, specializing in high school and introductory college-level chemistry. Your function is to analyze faulty chemical equations, diagnose the problem based on the provided error, and rewrite them into a list of valid, balanced, and parsable equations.

### Your Task
You will be given a `broken_equation` and an error `message`. Your goal is to fix the equation according to the following logic:

1.  **Analyze the Error Type**:
    *   **If the error is `Could not be balanced`**: This means the equation is chemically incorrect. Do NOT just try to balance it. Instead, fix the underlying chemical error (e.g., correct a wrong product, add a missing substance like H2O in a redox reaction) and then provide the correctly balanced equation.
    *   **If the error is about parsing (`Expected closing bracket...`, etc.)**: This usually indicates a general algebraic formula (e.g., involving `n`, `x`, `y`). Your task is to **instantiate** it. Provide 2-3 specific, representative examples using small integer values for the variables (e.g., n=1, n=2, n=3).
    *   **If the error is `Empty formula in equation`**: The reaction is incomplete. Use the provided reactants to deduce and add the most likely products based on common high school chemical reactions (acid-base, redox, precipitation, etc.), then balance it.

2.  **Apply Chemical Knowledge**: Always default to the most common reaction pathways taught in high school chemistry. For example, the oxidation of ethanol produces acetaldehyde and hydrogen, not water. Permanganate in acid goes to Mn2+.

3.  **Generate Output**: Provide a list of corrected equations in the specified JSON format. The list may be empty (if unfixable), have one entry, or multiple entries (especially for instantiated general formulas).

### JSON Output Structure
- Your entire response MUST be a single, valid JSON object. Do not include any other text or markdown.
- The JSON object must have one key: `"equations"`.
- The value of `"equations"` must be a JSON array of strings. Each string must be a correctly formatted and parsable chemical equation.
- If you cannot determine a valid correction, the value for `"equations"` MUST be an empty array `[]`.

### Equation Formatting Rules
- Reaction arrow MUST BE ` == `.
- Separate all molecules and ions with ` + `.
- Format charges compactly like `Fe3+` and `SO4 2-`. Charges MUST be separated from the molecule by a space.
- Electrons MUST be represented as `e-`.
- Remove all `{}` brackets, `↑`, `(s)`, `(aq)` symbols, and algebraic expressions like `(n+1)`.

---
### Correction Examples

**Example 1: Fixing a Chemically Incorrect Equation**
- **Input:**
  ```
  broken_equation: CH3CH2OH == CH3CHO + H2O
  message: Could not be balanced
  ```
- **Correct Output:**
  ```json
  {
    "equations": ["CH3CH2OH == CH3CHO + H2"]
  }
  ```

**Example 2: Instantiating a General Formula**
- **Input:**
  ```
  broken_equation: CnH2n+2 + O2 == (n +1)H2O + nCO
  message: Expected closing bracket TokenType.RPAREN but found Token(...)
  ```
- **Correct Output:**
  ```json
  {
    "equations": [
      "2CH4 + 3O2 == 4H2O + 2CO",
      "2C2H6 + 5O2 == 6H2O + 4CO"
    ]
  }
  ```

**Example 3: Completing an Incomplete Redox Reaction**
- **Input:**
  ```
  broken_equation: 3Fe2+ + 4H+ + NO3 - ==
  message: Empty formula in equation
  ```
- **Correct Output:**
  ```json
  {
    "equations": ["3Fe 2+ + 4H + + NO3 - == 3Fe 3+ + NO + 2H2O"]
  }
  ```

**Example 4: Fixing an Incorrect Ionic Equation**
- **Input:**
  ```
  broken_equation: Ag+ + 2NH3 + H2O == Ag(NH3)2OH
  message: Could not be balanced
  ```
- **Correct Output:**
  ```json
  {
    "equations": ["Ag + + 2NH3·H2O == Ag(NH3)2 + + 2H2O"]
  }
  ```
  ```"""
fixed_eqs: list[str] = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [
        executor.submit(
            process_chunk,
            f"broken_equation: {line}\nmessage: {error_msg}",
            error_eqs_last,
            prompt=fix_prompt,
        )
        for line, error_msg in error_eqs
    ]
    with tqdm(total=len(futures), desc="Fixing error equations") as pbar:
        for future in as_completed(futures):
            result = future.result()
            if not result:
                pbar.update(1)
                continue
            with file_lock:
                fixed_eqs.extend(result)
            pbar.update(1)

print(f"\nFixed equations count: {len(fixed_eqs)}")
with open(deduped_file, "r", encoding="utf-8") as f:
    equation_strs = f.readlines()
deduped_eqs = test_and_deduped(equation_strs + fixed_eqs)
with open(deduped_file, "w", encoding="utf-8") as f:
    f.write("\n".join(deduped_eqs) + "\n")

print(f"Processed equation count: {len(deduped_eqs)}")
print(f"Total equations that could not be parsed: {len(error_eqs_last)}")

Fixing error equations:   0%|          | 0/87 [00:00<?, ?it/s]


Fixed equations count: 114
Processed equation count: 2576
Total equations that could not be parsed: 12


In [31]:
# Manual fix
error_eqs_last[:5]

['broken_equation: ROH + HX == RX + H2O\nmessage: Could not be balanced',
 "broken_equation: CnH2n-2 + (3n-1)/2 O2 == n CO2 + (n-1) H2O\nmessage: Expected closing bracket TokenType.RPAREN but found Token(type=<TokenType.NUMBER: '\\\\d+'>, value='3')",
 'broken_equation: 2Ag(NH3)2OH == 2Ag + 3NH3 + H2O\nmessage: Could not be balanced',
 "broken_equation: CnH2n-6 + (3n-3)O2 == nCO2 + (n-3)H2O\nmessage: Expected closing bracket TokenType.RPAREN but found Token(type=<TokenType.NUMBER: '\\\\d+'>, value='3')",
 'broken_equation: (C6H10O5)n + nH2O == nC6H12O6\nmessage: Could not be balanced']

In [39]:
# Final Test
from pprint import pprint

with open(deduped_file, "r", encoding="utf-8") as f:
    equation_strs = f.readlines()
errors = []
deduped_eqs = test_and_deduped(equation_strs, error_eqs=errors)
with open(deduped_file, "w", encoding="utf-8") as f:
    f.write("\n".join(deduped_eqs) + "\n")
print(f"Total equations that could not be parsed: {len(errors)}")
pprint(errors)

Total equations that could not be parsed: 8
[('H2O ==', 'Empty formula in equation'),
 ('HO(CH2)5COOH ==', 'Empty formula in equation'),
 ('2H2O + H2N(CH2)6NHCO(CH2)4CONH(CH2)6NH2 == 2H2N(CH2)6NH2',
  'Could not be balanced'),
 ('2Ag(NH3)2OH == 2Ag + 3NH3 + H2O', 'Could not be balanced'),
 ('(C6H10O5)n + nH2O == nC6H12O6', 'Could not be balanced'),
 ('nH2O + (C6H10O5)n == C6H12O6', 'Could not be balanced'),
 ('Fe3+ + 10SCN- == [Fe(SCN)10] 10-', 'Could not be balanced'),
 ('10 HOOC(CH2)4COOH + 10 H2N(CH2)6NH2 == 18 H2O', 'Could not be balanced')]


## Data Processing

In [1]:
from chem_solver import Equation  # noqa: F811

extracted_file = "./data/extracted_equations.txt"
deduped_file = "./data/deduped_equations.txt"
with open(deduped_file, "r", encoding="utf-8") as f:
    equation_strs = f.readlines()
equations = list(map(Equation, equation_strs))

In [2]:
from collections import Counter

# This is a big suprise that all eqs in my dataset have a unique solution ...
Counter([len(eq.balance()) for eq in equations])

Counter({1: 2571})

In [None]:
Counter([True if eq.find_observable_solution_order() else False for eq in equations])
# there are 2322/2571 = 90.3% equations solvable by observation.

Counter({True: 2322, False: 249})

In [13]:
observable = [eq for eq in equations if eq.find_observable_solution_order()]
nonobservable = [eq for eq in equations if not eq.find_observable_solution_order()]
print("Observable Equations:")
for eq in observable[:6]:
    print(eq)
print("Non-Observable Equations:")
for eq in nonobservable[:6]:
    print(eq)


Observable Equations:
2Ag+ + Cu == Cu2+ + 2Ag
Ag+ + I- == AgI
Ag+ + Cl- == AgCl
2Ag+ + H2S == Ag2S + 2H+
2Ag+ + SO3 2- == Ag2 SO3
Ag+ + OH- == AgOH
Non-Observable Equations:
2Ag + 2H2SO4 == Ag2SO4 + SO2 + 2H2O
3Ag + 4HNO3 == 3Ag+ + NO + 2H2O + 3NO3 -
16Ag(NH3)2 + + 9OH- + C6H11O7- == C6H12O6 + 16Ag + 32NH3 + 4H2O
2Ag(NH3)2 + + CH2OH(CHOH)4CHO + OH- == 2Ag + 2NH4 + + 2NH3 + CH2OH(CHOH)4COO-
2Ag(NH3)2OH + CH2OH(CHOH)4CHO == 2Ag + H2O + 4NH3 + CH2OH(CHOH)4COOH
2Ag(NH3)2OH + CH3CH2CHO == 2Ag + 3NH3 + H2O + CH3CH2COONH4
