In [None]:
import os
import pandas as pd
import requests
import json
from itertools import combinations, product
from joblib import Parallel, delayed
import time
from typing import List, Dict, Any, Tuple
import random
from tqdm.auto import tqdm

In [None]:
# API credential comes from the environment (None when the variable is unset)
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
# OpenRouter model slug
MODEL = "openai/gpt-4.1"

### Aggregate JSON results into Parquet and CSV

โค้ดด้านล่างจะ:
- ค้นหาโฟลเดอร์ `infomation/testing/<model>/results` อัตโนมัติ
- อ่านไฟล์ `.json` ทั้งหมดรวมเป็น DataFrame ต่อโมเดล
- บันทึก `all_results.csv` (เข้ารหัส UTF-8-SIG รองรับภาษาไทย) และ `all_results.parquet` ในโฟลเดอร์ของแต่ละโมเดล
- แสดงสรุปจำนวนไฟล์/แถว/เส้นทางที่บันทึก

หมายเหตุ:
- สำหรับ Parquet แนะนำให้ติดตั้ง `pyarrow` (หรือ `fastparquet`) หากยังไม่ได้ติดตั้ง

In [11]:
from pathlib import Path
import json
import pandas as pd
from typing import List, Dict, Any

# Root directory that contains model folders (expected under the current working directory)
testing_root = Path(".").resolve() / "infomation" / "testing"

# Not under the working directory: look for 'infomation/testing' in up to five parent directories
if not testing_root.exists():
    for ancestor in list(Path.cwd().parents)[:5]:
        candidate = ancestor / "infomation" / "testing"
        if candidate.exists():
            testing_root = candidate
            break

print(f"Testing root: {testing_root}")
assert testing_root.exists(), "Cannot locate 'infomation/testing' directory. Please adjust the path."

# Discover model directories (directories containing a 'results' subfolder)
model_dirs = []
for entry in testing_root.iterdir():
    if entry.is_dir() and (entry / "results").is_dir():
        model_dirs.append(entry)
print("Found model folders:", [entry.name for entry in model_dirs])

# One summary entry per model folder that produced at least one record
summary: List[Dict[str, Any]] = []

for model_dir in model_dirs:
    model_name = model_dir.name
    results_dir = model_dir / "results"
    json_files = sorted(results_dir.glob("*.json"))

    # One record per readable JSON file; unreadable files are reported and skipped
    records: List[Dict[str, Any]] = []
    for jf in json_files:
        try:
            with open(jf, "r", encoding="utf-8") as f:
                data = json.load(f)
        except Exception as e:
            print(f"Failed to parse {jf}: {e}")
            continue
        # Dict payloads are copied as-is; anything else is wrapped under a 'raw' key
        rec = dict(data) if isinstance(data, dict) else {"raw": data}
        rec["_source_file"] = str(jf)
        records.append(rec)

    if len(records) == 0:
        print(f"No JSON files found for model '{model_name}'. Skipping.")
        continue

    # Flatten nested objects into dotted column names (e.g. 'meta.model')
    df = pd.json_normalize(records, sep=".")

    csv_path = model_dir / "all_results.csv"
    parquet_path = model_dir / "all_results.parquet"

    # UTF-8 with BOM so that Excel displays Thai text correctly
    df.to_csv(csv_path, index=False, encoding="utf-8-sig")

    # Parquet needs pyarrow or fastparquet; a failure is reported but does not stop the run
    parquet_ok = True
    try:
        df.to_parquet(parquet_path, index=False)
    except Exception as e:
        print(f"Parquet write failed for '{model_name}': {e}")
        parquet_ok = False

    summary.append(
        dict(
            model=model_name,
            json_files=len(json_files),
            rows=len(df),
            csv=str(csv_path),
            parquet=str(parquet_path) if parquet_ok else None,
        )
    )

# Display summary
pd.DataFrame(summary)

Testing root: /Users/tk17250/Documents/Work/UpMath/UpMathLLM/infomation/testing
Found model folders: ['thai-homeworkgen-v4', 'gemini-2.5-pro', 'chatgpt-4o-mini']


Unnamed: 0,model,json_files,rows,csv,parquet
0,thai-homeworkgen-v4,156,156,/Users/tk17250/Documents/Work/UpMath/UpMathLLM...,/Users/tk17250/Documents/Work/UpMath/UpMathLLM...
1,gemini-2.5-pro,189,189,/Users/tk17250/Documents/Work/UpMath/UpMathLLM...,/Users/tk17250/Documents/Work/UpMath/UpMathLLM...
2,chatgpt-4o-mini,189,189,/Users/tk17250/Documents/Work/UpMath/UpMathLLM...,/Users/tk17250/Documents/Work/UpMath/UpMathLLM...


### Evaluate

In [26]:
import os
import json
import math
import random
import time
from pathlib import Path
from typing import Dict, Any, Optional, List
from concurrent.futures import ThreadPoolExecutor, as_completed
import csv
import pandas as pd
import requests
from joblib import Parallel, delayed
from tqdm.auto import tqdm

In [None]:
# Configuration
# The API key is read from the environment so that no credential is stored in the notebook;
# the assertion below stops the run early when it is missing.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "").strip()
EVAL_ENDPOINT = "https://openrouter.ai/api/v1/chat/completions"
EVAL_MODEL = "openai/gpt-4.1"  # OpenRouter slug
MODEL_EVALUATE_FIELD = "gpt-4.1"  # Value to record in output
EVAL_CONCURRENCY = int(os.getenv("EVAL_CONCURRENCY", "4"))  # parallel requests
MAX_RETRIES = 2  # extra attempts after the first request
TIMEOUT = 120  # seconds per request

assert OPENROUTER_API_KEY, "Environment variable OPENROUTER_API_KEY is not set."

In [28]:
# HTTP headers sent with every evaluation request
headers = {
    "Authorization": f"Bearer {OPENROUTER_API_KEY}",
    "Content-Type": "application/json",
    # Optional but recommended for OpenRouter policy
    "HTTP-Referer": os.environ.get("OPENROUTER_HTTP_REFERER", "http://localhost"),
    "X-Title": os.environ.get("OPENROUTER_TITLE", "UpMathLLM Evaluation"),
}

# Locate testing root containing model folders and all_results.* files:
# first under the working directory, then in up to five parent directories
testing_root = Path(".").resolve() / "infomation" / "testing"
if not testing_root.exists():
    for ancestor in list(Path.cwd().parents)[:5]:
        candidate = ancestor / "infomation" / "testing"
        if candidate.exists():
            testing_root = candidate
            break

print(f"Testing root: {testing_root}")
assert testing_root.exists(), "Cannot locate 'infomation/testing' directory. Please adjust the path."

Testing root: /Users/tk17250/Documents/Work/UpMath/UpMathLLM/infomation/testing


In [29]:
# Discover model folders: directories that hold an aggregated all_results file (CSV or Parquet)
model_dirs = [
    p
    for p in testing_root.iterdir()
    if p.is_dir()
    and any((p / fname).exists() for fname in ("all_results.csv", "all_results.parquet"))
]
print("Models to evaluate:", [p.name for p in model_dirs])

Models to evaluate: ['thai-homeworkgen-v4', 'gemini-2.5-pro', 'chatgpt-4o-mini']


In [30]:
# (output column name, human-readable label) for each rubric criterion
rubric_keys = [
    (f"score_{field}", f"{label} (0-10)")
    for field, label in (
        ("katex_formatting", "KaTeX Formatting"),
        ("mathematical_correctness", "Mathematical Correctness"),
        ("explanation_clarity", "Explanation Clarity"),
        ("difficulty_alignment", "Difficulty Alignment"),
        ("blooms_taxonomy", "Bloom's Taxonomy"),
        ("thai_grammar_style", "Grammar & Style"),
        ("thai_math_terms", "Mathematical Thai Terms"),
    )
]

# System message for the evaluator model: one sentence per entry, joined by single spaces
system_prompt = " ".join(
    [
        "You are a meticulous Thai math education evaluator.",
        "Score the provided question+explanation content across specific criteria.",
        "Return strict JSON only.",
        "Use Thai language in think/explanations.",
        "Each score is an integer 0-10.",
        "If a criterion does not apply, still provide a best-effort score.",
    ]
)

# Rubric text and the expected reply schema: one entry per output line, no trailing newline
rubric_instructions = "\n".join(
    [
        "Rubric:",
        "1) Technical Accuracy",
        "- KaTeX Formatting (0-10): KaTeX/LaTeX syntax validity, renderability.",
        "- Mathematical Correctness (0-10): Correct results and steps.",
        "2) Content Quality",
        "- Explanation Clarity (0-10): Clear, step-by-step explanation.",
        "- Difficulty Alignment (0-10): Difficulty matches the described level.",
        "- Bloom's Taxonomy (0-10): Correct mapping of skills to Bloom's levels.",
        "3) Thai Language",
        "- Grammar & Style (0-10): Thai grammar and style quality.",
        "- Mathematical Thai Terms (0-10): Correct Thai math terminology.",
        "",
        "Output JSON schema:",
        "{",
        '  "scores": {',
        '    "katex_formatting": <int 0-10>,',
        '    "mathematical_correctness": <int 0-10>,',
        '    "explanation_clarity": <int 0-10>,',
        '    "difficulty_alignment": <int 0-10>,',
        '    "blooms_taxonomy": <int 0-10>,',
        '    "thai_grammar_style": <int 0-10>,',
        '    "thai_math_terms": <int 0-10>',
        "  },",
        '  "average_score": <float>,',
        '  "think": <string in Thai summarizing reasoning>',
        "}",
    ]
)

In [31]:
def clamp_score(x: Any) -> Optional[int]:
    """Convert a raw score to an integer limited to the range 0-10.

    The value is parsed as a float, rounded with the built-in ``round`` and
    then clipped. Anything that cannot be converted (``None``, non-numeric
    text, NaN, infinity) yields ``None``.
    """
    try:
        rounded = int(round(float(x)))
    except Exception:
        return None
    if rounded < 0:
        return 0
    if rounded > 10:
        return 10
    return rounded


def compute_average(scores: Dict[str, Any]) -> Optional[float]:
    """Return the mean of the rubric scores present in ``scores``.

    Only the seven rubric keys are considered. Entries that are missing,
    ``None`` or not convertible to float are left out of the mean. Returns
    ``None`` when no usable score remains.
    """
    criteria = (
        "katex_formatting",
        "mathematical_correctness",
        "explanation_clarity",
        "difficulty_alignment",
        "blooms_taxonomy",
        "thai_grammar_style",
        "thai_math_terms",
    )
    numeric: List[float] = []
    for name in criteria:
        raw = scores.get(name)
        if raw is not None:
            try:
                numeric.append(float(raw))
            except Exception:
                continue
    return sum(numeric) / len(numeric) if numeric else None


def _parse_eval_json(raw: Any) -> Any:
    """Decode the evaluator's message content as JSON.

    Tries, in order: the text as-is; the text with surrounding whitespace and
    backticks removed; the span from the first ``{`` to the last ``}``. The
    last step handles replies wrapped in a fenced block with a language tag
    (```` ```json ... ``` ````), where stripping backticks alone leaves the tag.

    Raises:
        ValueError: if none of the attempts yields valid JSON.
    """
    try:
        return json.loads(raw)
    except (TypeError, ValueError):
        pass
    text = str(raw).strip().strip("`").strip()
    try:
        return json.loads(text)
    except ValueError:
        pass
    start, end = text.find("{"), text.rfind("}")
    if start == -1 or end <= start:
        raise ValueError(f"Evaluator reply is not JSON: {text[:300]}")
    return json.loads(text[start:end + 1])


def call_openrouter_eval(content: str, meta: Dict[str, Any]) -> Dict[str, Any]:
    """Ask the evaluator model to score one piece of content and return its parsed JSON reply.

    Args:
        content: The question/explanation text to evaluate.
        meta: Context for the evaluator; only the keys ``topic``, ``grade``,
            ``qtype``, ``level`` and ``bloom`` are forwarded.

    Returns:
        The JSON value decoded from the first choice's message content.

    Raises:
        RuntimeError: when no attempt produced a parseable reply. The message
            is the last error seen (HTTP status and body excerpt, or the
            exception text).
    """
    user_context = {key: meta.get(key) for key in ("topic", "grade", "qtype", "level", "bloom")}
    # default=str keeps the request buildable when a context value is not a plain JSON type
    context_json = json.dumps(user_context, ensure_ascii=False, default=str)
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"{rubric_instructions}\n\nContent to evaluate (Thai):\n{content}\n\nContext: {context_json}"},
    ]
    payload = {
        "model": EVAL_MODEL,
        "messages": messages,
        "temperature": 0,
        "response_format": {"type": "json_object"},
    }

    last_err = None
    for attempt in range(MAX_RETRIES + 1):
        try:
            # small jitter to avoid bursts
            time.sleep(random.uniform(0.05, 0.2))
            resp = requests.post(EVAL_ENDPOINT, headers=headers, data=json.dumps(payload), timeout=TIMEOUT)
            if resp.status_code == 200:
                data = resp.json()
                choices = data.get("choices") if isinstance(data, dict) else None
                # A 200 body without choices must not be mistaken for an empty, successful evaluation
                if not choices:
                    raise ValueError(f"No choices in response: {str(data)[:300]}")
                message = choices[0].get("message") or {}
                reply = message.get("content")
                if not reply:
                    raise ValueError("Empty message content in response")
                return _parse_eval_json(reply)
            last_err = f"HTTP {resp.status_code}: {resp.text[:300]}"
            # Client errors other than timeout / rate limit will not succeed on retry
            if 400 <= resp.status_code < 500 and resp.status_code not in (408, 429):
                break
        except Exception as e:
            last_err = str(e)
        # backoff, but only when another attempt follows
        if attempt < MAX_RETRIES:
            time.sleep(0.5 * (attempt + 1))
    raise RuntimeError(last_err or "Unknown evaluation error")


def _is_missing(value: Any) -> bool:
    """Return True for ``None`` and for a float NaN (e.g. an empty cell loaded from CSV)."""
    return value is None or (isinstance(value, float) and math.isnan(value))


def evaluate_row(row: pd.Series, fallback_model_name: str) -> Dict[str, Any]:
    """Evaluate one aggregated result row and return a flat record for the output CSV.

    Args:
        row: One row of the aggregated results. Read keys: ``id``, ``content``,
            ``meta.model``, ``topic``, ``grade``, ``qtype``, ``level``, ``bloom``.
        fallback_model_name: Model name used when ``meta.model`` is empty. The
            folder name ``thai-homeworkgen-v4`` (any case) is reported as
            ``Thai-HomeworkGen-v4``.

    Returns:
        On success: ``id``, ``model``, the seven ``score_*`` values (int 0-10 or
        None), ``average_score``, ``think`` and ``model_evaluate``.
        When the evaluator call raises: ``id``, ``model``, ``error`` and
        ``model_evaluate`` only.
    """
    item_id = row.get("id")
    raw_content = row.get("content")
    # A NaN cell is truthy, so `nan or ""` would send the literal text "nan" for evaluation
    content = "" if _is_missing(raw_content) else str(raw_content or "")

    # Resolve model name: explicit meta.model wins, then the special-cased folder name
    meta_model = row.get("meta.model")
    if isinstance(meta_model, str) and meta_model.strip():
        model_name = meta_model
    elif fallback_model_name.lower() == "thai-homeworkgen-v4":
        model_name = "Thai-HomeworkGen-v4"
    else:
        model_name = fallback_model_name

    # Missing cells are passed as None so the context shows null instead of NaN
    meta = {}
    for key in ("topic", "grade", "qtype", "level", "bloom"):
        value = row.get(key)
        meta[key] = None if _is_missing(value) else value

    try:
        resp = call_openrouter_eval(content, meta)
    except Exception as e:
        return {
            "id": item_id,
            "model": model_name,
            "error": str(e),
            "model_evaluate": MODEL_EVALUATE_FIELD,
        }

    # The evaluator may return something other than the requested object; treat it as empty
    if not isinstance(resp, dict):
        resp = {}
    scores = resp.get("scores")
    if not isinstance(scores, dict):
        scores = {}

    criteria = (
        "katex_formatting",
        "mathematical_correctness",
        "explanation_clarity",
        "difficulty_alignment",
        "blooms_taxonomy",
        "thai_grammar_style",
        "thai_math_terms",
    )
    clamped = {name: clamp_score(scores.get(name)) for name in criteria}

    # Keep the evaluator's own average only when it is a finite number
    try:
        average = float(resp.get("average_score"))
    except (TypeError, ValueError):
        average = None
    if average is not None and not math.isfinite(average):
        average = None
    if average is None:
        average = compute_average(clamped)

    out = {"id": item_id, "model": model_name}
    for name in criteria:
        out[f"score_{name}"] = clamped[name]
    out["average_score"] = average
    out["think"] = resp.get("think")
    out["model_evaluate"] = MODEL_EVALUATE_FIELD
    return out

In [32]:
# Per-model evaluation frames collected by the evaluation loop for the combined summary
all_results: List[pd.DataFrame] = list()

#### All models

In [33]:
# Evaluate every model folder, appending results to a per-model CSV so the run can be resumed.
for model_dir in model_dirs:
    model_folder = model_dir.name
    print(f"\nEvaluating model folder: {model_folder}")

    # Load aggregated results, preferring the CSV over the Parquet copy
    csv_path = model_dir / "all_results.csv"
    pq_path = model_dir / "all_results.parquet"
    if csv_path.exists():
        df = pd.read_csv(csv_path)
    elif pq_path.exists():
        df = pd.read_parquet(pq_path)
    else:
        print(f"No all_results.* found in {model_dir}, skipping.")
        continue

    if df.empty:
        print("No rows to evaluate.")
        continue

    # Output paths
    out_csv = model_dir / f"evaluation_{MODEL_EVALUATE_FIELD}.csv"
    out_parquet = model_dir / f"evaluation_{MODEL_EVALUATE_FIELD}.parquet"

    # Resumability: skip rows already evaluated.
    # An existing but empty file has no header yet, so it is treated like a new file.
    # NOTE(review): rows recorded with an 'error' also count as processed and are not retried.
    processed_ids = set()
    header_exists = out_csv.exists() and out_csv.stat().st_size > 0
    if header_exists:
        try:
            prev = pd.read_csv(out_csv, usecols=["id"], dtype=str)
            processed_ids = set(prev["id"].astype(str).tolist())
        except Exception as e:
            # Legacy or unreadable file: evaluate everything again
            print(f"Could not read processed ids from {out_csv}: {e}")
            processed_ids = set()
    print(f"Already processed: {len(processed_ids)} rows")

    rows_to_eval = [row for _, row in df.iterrows() if str(row.get("id")) not in processed_ids]
    if not rows_to_eval:
        print("Nothing new to evaluate. Skipping to Parquet sync...")
        try:
            cur = pd.read_csv(out_csv)
        except Exception as e:
            print(f"Parquet sync failed for {model_folder}: {e}")
            continue
        # Ensure parquet syncs with current CSV
        try:
            cur.to_parquet(out_parquet, index=False)
        except Exception as e:
            print(f"Parquet sync failed for {model_folder}: {e}")
        # Keep fully processed models in the combined summary of a resumed run
        all_results.append(cur.assign(_model_folder=model_folder))
        continue

    print(f"To evaluate now: {len(rows_to_eval)} rows")

    # Column order of the output CSV
    field_order = [
        "id",
        "model",
        "score_katex_formatting",
        "score_mathematical_correctness",
        "score_explanation_clarity",
        "score_difficulty_alignment",
        "score_blooms_taxonomy",
        "score_thai_grammar_style",
        "score_thai_math_terms",
        "average_score",
        "think",
        "model_evaluate",
        "error",
    ]

    # Append results as they complete
    writes = 0
    with ThreadPoolExecutor(max_workers=EVAL_CONCURRENCY) as ex:
        # Remember which row id each future belongs to, so a crashed task can still be reported
        future_to_id = {ex.submit(evaluate_row, row, model_folder): row.get("id") for row in rows_to_eval}
        # Open file once for append; the header is written only when the file is new or empty
        with open(out_csv, "a", encoding="utf-8-sig", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=field_order)
            if not header_exists:
                writer.writeheader()
                header_exists = True
            for fut in tqdm(as_completed(future_to_id), total=len(future_to_id), desc=f"Writing {model_folder}"):
                try:
                    res = fut.result()
                except Exception as e:
                    # One failing task must not abort the whole batch; record it like other errors
                    res = {
                        "id": future_to_id[fut],
                        "model": model_folder,
                        "error": f"{type(e).__name__}: {e}",
                        "model_evaluate": MODEL_EVALUATE_FIELD,
                    }
                writer.writerow({k: res.get(k) for k in field_order})
                # Flush per row so an interrupted run keeps everything evaluated so far
                f.flush()
                writes += 1

    print(f"Appended {writes} rows -> {out_csv}")

    # Reload the full CSV; without it there is nothing valid to sync or summarise for this model
    try:
        eval_df = pd.read_csv(out_csv)
    except Exception as e:
        print(f"Could not reload {out_csv}: {e}")
        continue

    # Refresh Parquet from the full CSV after this batch
    try:
        eval_df.to_parquet(out_parquet, index=False)
    except Exception as e:
        print(f"Parquet write failed for {model_folder}: {e}")

    # Track in-memory summary for combined
    all_results.append(eval_df.assign(_model_folder=model_folder))

    print(f"Saved/updated: {out_csv} (total {len(eval_df)} rows)")


Evaluating model folder: thai-homeworkgen-v4
Already processed: 0 rows
To evaluate now: 156 rows


Writing thai-homeworkgen-v4: 100%|██████████| 156/156 [03:01<00:00,  1.16s/it]



Appended 156 rows -> /Users/tk17250/Documents/Work/UpMath/UpMathLLM/infomation/testing/thai-homeworkgen-v4/evaluation_gpt-4.1.csv
Saved/updated: /Users/tk17250/Documents/Work/UpMath/UpMathLLM/infomation/testing/thai-homeworkgen-v4/evaluation_gpt-4.1.csv (total 156 rows)

Evaluating model folder: gemini-2.5-pro
Already processed: 0 rows
To evaluate now: 189 rows


Writing gemini-2.5-pro: 100%|██████████| 189/189 [03:01<00:00,  1.04it/s]



Appended 189 rows -> /Users/tk17250/Documents/Work/UpMath/UpMathLLM/infomation/testing/gemini-2.5-pro/evaluation_gpt-4.1.csv
Saved/updated: /Users/tk17250/Documents/Work/UpMath/UpMathLLM/infomation/testing/gemini-2.5-pro/evaluation_gpt-4.1.csv (total 189 rows)

Evaluating model folder: chatgpt-4o-mini
Already processed: 0 rows
To evaluate now: 189 rows


Writing chatgpt-4o-mini: 100%|██████████| 189/189 [03:12<00:00,  1.02s/it]

Appended 189 rows -> /Users/tk17250/Documents/Work/UpMath/UpMathLLM/infomation/testing/chatgpt-4o-mini/evaluation_gpt-4.1.csv
Saved/updated: /Users/tk17250/Documents/Work/UpMath/UpMathLLM/infomation/testing/chatgpt-4o-mini/evaluation_gpt-4.1.csv (total 189 rows)





In [35]:
# Optional combined summary across all models, written to the current working directory
if all_results:
    combined = pd.concat(all_results, ignore_index=True)

    out_dir = Path(".").resolve()
    base_name = f"evaluation_{MODEL_EVALUATE_FIELD}_all_models"
    combined_csv = out_dir / f"{base_name}.csv"
    combined_parquet = out_dir / f"{base_name}.parquet"

    # UTF-8 with BOM, matching the per-model CSV files
    combined.to_csv(combined_csv, index=False, encoding="utf-8-sig")
    # A Parquet failure is reported but does not invalidate the CSV
    try:
        combined.to_parquet(combined_parquet, index=False)
    except Exception as e:
        print(f"Combined Parquet write failed: {e}")
    print(f"\nCombined saved: {combined_csv} ({len(combined)} rows)")


Combined saved: /Users/tk17250/Documents/Work/UpMath/UpMathLLM/src/upmathllm/evaluation_gpt-4.1_all_models.csv (534 rows)


In [None]:
# Show the first ten combined rows, or an empty frame when nothing was evaluated
pd.DataFrame() if not all_results else combined.head(10)

#### Test once (one-shot evaluation)

ใช้เซลล์ด้านล่างเพื่อทดสอบประเมิน 1 แถวแบบโต้ตอบ โดยไม่เขียนไฟล์ออก ปรับ index หรือกรองข้อมูลตามต้องการก่อนเรียก `evaluate_row`.

In [25]:
# Configure test target
TEST_MODEL_FOLDER = os.getenv("TEST_MODEL_FOLDER", "thai-homeworkgen-v4")  # change to any model folder name
TEST_ROW_INDEX = int(os.getenv("TEST_ROW_INDEX", "0"))  # pick row index to evaluate

# Load that model's aggregated data
test_model_dir = next((p for p in model_dirs if p.name == TEST_MODEL_FOLDER), None)
assert test_model_dir is not None, f"Model folder '{TEST_MODEL_FOLDER}' not found under {testing_root}"

csv_path = test_model_dir / "all_results.csv"
pq_path = test_model_dir / "all_results.parquet"
if csv_path.exists():
    df_test = pd.read_csv(csv_path)
elif pq_path.exists():
    df_test = pd.read_parquet(pq_path)
else:
    raise FileNotFoundError(f"No all_results.* found in {test_model_dir}")

assert len(df_test) > 0, "No rows in aggregated data"
# Fail with a readable message instead of a bare IndexError (negative indices count from the end)
assert -len(df_test) <= TEST_ROW_INDEX < len(df_test), (
    f"TEST_ROW_INDEX={TEST_ROW_INDEX} is out of range for {len(df_test)} rows"
)

# Evaluate the selected row without writing any file; the result is displayed as the cell output
row = df_test.iloc[TEST_ROW_INDEX]
res = evaluate_row(row, test_model_dir.name)
res

{'id': 'sync-00966335-5305-46ac-9644-9c8846fdcd1b-e2',
 'model': 'Thai-HomeworkGen-v4',
 'score_katex_formatting': 10,
 'score_mathematical_correctness': 10,
 'score_explanation_clarity': 9,
 'score_difficulty_alignment': 10,
 'score_blooms_taxonomy': 8,
 'score_thai_grammar_style': 9,
 'score_thai_math_terms': 10,
 'average_score': 9.43,
 'think': "คำถามและคำอธิบายมีการใช้ KaTeX ได้ถูกต้องสมบูรณ์ ไม่มีข้อผิดพลาดทางคณิตศาสตร์ อธิบายขั้นตอนอย่างชัดเจนและเป็นลำดับ เหมาะสมกับระดับ 'ง่าย' ของ ม.5 และสอดคล้องกับระดับ Bloom's 'วิเคราะห์' แม้จะเน้นการแยกตัวประกอบและตัดทอนซึ่งอาจอยู่ระหว่าง 'เข้าใจ' กับ 'วิเคราะห์' การใช้ภาษาไทยถูกต้องและใช้คำศัพท์คณิตศาสตร์ได้เหมาะสม มีจุดเล็กน้อยที่อาจเพิ่มความชัดเจนในคำอธิบาย เช่น การเน้นเหตุผลที่ $x \\neq 3$ แต่โดยรวมถือว่าดีมาก",
 'model_evaluate': 'gpt-4.1'}