In [None]:
import os
import glob
import json
import math
import random
import re
from collections import Counter
import networkx as nx
import pdfplumber
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()

# Set the OpenAI API key from environment variable
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

# -- Document Loading ------------------------------------------------------

def load_docs_from_dir(dir_path):
    """
    Load all .pdf files from a directory and extract their full text.
    Returns a list of strings, one per PDF.
    """
    docs = []
    filenames = []
    for filepath in glob.glob(os.path.join(dir_path, '*.pdf')):
        try:
            text_pages = []
            with pdfplumber.open(filepath) as pdf:
                for page in pdf.pages:
                    text_pages.append(page.extract_text() or "")
            full_text = "\n".join(text_pages)
            docs.append(full_text)
            filenames.append(os.path.basename(filepath))
        except Exception as e:
            # skip unreadable PDFs
            print(f"Warning: could not load {filepath}: {e}")
    return docs, filenames

# -- 1. Extract topics and key concepts -----------------------------------

def extract_json_from_text(text):
    """Helper function to extract JSON content from markdown code blocks or raw text"""
    # Try to extract JSON from markdown code blocks
    json_match = re.search(r'```(?:json)?\s*(.*?)\s*```', text, re.DOTALL)
    if json_match:
        return json_match.group(1).strip()
    
    # If no code blocks, try to find JSON-like structures
    json_match = re.search(r'(\{.*\})', text, re.DOTALL)
    if json_match:
        return json_match.group(1).strip()
    
    # Return original text if no JSON structure found
    return text

def extract_concepts_from_docs(doc_texts, filenames, model="gpt-4.1-mini"):  
    """
    Call OpenAI API to extract topics and key concepts from each document.
    Returns list of dicts with keys 'topics' and 'key_concepts'.
    """
    extractions = []
    for i, text in enumerate(doc_texts):
        filename = filenames[i] if i < len(filenames) else f"doc_{i}"
        
        prompt = (
            "Extract high-level topics and key concepts from the following document. topics should be broad , "
            "while key concepts should be specific ideas and concepts related to those topics.\n\n"
            "Topics should be general and cover the main areas of the document, while key concepts should be more specific "
            "and include important terms, definitions, or theories.\n\n"
            f"Return JSON with keys 'topics' and 'key_concepts'.\n\n{text}"
        )
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are an expert Maths summarizer. Return your response as a JSON object with 'topics' and 'key_concepts' as arrays."},
                {"role": "user", "content": prompt}
            ],
        )
        
        output = response.choices[0].message.content
        print("Raw output:", output)
        
        try:
            # Clean and extract JSON from the output
            json_str = extract_json_from_text(output)
            data = json.loads(json_str)
            
            # Ensure we have the expected keys
            if "topics" in data and "key_concepts" in data:
                extraction = {
                    "filename": filename,
                    "topics": data["topics"],
                    "key_concepts": data["key_concepts"]
                }
                extractions.append(extraction)
                print(f"Successfully extracted {len(data['topics'])} topics and {len(data['key_concepts'])} key concepts from {filename}")
            else:
                print(f"Warning: Parsed JSON doesn't have expected keys: {data.keys()}")
                extractions.append({
                    "filename": filename,
                    "topics": [], 
                    "key_concepts": []
                })
                
        except json.JSONDecodeError as e:
            print(f"Failed to parse JSON: {e}")
            extractions.append({
                "filename": filename,
                "topics": [], 
                "key_concepts": []
            })
    
    return extractions

# -- 2. Construct the concept graph ----------------------------------------

def build_concept_graph(extractions, eps=1e-6):
    """
    Build a unified graph G where nodes are topics + key concepts and
    edges weighted by log(freq+eps) based on co-occurrence in docs.
    Returns: G (nx.Graph), topic_nodes, kc_nodes
    """
    freq = Counter()
    all_topics, all_kcs = set(), set()

    for ex in extractions:
        nodes = ex["topics"] + ex["key_concepts"]
        for i in range(len(nodes)):
            for j in range(i+1, len(nodes)):
                u, v = sorted((nodes[i], nodes[j]))
                freq[(u, v)] += 1
        all_topics.update(ex["topics"])
        all_kcs.update(ex["key_concepts"])

    G = nx.Graph()
    for (u, v), f in freq.items():
        weight = math.log(f + eps)
        G.add_edge(u, v, weight=weight)

    print(f"Graph built with {len(all_topics)} topics and {len(all_kcs)} key concepts")
    return G, all_topics, all_kcs

# -- Helpers for sampling --------------------------------------------------

def softmax(weights):
    exps = [math.exp(w) for w in weights]
    s = sum(exps) or 1.0
    return [e/s for e in exps]


def random_walk(G, start, steps):
    """
    Random walk on graph G for given steps from 'start',
    with transition probabilities via softmax over edge weights.
    """
    path = [start]
    current = start
    for _ in range(steps):
        nbrs = list(G[current])
        if not nbrs:
            break
        weights = [G[current][n]['weight'] for n in nbrs]
        probs = softmax(weights)
        current = random.choices(nbrs, probs)[0]
        path.append(current)
    return path

# -- 3. Concept combination sampling ---------------------------------------

def sample_concept_combinations(
    G, topic_nodes, kc_nodes,
    num_samples=100,
    topic_walk_steps=(1, 2),
    kc_walk_steps=(3, 4)
):
    """
    Generate sampled sets of topics and key concepts via multi-stage random walks.
    Returns list of dicts: {'topics': set, 'key_concepts': set}
    """
    # Safety check
    if not topic_nodes:
        print("Error: No topics found. Cannot sample combinations.")
        return []
        
    G_topic = G.subgraph(topic_nodes)
    G_topic_kc = G.subgraph(topic_nodes | kc_nodes)
    G_kc = G.subgraph(kc_nodes)
    samples = []

    topics_list = list(topic_nodes)
    print(f"Sampling from {len(topics_list)} topics")
    
    for _ in range(num_samples):
        t0 = random.choice(topics_list)
        t_steps = random.choice(topic_walk_steps)
        topic_path = random_walk(G_topic, t0, t_steps)
        sampled_topics = set(topic_path)

        kc_cands = [nbr for t in sampled_topics for nbr in G_topic_kc[t] if nbr in kc_nodes]
        if kc_cands:
            k0 = random.choice(kc_cands)
            k_steps = random.choice(kc_walk_steps)
            kc_path = random_walk(G_kc, k0, k_steps)
            sampled_kcs = set(kc_path)
        else:
            sampled_kcs = set()

        samples.append({
            "topics": list(sampled_topics),  # Convert sets to lists for JSON serialization
            "key_concepts": list(sampled_kcs)
        })
    
    return samples

# -- 4. Question generation ------------------------------------------------

# def generate_questions_for_samples(combos, docs, extractions, model="gpt-4o"):  
#     """
#     For each sampled combo, pick two docs via Jaccard on concept sets,
#     then call LLM to generate questions.
#     Returns list of dicts: {'sample': combo, 'questions': [...]}.
#     """
#     doc_concepts = [set(ex['topics'] + ex['key_concepts']) for ex in extractions]
#     # doc_concepts= doc_concepts[:1]
#     results = []
#     max_samples = 100
#     # combos = combos[:max_samples]
#     for i, combo in enumerate(combos):
#         combo_id = f"combo_{i+1}"
#         kg = set(combo['topics']) | set(combo['key_concepts'])
#         sims = []
#         for idx, dc in enumerate(doc_concepts):
#             inter = kg & dc
#             union = kg | dc
#             sims.append((len(inter) / (len(union) or 1), idx))
#         sims.sort(reverse=True)
#         top_idxs = [i for _, i in sims[:2]]
#         refs = [docs[i] for i in top_idxs]
#         ref_files = [extractions[i]["filename"] for i in top_idxs]
#         System_prompt =f"""
# Each question must follow these instructions:
# Model a Mathematics Scenario: Start from a real-world or idealized setup. Avoid abstract Physics problems or purely conceptual statements.
# Target a Solvable Quantity: Ask for a clear symbolic expression of a physical variable (e.g., tension, acceleration, energy).
# Force Multi-Step Reasoning: Ensure the question involves a sequence of physics laws, transformations, and derivations to reach the answer.
# Avoid Redundancy: Exclude extraneous details or variables that do not impact the final solution.
# Be Unique: Do not rephrase standard textbook problems; ensure originality and complexity.
# Single solution: Expect a single symbolic expression, unambiguous, presented in LaTeX. Multiple equivalent algebraic forms are allowed. No equations or floating-point approximations.
# Use rigorous, concise phrasing.
# Avoid colloquial or ambiguous terminology.
# Units must be consistent; symbols should follow standard notation.
# """
#         prompt = (
#             f"Generate a set of difficult Physics questions based on the following:\n"
            
#             f"Topics: {combo['topics']}\n"
#             f"Key Concepts: {combo['key_concepts']}\n"
#             f"Reference Doc 1:\n{refs[0]}\n"
#         )
#         if len(refs) > 1:
#             prompt += f"Reference Doc 2:\n{refs[1]}\n"
#         prompt += "Return a JSON array of questions."

#         # from ollama import chat
#         # from ollama import ChatResponse

#         # response: ChatResponse = chat(model='qwen3:8b', 
#         #                                messages=[
#         #         {"role": "system", "content": System_prompt},
#         #         {"role": "user", "content": prompt}
#         #     ])
#         # # print(response['message']['content'])
#         # # or access fields directly from the response object
#         # print(response.message.content)
#         # content = response.message.content
#         response = client.chat.completions.create(
#             model=model,
#             messages=[
#                 {"role": "system", "content": System_prompt},
#                 {"role": "user", "content": prompt}
#             ]
#         )
#         content = response.choices[0].message.content
        
#         try:
#             # Clean and extract JSON from the output
#             json_str = extract_json_from_text(content)
#             questions = json.loads(json_str)
#             if not isinstance(questions, list):
#                 # If the output is an object with a questions key
#                 if isinstance(questions, dict) and "questions" in questions:
#                     questions = questions["questions"]
#                 else:
#                     questions = [str(questions)]
#         except json.JSONDecodeError:
#             questions = [content]

#         results.append({
#             "id": combo_id,
#             "topics": combo['topics'],
#             "key_concepts": combo['key_concepts'],
#             "reference_files": ref_files,
#             "questions": questions
#         })
#         max_samples-=1
#         if max_samples == 0:
#             break


    # return results

# -- Save outputs to files ------------------------------------------------

def save_extractions(extractions, output_file="document_extractions.json"):
    """Save the extracted topics and key concepts for each document"""
    # Ensure the extractions are serializable (convert sets to lists)
    serializable_extractions = []
    for ex in extractions:
        serializable_extractions.append({
            "filename": ex["filename"],
            "topics": list(ex["topics"]),
            "key_concepts": list(ex["key_concepts"])
        })
    
    with open(output_file, "w") as f:
        json.dump(serializable_extractions, f, indent=2)
    
    print(f"Saved document extractions to {output_file}")

def save_questions_with_topics(questions, output_file="questions_with_topics.json"):
    """Save the generated questions with their topic combinations"""
    with open(output_file, "w") as f:
        json.dump(questions, f, indent=2)
    
    print(f"Saved questions with topic combinations to {output_file}")

# -- Main Execution --------------------------------------------------------

if __name__ == "__main__":
    # Create output directory if it doesn't exist
    output_dir = "output_maths"
    os.makedirs(output_dir, exist_ok=True)
    
    # adjust this path to where your .pdf docs live
    docs_dir = "easy_data/"
    
    print("Loading documents...")
    docs, filenames = load_docs_from_dir(docs_dir)
    print(f"Loaded {len(docs)} documents")
    
    if not docs:
        print("No documents found. Please check the docs directory.")
        exit(1)
    
    # 1) Extract topics & KCs
    print("\nExtracting topics and key concepts...")
    extractions = extract_concepts_from_docs(docs, filenames)
    
    # Save extractions to file
    save_extractions(extractions, os.path.join(output_dir, "document_extractions_maths.json"))
    
    # Verify we have valid extractions
    valid_extractions = [ex for ex in extractions if ex["topics"] or ex["key_concepts"]]
    if not valid_extractions:
        print("No valid topics or key concepts extracted. Check your data and API responses.")
        exit(1)
    
    # 2) Build graph
    print("\nBuilding concept graph...")
    G, topic_nodes, kc_nodes = build_concept_graph(extractions)
    
    if not topic_nodes:
        print("No topics found in the graph. Cannot proceed.")
        exit(1)
    
    # 3) Sample combinations
    print("\nSampling concept combinations...")
    combos = sample_concept_combinations(G, topic_nodes, kc_nodes, num_samples=10)  # Reduced for testing
    
    if not combos:
        print("Failed to generate concept combinations.")
        exit(1)
    
    # Save topic combinations
    with open(os.path.join(output_dir, "topic_combinations_maths.json"), "w") as f:
        json.dump(combos, f, indent=2)
    
    # 4) Generate questions
    # print("\nGenerating questions for each combination...")
    # q_outputs = generate_questions_for_samples(combos, docs, extractions)
    
    # # Save questions with topics
    # save_questions_with_topics(q_outputs, os.path.join(output_dir, "questions_with_topics.json"))
    
    # # Display results
    # print("\n===== GENERATED QUESTIONS =====")
    # for idx, out in enumerate(q_outputs, 1):
    #     print(f"\nSample {idx}:")
    #     print(f"Topics: {out['topics']}")
    #     print(f"Key Concepts: {out['key_concepts']}")
    #     print(f"Reference Files: {out['reference_files']}")
    #     print("Questions:")
    #     for q in out['questions']:
    #         print(f" - {q}")
    
    # print(f"\nAll outputs saved to directory: {output_dir}") 

# if __name__ == "__main__":
#     # Create output directory if it doesn't exist
#     # Create output directory if it doesn't exist
#     output_dir = "output_apple"
#     os.makedirs(output_dir, exist_ok=True)
    
#     # adjust this path to where your .pdf docs live
#     docs_dir = "docs/"
    
#     print("Loading documents...")
#     docs, filenames = load_docs_from_dir(docs_dir)
#     print(f"Loaded {len(docs)} documents")
#     document_extractions = []
#     with open("output_apple/document_extractions.json", "r") as f:
#         document_extractions = json.load(f)
#     for doc in document_extractions:
#         print(doc)
#         break

#     topic_combinations = []
#     with open("output_apple/topic_combinations.json", "r") as f:
#         topic_combinations = json.load(f)
#     for combo in topic_combinations:
#         print(combo)
#         break

#     # generate questions 
#     print("Generating questions...")
#     q_outputs = generate_questions_for_samples(topic_combinations, docs, document_extractions)

#     # Save questions with topics
#     save_questions_with_topics(q_outputs, os.path.join(output_dir, "questions_with_topics.json"))
    
#     # Display results
#     print("\n===== GENERATED QUESTIONS =====")
#     for idx, out in enumerate(q_outputs, 1):
#         print(f"\nSample {idx}:")
#         print(f"Topics: {out['topics']}")
#         print(f"Key Concepts: {out['key_concepts']}")
#         print(f"Reference Files: {out['reference_files']}")
#         print("Questions:")
#         for q in out['questions']:
#             print(f" - {q}")
    
#     print(f"\nAll outputs saved to directory: {output_dir}")

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Loading documents...


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def



CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Loaded 38 documents

Extracting topics and key concepts...
Raw output: {
  "topics": [
    "Vector Calculus Operators",
    "Gradient of a Scalar Field",
    "Divergence of a Vector Field",
    "Curl of a Vector Field",
    "Laplacian Operator",
    "Vector Operator Identities",
    "Application of Vector Operators in Physics",
    "Vector Operators in Curvilinear Coordinates"
  ],
  "key_concepts": [
    "Gradient (grad) operator: ∇U represents the direction and rate of greatest change of scalar field U",
    "Directional derivative: rate of change of scalar field in an arbitrary direction expressed as ∇U · dˆ",
    "Gradient vector is normal to surfaces of constant scalar field (level surfaces)",
    "Divergence (div) operator: ∇ · a gives scalar representing flux generation per unit volume",
    "Divergence interpretation as outward flux per unit volume from an infinitesimal volume element",
    "Curl (curl) operator: ∇ × a representing the rotation or vorticity of a vector field",


### download data from csv

In [2]:
import os
import re
import pandas as pd
import requests
from urllib.parse import urlparse

# 1. Load your CSV
df = pd.read_csv('Researchpapers_PhD_Synthetic_Data - Maths_sources .csv')  # adjust path if necessary

# 2. Extract all HTTPS links from the 'sources' column
all_links = []
pattern = re.compile(r'https://\S+')

for cell in df['Sources'].dropna():
    # findall will grab every substring starting with https:// up to the next whitespace
    links = pattern.findall(cell)
    all_links.extend(links)

# 3. Deduplicate
pdf_links = list(set(all_links))

# 4. Make sure output folder exists
output_folder = 'easy_data'
os.makedirs(output_folder, exist_ok=True)

# 5. Download each PDF
for url in pdf_links:
    try:
        resp = requests.get(url, stream=True, timeout=10)
        resp.raise_for_status()
        
        # pick filename from URL path
        path = urlparse(url).path
        filename = os.path.basename(path)
        if not filename.lower().endswith('.pdf'):
            filename += '.pdf'
        
        out_path = os.path.join(output_folder, filename)
        with open(out_path, 'wb') as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)
        
        print(f"✅ Downloaded {filename}")
    except Exception as e:
        print(f"❌ Failed {url}: {e}")


✅ Downloaded 14.pdf
❌ Failed https://drive.google.com/file/d/1Rv_vuL-ZFmrnNFOSTSru7ejpejWG64A6/view?usp=drive_link: 401 Client Error: Unauthorized for url: https://drive.google.com/file/d/1Rv_vuL-ZFmrnNFOSTSru7ejpejWG64A6/view?usp=drive_link
❌ Failed https://www.sas.com/storefront/aux/en/spmultdatared/56902_excerpt.pdf": 404 Client Error: Not Found for url: https://www.sas.com/storefront/aux/en/spmultdatared/56902_excerpt.pdf%22
✅ Downloaded field.pdf
✅ Downloaded 4_06-28-2021_11-45-05_Theory%20of%20Field%20Extensions_(20MAT22C1).pdf
✅ Downloaded 19poly.pdf
✅ Downloaded probability-theory.pdf
✅ Downloaded view.pdf
✅ Downloaded 3alg1-7.pdf
✅ Downloaded LinearAlgebra.pdf
✅ Downloaded math4120_lecture-4-01_h.pdf
✅ Downloaded VC-4.pdf
✅ Downloaded implicit_function_thm.pdf
✅ Downloaded view.pdf
✅ Downloaded 20200402172056158c70545b.pdf
✅ Downloaded HELM%20Workbook%2042%20Goodness%20of%20Fit%20and%20Contingency%20Tables.pdf
✅ Downloaded viewcontent.cgi.pdf
✅ Downloaded oer-trigonometry.pdf


## Author - Critique Model

In [5]:
import os
import re
import uuid
import json
from dotenv import load_dotenv
import os
import logging
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate, FewShotPromptTemplate
from langchain.chains import LLMChain
from langchain_google_genai import ChatGoogleGenerativeAI

# Load environment variables from .env
load_dotenv()

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

def author_critique_loop(
    topics: list[str],
    concepts: list[str],
    guidelines: str,
    few_shot_examples: list[dict],
    max_turns: int = 4
    ) -> list[dict]:
    """
    Runs an iterative author-critic loop to generate and refine Mathematics questions.

    Logs the entire multi-turn conversation (author + critic) to a single file.

    Returns a list of dicts with:
        unique_id: str,
        question: str,
        topics: list[str],
        concepts: list[str],
        file_conversation_log: str  # path to the full conversation log
    """
    # Create a unique run-level conversation log
    run_id = uuid.uuid4().hex
    os.makedirs('logs', exist_ok=True)
    conv_log_path = f"logs/{run_id}.log"
    conv_logger = open(conv_log_path, 'w')

    def log(msg: str):
        conv_logger.write(msg + "\n")
        conv_logger.flush()

    # Initialize LLMs (temperature locked to 1.0)
    author_llm = ChatOpenAI(model_name="gpt-4.1-mini", temperature=1.0)
    critic_llm = ChatOpenAI(model_name="gpt-4.1", temperature=1.0)
    # gemini_api = os.getenv("GEMINI_API_KEY")
    # critic_llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash-preview-05-20", temperature=1.0,google_api_key=gemini_api)
    # Build Few-Shot prompt for author
    example_prompt = PromptTemplate(
        input_variables=["example"],
        template="Example:\n{{example}}\n---",
        template_format="jinja2"

    )
    author_fs_prompt = FewShotPromptTemplate(
        examples=few_shot_examples,
        example_prompt=example_prompt,
        prefix=(
            "You are a Mathematics education specialist. Generate novel High School Mathematics questions. "
            "Each question must start with <question> and end with </question>. Here are examples:"
        ),
        suffix=(
            "Now, based on the topics: {{topics}}, key concepts: {{concepts}}, and guidelines: {{guidelines}}, "
            "produce list of new, high-quality Mathematics questions, each wrapped in <question>...</question> tags."
        ),
        input_variables=["topics", "concepts", "guidelines"],
        template_format="jinja2",

    )
    author_chain = LLMChain(llm=author_llm, prompt=author_fs_prompt)

    # Build Critic prompt
    critic_prompt = PromptTemplate(
        input_variables=["questions", "guidelines"],
        template=(
            "You are a Mathematics assessment expert. Critique the following questions:\n"
            "{questions}\n"
            "Evaluate them against these guidelines:\n{guidelines}\n"
            "Provide concise, actionable feedback on how to improve."
        ),
    )
    critic_chain = LLMChain(llm=critic_llm, prompt=critic_prompt)

    # Turn 0: Generation
    log("=== Turn 0: Author generates questions ===")
    questions_text = author_chain.run(
        topics=topics, concepts=concepts, guidelines=guidelines
    )
    log(questions_text)

    # Turn 0: Critic feedback
    log("=== Turn 0: Critic feedback ===")
    feedback = critic_chain.run(questions=questions_text, guidelines=guidelines)
    log(feedback)

    # Refinement turns
    for turn in range(1, max_turns):
        log(f"=== Turn {turn}: Author refines questions ===")
        refine_prompt = PromptTemplate(
            input_variables=["questions", "feedback"],
            template=(
                "Refine these questions based on the feedback:\n{questions}\n"
                "Feedback:\n{feedback}\n"
                "Return an improved numbered list of Mathematics questions, each wrapped in <question>...</question> tags."
            ),
        )
        refine_chain = LLMChain(llm=author_llm, prompt=refine_prompt)
        questions_text = refine_chain.run(questions=questions_text, feedback=feedback)
        log(questions_text)
        if turn == max_turns - 1:
            log("=== Final questions generated ===")
            break
        log(f"=== Turn {turn}: Critic feedback ===")
        feedback = critic_chain.run(questions=questions_text, guidelines=guidelines)
        log(feedback)

    # Close the conversation log
    conv_logger.close()

    # Parse final questions into structured entries
    entries: list[dict] = []
    pattern = re.compile(r'<question>(.*?)</question>', re.DOTALL)
    for match in pattern.findall(questions_text):
        question_body = match.strip()
        uid = uuid.uuid4().hex
        entries.append({
            "unique_id": uid,
            "topics": topics,
            "concepts": concepts,
            "question": question_body,
            "file_conversation_log": conv_log_path
        })

    return entries



if __name__ == "__main__":
    # Mathematics-specific example usage
    # topics = [
    #     "Thermodynamics of Chemical Reactions",
    #     "Organic Reaction Mechanisms",
    #     "Coordination Mathematics",
    #     "Electrochemical Cells"
    # ]
    # concepts = [
    #     "Use of enthalpy, entropy, and Gibbs free energy to predict reaction spontaneity.",
    #     "Mechanistic steps in nucleophilic substitution and elimination reactions.",
    #     "Formation constants and isomerism in transition metal complexes.",
    #     "Calculating cell potential using standard reduction potentials and Nernst equation."
    # ]
    with open("output_maths/topic_combinations_maths.json", "r") as f:
        topic_combinations = json.load(f)

  # Limit for testing topic combinations is last k
    # topic_combinations = topic_combinations[]  # Adjust as needed for testing
    print(f"Processing {len(topic_combinations)} topic combinations for Mathematics questions...")
    guidelines = (
    "1. Input Format: Fully text-based. No diagrams, visual aids, or references to external media. "
    "2. Mathematical Structure: Start with a well-defined setup—real-world or idealized. Avoid vague, puzzle-like, or speculative scenarios. "
    "3. Solvable Target: Ask for a numerical answer or a clear symbolic result (e.g., function, limit, closed-form expression). Do not use phrasing like 'Prove' or 'Show that'. "
    "4. Multi-Step Reasoning: Require a logical sequence of algebraic or analytic steps, involving definitions, identities, or theorems. "
    "5. Concise and Relevant: Include only essential information. Eliminate redundant details or irrelevant symbols. "
    "6. Originality: Ensure the problem is unique and non-trivial. Avoid rewording standard textbook problems. "
    "7. Single Output: Expect a single, unambiguous symbolic answer in LaTeX. Equivalent forms are allowed. No numeric approximations. Do not rely on computational tools - use traditional mathematical methods. "
    "8. Solution Formatting: Format the equations, numbers and variables using LaTeX with Markdown syntax: "
    "Replace \\(...\\) with $...$ "
    "Replace \\[...\\] with $$...$$ "
    "9. Language and Notation: Be rigorous and concise. Avoid colloquial terms. Use standard, consistent mathematical notation throughout. "
    "10. Questions must not have multiple parts to answer, there should be only 1 solution to each question"
    "11. Only questions should be generated, no answers,solutions or explainations should be provided."

)



    
    few_shot_examples = [
        {"example": (
            """Good: For a given mathematical function representing a curve, we consider the equation:  
  
$$ y = x -x^2 $$  
  
which defines a parabolic curve. Additionally, we have a straight line represented by the equation:  
$$ y = mx $$  
  
where $m$ is a parameter that determines the slope of the line.  
  
The objective is to determine the value of m for which the enclosed area between the given curve and the line is exactly equal to $\frac {9}{2}$. """
        )},
        {"example": (
            "Bad: What is the pythagorus theorm."
        )},
    ]
    all_questions = []

    for combo in topic_combinations:
        topics = combo['topics']
        concepts = combo['key_concepts']
        print(f"Processing combination: {topics} | {concepts}")
        # logger.info(f"Processing combination: {topics} | {concepts}")
        # Run the author–critic loop for this combination
        questions = author_critique_loop(
            topics, concepts, guidelines, few_shot_examples, max_turns=3
        )
        
        all_questions.append({
            # "topics": topics,
            # "key_concepts": concepts,
            "data": questions
        })

    # Save all questions to a file
    with open("final_questions_maths.json", "w") as f:
        json.dump(all_questions, f, indent=2)
        # 2) write a Markdown file you can grab directly
    with open("final_questions_maths.md", "w") as md:
        for batch in all_questions:
            for q in batch["data"]:
                # Write the question with its unique ID and topics
                md.write(f"### Question ID: {q['unique_id']}\n")
                # Write the question text
                md.write("**Question:**\n")
            # q["question"] already contains things like $\theta$
                md.write(q["question"] + "\n\n")
    # logging.info("Author–Critique loop complete.")


Processing 10 topic combinations for Mathematics questions...
Processing combination: ['Applications to Physics', 'Vector Calculus and Multivariable Calculus'] | ['Flux integrals of vector fields over surfaces relating to 2-form integrals', 'Review of multivariable calculus: limits, continuity, differentiability, partial derivatives, chain rule', 'Parameterizations of surfaces, orientation and smoothness', 'Conservation laws (mass, charge) derived via differential forms and integral theorems']


2025-05-30 14:05:01,151 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:05:28,550 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:05:45,441 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:06:02,974 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:06:37,054 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Processing combination: ['Cyclotomic Extensions and Polynomials', 'Fundamentals of Galois Theory', 'Field Automorphisms and Fixed Fields'] | ['Conjugates in Galois extensions and their relation to minimal polynomials', 'Examples of field automorphisms with infinite order', 'Norm function in complex extensions of real closed fields', 'Constructible points and numbers via ruler and compass constructions']


2025-05-30 14:06:55,065 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:07:16,528 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:07:42,795 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:08:02,865 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:08:23,448 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Processing combination: ['Basics of Statistics and Probability', 'Binomial Distribution', 'Probability Theory and Events'] | ['Statistics as a tool for extracting information from data', 'Poisson distribution function and parameter lambda (λ)', 'Normal distribution density function and parameters μ (mean), σ (std dev)', 'Probability distribution functions and properties', 'Probability mass function and cumulative distribution function']


2025-05-30 14:08:35,839 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:09:06,295 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:09:20,688 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:09:44,935 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:09:58,376 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Processing combination: ['Application of Vector Operators in Physics', 'Laplacian Operator'] | ['Physical interpretation of divergence as source/sink strength of a vector field', 'Properties of vector operators: curl(grad U) = 0, div(curl a) = 0', 'Use of metric coefficients (h-factors) in defining grad, div, and curl in orthogonal curvilinear coordinate systems', 'Directional derivative: rate of change of scalar field in an arbitrary direction expressed as ∇U · dˆ']


2025-05-30 14:10:18,651 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:10:32,168 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:10:57,253 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:11:16,188 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:11:37,909 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Processing combination: ['Differentiation in Banach spaces', 'Functional minimization and Euler-Lagrange equations', 'Examples and counterexamples in analysis'] | ["Techniques like partial integration and Hölder's inequality in proofs", 'Implicit function theorem in finite dimensions (R^n × R → R)', 'Use of function spaces (e.g., L^p spaces, spaces of periodic functions) in analysis', 'Inverse function theorem in Banach spaces and continuous differentiability of inverses']


2025-05-30 14:12:01,157 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:12:39,085 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:12:59,837 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:14:03,940 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:14:17,518 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Processing combination: ['Separation of Variables Method', 'Sturm-Liouville Theory and Eigenfunction Expansions'] | ['Definition and classification of PDEs (order, linearity, scalar vs system)', 'Bessel functions and Legendre polynomials in eigenvalue problems', 'Numerical stability conditions (CFL condition)', 'Quasilinear and nonlinear first-order PDEs']


2025-05-30 14:14:42,749 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:15:11,168 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:15:44,289 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:16:03,734 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:16:24,433 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Processing combination: ['F-test and F-distribution', 'Analysis of Variance (ANOVA)'] | ['Degrees of freedom related to groups and samples (between and within)', 'Parametric ANOVA: one-way and two-way', 'Calculation of sum of squares components in two-way ANOVA: SSA, SSB, SSE, TSS', 'Estimators of population variance used in ANOVA']


2025-05-30 14:16:42,043 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:17:03,243 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:17:16,162 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:17:56,819 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:18:19,219 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Processing combination: ['Inferential and Descriptive Statistics', 'Statistical Hypothesis Testing'] | ['Null Hypothesis (H0) and Alternative Hypothesis (H1)', 'Wilcoxon T Test: comparing paired/repeated measures samples', 'Steps in Hypothesis Testing: stating research question, statistical hypothesis, level of significance, selecting test statistics, setting decision rule, computing test statistics, interpreting results', 'Friedman Test: comparing three or more related/repeated measures samples', 'Kruskal-Wallis Test: comparing three or more independent samples']


2025-05-30 14:18:33,460 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:18:54,285 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:19:11,244 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:19:32,429 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-30 14:19:45,095 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Processing combination: ['Applications of ANOVA', 'Assumptions of ANOVA', 'Degrees of freedom in ANOVA'] | ['Sum of Squares Total (SST), Sum of Squares Between groups (SSB), Sum of Squares Within groups (SSW)', 'Parametric ANOVA: one-way and two-way', 'Mean Squares Between (MSB) and Mean Squares Within (MSW)', 'Estimators of population variance used in ANOVA']


2025-05-30 14:20:20,161 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


KeyboardInterrupt: 

In [8]:

# Save all questions to a file
with open("final_questions_maths.json", "w") as f:
    json.dump(all_questions, f, indent=2)
    # 2) write a Markdown file you can grab directly
with open("final_questions_maths.md", "w") as md:
    for batch in all_questions:
        for q in batch["data"]:
            # Write the question with its unique ID and topics
            md.write(f"### Question ID: {q['unique_id']}\n")
            # Write the question text
            md.write("**Question:**\n")
        # q["question"] already contains things like $\theta$
            md.write(q["question"] + "\n\n")
# logging.info("Author–Critique loop complete.")

In [5]:
import json

# 1. Load your JSON data
with open('final_questions_maths.json', 'r', encoding='utf-8') as f:
    all_questions = json.load(f)

# 2. Open a text file for writing the “pretty” version
with open('final_questions_maths.txt', 'w', encoding='utf-8') as out:
    for entry in all_questions:
        # Write header info
        out.write(f"Topics: {entry['topics']}\n")
        out.write(f"Key Concepts: {entry['key_concepts']}\n\n")
        
        # Extract questions; could be a single string or a list
        qs = entry['questions']
        
        # If it's one big string, just replace any literal "\n"
        if isinstance(qs, str):
            text = qs.replace('\\n', '\n')
            out.write(text)
            if not text.endswith('\n'):
                out.write('\n')
        
        # If it's a list of strings, iterate
        else:
            for q in qs:
                # json.load already gives you real \n, but just in case:
                line = q.replace('\\n', '\n')
                out.write(line)
                if not line.endswith('\n'):
                    out.write('\n')
        
        # Separator between entries
        out.write('\n' + '='*40 + '\n\n')

