In [2]:
import os
import glob
import json
import math
import random
import re
from collections import Counter
import networkx as nx
import pdfplumber
from openai import OpenAI
from dotenv import load_dotenv
# Run python-dotenv's loader before the API key is read from the environment below.
load_dotenv()

# Read the OpenAI API key from the OPENAI_API_KEY environment variable and build
# the module-level client that extract_concepts_from_docs() uses for its requests.
# os.getenv returns None when the variable is unset.
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

# -- Document Loading ------------------------------------------------------ 

def load_docs_from_dir(dir_path):
    """
    Load every ``*.pdf`` file found directly in a directory and extract its text.

    Files are visited in sorted path order so the returned lists (and every
    artefact derived from them) are stable between runs; ``glob`` on its own
    returns entries in arbitrary order.

    Args:
        dir_path: Directory searched (non-recursively) for ``*.pdf`` files.

    Returns:
        A ``(docs, filenames)`` tuple of two parallel lists: ``docs[i]`` is the
        newline-joined page text of the PDF whose base name is ``filenames[i]``.
        PDFs that raise while being opened or parsed are reported with a
        warning and skipped, so they appear in neither list.
    """
    docs = []
    filenames = []
    for filepath in sorted(glob.glob(os.path.join(dir_path, '*.pdf'))):
        try:
            with pdfplumber.open(filepath) as pdf:
                # A falsy extract_text() result is stored as "" so that every
                # page still contributes one entry to the join below.
                page_texts = [page.extract_text() or "" for page in pdf.pages]
        except Exception as e:
            # skip unreadable PDFs
            print(f"Warning: could not load {filepath}: {e}")
            continue
        docs.append("\n".join(page_texts))
        filenames.append(os.path.basename(filepath))
    return docs, filenames

# -- 1. Extract topics and key concepts -----------------------------------

def extract_json_from_text(text):
    """
    Pull the JSON payload out of an LLM reply and return it as a string.

    Three candidate spans are considered, in this order:
      1. the body of the first markdown code fence (optionally tagged ``json``),
      2. the widest ``{...}`` span (a JSON object),
      3. the widest ``[...]`` span (a JSON array).

    The first candidate that ``json.loads`` accepts is returned. This lets a
    bare array such as ``[{"q": 1}, {"q": 2}]`` come back intact instead of
    being cut down to the invalid ``{"q": 1}, {"q": 2}`` slice.

    If no candidate parses, the function falls back to the previous
    behaviour: the fenced body if there is one, else the ``{...}`` span, else
    the input unchanged. Callers are still expected to handle a
    ``json.JSONDecodeError`` from their own ``json.loads`` call.

    Args:
        text: Raw model output.

    Returns:
        The extracted string (not a parsed object).
    """
    fenced = re.search(r'```(?:json)?\s*(.*?)\s*```', text, re.DOTALL)
    braced = re.search(r'(\{.*\})', text, re.DOTALL)
    bracketed = re.search(r'(\[.*\])', text, re.DOTALL)

    candidates = [m.group(1).strip() for m in (fenced, braced, bracketed) if m]
    for candidate in candidates:
        try:
            json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        return candidate

    # Nothing parsed: keep the legacy precedence so callers see the same
    # string (and the same parse error) as before.
    if fenced:
        return fenced.group(1).strip()
    if braced:
        return braced.group(1).strip()
    return text

def extract_concepts_from_docs(doc_texts, filenames, model="gpt-4o"):
    """
    Ask the chat model for the topics and key concepts of each document.

    One request is sent per document through the module-level ``client``. The
    reply is cleaned with ``extract_json_from_text`` and parsed as JSON.

    Args:
        doc_texts: Document texts, one string per document.
        filenames: Names parallel to ``doc_texts``; a missing entry falls back
            to ``"doc_<index>"``.
        model: Chat model name passed to the API.

    Returns:
        A list with exactly one dict per input document, each with the keys
        ``'filename'``, ``'topics'`` and ``'key_concepts'``. When the reply is
        not valid JSON, is not a JSON object, or does not hold both keys as
        arrays, the two lists are empty so downstream steps can skip the
        document without failing.

    Raises:
        Whatever the API client raises for a failed request; such errors are
        not caught here.
    """
    extractions = []
    for i, text in enumerate(doc_texts):
        filename = filenames[i] if i < len(filenames) else f"doc_{i}"

        prompt = (
            "Extract high-level topics and key concepts from the following document. "
            f"Return JSON with keys 'topics' and 'key_concepts'.\n\n{text}"
        )
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are an expert Maths summarizer. Return your response as a JSON object with 'topics' and 'key_concepts' as arrays."},
                {"role": "user", "content": prompt}
            ],
        )

        # NOTE(review): the SDK types message.content as optional; treat a
        # missing body as empty text so it becomes a parse failure below
        # instead of a TypeError inside the regex helper.
        output = response.choices[0].message.content or ""
        print("Raw output:", output)

        # Start from the empty record and only fill it in on a well-formed reply.
        topics, key_concepts = [], []
        try:
            # Clean and extract JSON from the output
            data = json.loads(extract_json_from_text(output))
        except json.JSONDecodeError as e:
            print(f"Failed to parse JSON: {e}")
        else:
            if not isinstance(data, dict):
                # e.g. the model answered with a bare array; lists have no .keys()
                print(f"Warning: Parsed JSON is not an object: {type(data).__name__}")
            elif "topics" not in data or "key_concepts" not in data:
                print(f"Warning: Parsed JSON doesn't have expected keys: {data.keys()}")
            elif not (isinstance(data["topics"], list) and isinstance(data["key_concepts"], list)):
                print("Warning: 'topics' and 'key_concepts' must both be arrays")
            else:
                topics, key_concepts = data["topics"], data["key_concepts"]
                print(f"Successfully extracted {len(topics)} topics and {len(key_concepts)} key concepts from {filename}")

        extractions.append({
            "filename": filename,
            "topics": topics,
            "key_concepts": key_concepts
        })

    return extractions

# -- 2. Construct the concept graph ----------------------------------------

def build_concept_graph(extractions, eps=1e-6):
    """
    Build one undirected co-occurrence graph over all topics and key concepts.

    Every pair of distinct labels that appears in the same document is joined
    by an edge whose weight is ``log(freq + eps)``, where ``freq`` is the
    number of documents containing both labels (so a pair seen once has a
    weight of roughly zero).

    Labels are de-duplicated per document before pairing. Without that, a
    label listed twice (or listed as both a topic and a key concept) would
    create a self-loop and inflate the counts of its other pairs. Every label
    is also added as a node even when it never co-occurs with anything, so
    later lookups by node do not fail.

    Args:
        extractions: Iterable of dicts with list-valued ``'topics'`` and
            ``'key_concepts'`` entries. Labels must be hashable and mutually
            orderable (plain strings in practice).
        eps: Small constant added inside the logarithm.

    Returns:
        ``(G, topic_nodes, kc_nodes)``: the ``nx.Graph`` plus the sets of all
        topic labels and all key-concept labels.
    """
    freq = Counter()
    all_topics, all_kcs = set(), set()

    for ex in extractions:
        # Sorted unique labels: each unordered pair is produced once, already
        # in canonical (u < v) order.
        labels = sorted(set(ex["topics"]) | set(ex["key_concepts"]))
        for i, u in enumerate(labels):
            for v in labels[i + 1:]:
                freq[(u, v)] += 1
        all_topics.update(ex["topics"])
        all_kcs.update(ex["key_concepts"])

    G = nx.Graph()
    # Register isolated labels too; add_edge below only creates nodes that
    # take part in at least one pair.
    G.add_nodes_from(all_topics | all_kcs)
    for (u, v), f in freq.items():
        G.add_edge(u, v, weight=math.log(f + eps))

    print(f"Graph built with {len(all_topics)} topics and {len(all_kcs)} key concepts")
    return G, all_topics, all_kcs

# -- Helpers for sampling --------------------------------------------------

def softmax(weights):
    """
    Convert a sequence of real-valued weights into probabilities.

    The maximum weight is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``math.exp`` from overflowing
    on large weights.

    Args:
        weights: Iterable of numbers.

    Returns:
        A list of floats of the same length that sums to 1, or ``[]`` for
        empty input.
    """
    weights = list(weights)
    if not weights:
        return []
    peak = max(weights)
    exps = [math.exp(w - peak) for w in weights]
    total = sum(exps)  # >= 1.0 because the peak entry contributes exp(0)
    return [e / total for e in exps]


def random_walk(G, start, steps):
    """
    Take a weighted random walk on ``G`` starting at ``start``.

    At each step the next node is drawn from the current node's neighbours
    with probabilities given by ``softmax`` over the ``'weight'`` attribute of
    the connecting edges.

    Args:
        G: Graph-like mapping where ``G[u]`` yields the neighbours of ``u`` and
            ``G[u][v]['weight']`` is the weight of edge ``(u, v)``.
        start: Node where the walk begins.
        steps: Maximum number of transitions.

    Returns:
        The list of visited nodes, beginning with ``start``. The walk stops
        early at a node with no neighbours. If ``start`` is not in ``G`` at
        all, the result is just ``[start]`` rather than a ``KeyError``.
    """
    path = [start]
    if start not in G:
        # e.g. a label that was filtered out of a subgraph: nowhere to go
        return path

    current = start
    for _ in range(steps):
        nbrs = list(G[current])
        if not nbrs:
            break
        weights = [G[current][n]['weight'] for n in nbrs]
        current = random.choices(nbrs, weights=softmax(weights))[0]
        path.append(current)
    return path

# -- 3. Concept combination sampling ---------------------------------------

def sample_concept_combinations(
    G, topic_nodes, kc_nodes,
    num_samples=100,
    topic_walk_steps=(1, 2),
    kc_walk_steps=(3, 4)
):
    """
    Sample combinations of topics and key concepts with two-stage random walks.

    For every sample:
      1. a random start topic is chosen and a short walk is taken on the
         topic-only subgraph (length drawn from ``topic_walk_steps``);
      2. one key concept adjacent to any visited topic is chosen at random and
         a walk is taken on the key-concept-only subgraph (length drawn from
         ``kc_walk_steps``). If no visited topic has a key-concept neighbour,
         the key-concept list is empty.

    Args:
        G: Concept graph exposing ``subgraph`` (for example ``nx.Graph``).
        topic_nodes: Set of topic labels.
        kc_nodes: Set of key-concept labels.
        num_samples: Number of combinations to draw.
        topic_walk_steps: Candidate walk lengths for the topic stage.
        kc_walk_steps: Candidate walk lengths for the key-concept stage.

    Returns:
        A list of ``num_samples`` dicts ``{'topics': [...], 'key_concepts':
        [...]}``. Both values are sorted lists (not sets), so the result is
        JSON-serialisable and its element order does not depend on set
        iteration order. Returns ``[]`` when ``topic_nodes`` is empty.
    """
    # Safety check
    if not topic_nodes:
        print("Error: No topics found. Cannot sample combinations.")
        return []

    G_topic = G.subgraph(topic_nodes)
    G_topic_kc = G.subgraph(topic_nodes | kc_nodes)
    G_kc = G.subgraph(kc_nodes)
    samples = []

    # Sorted so that a seeded `random` draws the same topics on every run.
    topics_list = sorted(topic_nodes)
    print(f"Sampling from {len(topics_list)} topics")

    for _ in range(num_samples):
        t0 = random.choice(topics_list)
        t_steps = random.choice(topic_walk_steps)
        # A topic label may be absent from the graph (it never co-occurred
        # with anything); treat it as a walk that cannot leave its start.
        if t0 in G_topic:
            sampled_topics = set(random_walk(G_topic, t0, t_steps))
        else:
            sampled_topics = {t0}

        # Key concepts adjacent to any visited topic. Duplicates are kept on
        # purpose: a concept linked to several visited topics is more likely
        # to be picked as the start of the second walk.
        kc_cands = [
            nbr
            for t in sorted(sampled_topics) if t in G_topic_kc
            for nbr in G_topic_kc[t] if nbr in kc_nodes
        ]
        if kc_cands:
            k0 = random.choice(kc_cands)
            k_steps = random.choice(kc_walk_steps)
            sampled_kcs = set(random_walk(G_kc, k0, k_steps))
        else:
            sampled_kcs = set()

        samples.append({
            "topics": sorted(sampled_topics),
            "key_concepts": sorted(sampled_kcs)
        })

    return samples

# -- 4. Question generation ------------------------------------------------

# def generate_questions_for_samples(combos, docs, extractions, model="gpt-4o"):  
#     """
#     For each sampled combo, pick two docs via Jaccard on concept sets,
#     then call LLM to generate questions.
#     Returns list of dicts: {'sample': combo, 'questions': [...]}.
#     """
#     doc_concepts = [set(ex['topics'] + ex['key_concepts']) for ex in extractions]
#     # doc_concepts= doc_concepts[:1]
#     results = []
#     max_samples = 100
#     # combos = combos[:max_samples]
#     for i, combo in enumerate(combos):
#         combo_id = f"combo_{i+1}"
#         kg = set(combo['topics']) | set(combo['key_concepts'])
#         sims = []
#         for idx, dc in enumerate(doc_concepts):
#             inter = kg & dc
#             union = kg | dc
#             sims.append((len(inter) / (len(union) or 1), idx))
#         sims.sort(reverse=True)
#         top_idxs = [i for _, i in sims[:2]]
#         refs = [docs[i] for i in top_idxs]
#         ref_files = [extractions[i]["filename"] for i in top_idxs]
#         System_prompt =f"""
# Each question must follow these instructions:
# Model a Chemistry Scenario: Start from a real-world or idealized setup. Avoid abstract Physics problems or purely conceptual statements.
# Target a Solvable Quantity: Ask for a clear symbolic expression of a physical variable (e.g., tension, acceleration, energy).
# Force Multi-Step Reasoning: Ensure the question involves a sequence of physics laws, transformations, and derivations to reach the answer.
# Avoid Redundancy: Exclude extraneous details or variables that do not impact the final solution.
# Be Unique: Do not rephrase standard textbook problems; ensure originality and complexity.
# Single solution: Expect a single symbolic expression, unambiguous, presented in LaTeX. Multiple equivalent algebraic forms are allowed. No equations or floating-point approximations.
# Use rigorous, concise phrasing.
# Avoid colloquial or ambiguous terminology.
# Units must be consistent; symbols should follow standard notation.
# """
#         prompt = (
#             f"Generate a set of difficult Physics questions based on the following:\n"
            
#             f"Topics: {combo['topics']}\n"
#             f"Key Concepts: {combo['key_concepts']}\n"
#             f"Reference Doc 1:\n{refs[0]}\n"
#         )
#         if len(refs) > 1:
#             prompt += f"Reference Doc 2:\n{refs[1]}\n"
#         prompt += "Return a JSON array of questions."

#         # from ollama import chat
#         # from ollama import ChatResponse

#         # response: ChatResponse = chat(model='qwen3:8b', 
#         #                                messages=[
#         #         {"role": "system", "content": System_prompt},
#         #         {"role": "user", "content": prompt}
#         #     ])
#         # # print(response['message']['content'])
#         # # or access fields directly from the response object
#         # print(response.message.content)
#         # content = response.message.content
#         response = client.chat.completions.create(
#             model=model,
#             messages=[
#                 {"role": "system", "content": System_prompt},
#                 {"role": "user", "content": prompt}
#             ]
#         )
#         content = response.choices[0].message.content
        
#         try:
#             # Clean and extract JSON from the output
#             json_str = extract_json_from_text(content)
#             questions = json.loads(json_str)
#             if not isinstance(questions, list):
#                 # If the output is an object with a questions key
#                 if isinstance(questions, dict) and "questions" in questions:
#                     questions = questions["questions"]
#                 else:
#                     questions = [str(questions)]
#         except json.JSONDecodeError:
#             questions = [content]

#         results.append({
#             "id": combo_id,
#             "topics": combo['topics'],
#             "key_concepts": combo['key_concepts'],
#             "reference_files": ref_files,
#             "questions": questions
#         })
#         max_samples-=1
#         if max_samples == 0:
#             break


    # return results

# -- Save outputs to files ------------------------------------------------

def save_extractions(extractions, output_file="document_extractions.json"):
    """Write each document's filename, topics and key concepts to a JSON file."""
    # Rebuild every record with list-typed fields so that set (or tuple)
    # inputs can be serialised; any other keys on the record are dropped.
    records = [
        {
            "filename": item["filename"],
            "topics": list(item["topics"]),
            "key_concepts": list(item["key_concepts"]),
        }
        for item in extractions
    ]

    with open(output_file, "w") as handle:
        handle.write(json.dumps(records, indent=2))

    print(f"Saved document extractions to {output_file}")

def save_questions_with_topics(questions, output_file="questions_with_topics.json"):
    """Dump the generated questions (with their topic combinations) as indented JSON."""
    serialized = json.dumps(questions, indent=2)
    with open(output_file, "w") as handle:
        handle.write(serialized)

    print(f"Saved questions with topic combinations to {output_file}")

# -- Main Execution --------------------------------------------------------

if __name__ == "__main__":
    # Pipeline driver: load PDFs -> extract topics/concepts -> build the
    # co-occurrence graph -> sample concept combinations -> write JSON artefacts.
    #
    # Early exits use `raise SystemExit(1)` rather than the `exit()` helper:
    # `exit` is injected by the `site` module for interactive sessions and is
    # not guaranteed to exist (or to behave the same) in every runtime.

    # Create output directory if it doesn't exist
    output_dir = "output_new"
    os.makedirs(output_dir, exist_ok=True)

    # adjust this path to where your .pdf docs live
    docs_dir = "docs_new/"

    print("Loading documents...")
    docs, filenames = load_docs_from_dir(docs_dir)
    print(f"Loaded {len(docs)} documents")

    if not docs:
        print("No documents found. Please check the docs directory.")
        raise SystemExit(1)

    # 1) Extract topics & KCs
    print("\nExtracting topics and key concepts...")
    extractions = extract_concepts_from_docs(docs, filenames)

    # Save extractions to file (including documents whose extraction came back empty)
    save_extractions(extractions, os.path.join(output_dir, "document_extractions_maths.json"))

    # Verify we have valid extractions
    valid_extractions = [ex for ex in extractions if ex["topics"] or ex["key_concepts"]]
    if not valid_extractions:
        print("No valid topics or key concepts extracted. Check your data and API responses.")
        raise SystemExit(1)

    # 2) Build graph (empty extractions contribute no nodes or edges)
    print("\nBuilding concept graph...")
    G, topic_nodes, kc_nodes = build_concept_graph(extractions)

    if not topic_nodes:
        print("No topics found in the graph. Cannot proceed.")
        raise SystemExit(1)

    # 3) Sample combinations
    print("\nSampling concept combinations...")
    combos = sample_concept_combinations(G, topic_nodes, kc_nodes, num_samples=50)  # Reduced for testing

    if not combos:
        print("Failed to generate concept combinations.")
        raise SystemExit(1)

    # Save topic combinations
    with open(os.path.join(output_dir, "topic_combinations_maths.json"), "w") as f:
        json.dump(combos, f, indent=2)
    
    # 4) Generate questions
    # print("\nGenerating questions for each combination...")
    # q_outputs = generate_questions_for_samples(combos, docs, extractions)
    
    # # Save questions with topics
    # save_questions_with_topics(q_outputs, os.path.join(output_dir, "questions_with_topics.json"))
    
    # # Display results
    # print("\n===== GENERATED QUESTIONS =====")
    # for idx, out in enumerate(q_outputs, 1):
    #     print(f"\nSample {idx}:")
    #     print(f"Topics: {out['topics']}")
    #     print(f"Key Concepts: {out['key_concepts']}")
    #     print(f"Reference Files: {out['reference_files']}")
    #     print("Questions:")
    #     for q in out['questions']:
    #         print(f" - {q}")
    
    # print(f"\nAll outputs saved to directory: {output_dir}") 

# if __name__ == "__main__":
#     # Create output directory if it doesn't exist
#     # Create output directory if it doesn't exist
#     output_dir = "output_apple"
#     os.makedirs(output_dir, exist_ok=True)
    
#     # adjust this path to where your .pdf docs live
#     docs_dir = "docs/"
    
#     print("Loading documents...")
#     docs, filenames = load_docs_from_dir(docs_dir)
#     print(f"Loaded {len(docs)} documents")
#     document_extractions = []
#     with open("output_apple/document_extractions.json", "r") as f:
#         document_extractions = json.load(f)
#     for doc in document_extractions:
#         print(doc)
#         break

#     topic_combinations = []
#     with open("output_apple/topic_combinations.json", "r") as f:
#         topic_combinations = json.load(f)
#     for combo in topic_combinations:
#         print(combo)
#         break

#     # generate questions 
#     print("Generating questions...")
#     q_outputs = generate_questions_for_samples(topic_combinations, docs, document_extractions)

#     # Save questions with topics
#     save_questions_with_topics(q_outputs, os.path.join(output_dir, "questions_with_topics.json"))
    
#     # Display results
#     print("\n===== GENERATED QUESTIONS =====")
#     for idx, out in enumerate(q_outputs, 1):
#         print(f"\nSample {idx}:")
#         print(f"Topics: {out['topics']}")
#         print(f"Key Concepts: {out['key_concepts']}")
#         print(f"Reference Files: {out['reference_files']}")
#         print("Questions:")
#         for q in out['questions']:
#             print(f" - {q}")
    
#     print(f"\nAll outputs saved to directory: {output_dir}")

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Loading documents...


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def



CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Loaded 39 documents

Extracting topics and key concepts...
Raw output: ```json
{
  "topics": [
    "Effective Field Theory (EFT)",
    "Scalar extensions of the top quark sector",
    "Phenomenology of the Standard Model and its extensions",
    "Top quark physics at the Large Hadron Collider (LHC)",
    "Theoretical modeling and constraints from high energy physics experiments"
  ],
  "key_concepts": [
    "EFT as an approximation tool for new physics beyond the Standard Model",
    "Dimension six operators in low energy expansions",
    "Matching conditions between full theories and EFTs at leading and next-to-leading order",
    "The role of NLO corrections in ensuring the validity of EFTs",
    "The importance of top quark pair production in exploring new physics",
    "Simplified models in high energy physics as a bridge between EFTs and UV complete theories",
    "Constraints on new physics models from LHC measurements",
    "Perturbative unitarity limits and their implications f

### Download data from CSV

In [1]:
import os
import argparse
import pandas as pd
import requests
from urllib.parse import urlparse, unquote

def download_files(
    csv_path="Research Papers - Math_New.csv",
    output_dir="docs_new",
    timeout=30,
) -> None:
    """
    Download every URL in a CSV's 'Link' column into ``output_dir`` as a PDF.

    The file name is taken from the last path segment of the URL when that
    segment ends in ``.pdf``; otherwise the 1-based position of the link is
    used (``"<idx>.pdf"``). Failures are reported per link and do not stop the
    remaining downloads.

    Args:
        csv_path: CSV file containing a ``Link`` column. Empty cells are skipped.
        output_dir: Destination directory; created if missing.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            hang the run indefinitely.

    Raises:
        ValueError: If the CSV has no ``Link`` column.
    """
    # Load CSV
    df = pd.read_csv(csv_path)
    if 'Link' not in df.columns:
        raise ValueError("CSV must contain a 'Link' column.")

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    links = df['Link'].dropna()
    total = len(links)
    for idx, url in enumerate(links, start=1):
        tmp_path = None
        try:
            # Parse URL and derive a safe filename
            parsed = urlparse(url)
            raw_name = unquote(os.path.basename(parsed.path))  # decode URL-encoded characters
            # Decoding can reintroduce separators (e.g. "%2F" -> "/"); keep only
            # the final component so the file cannot land outside output_dir.
            raw_name = os.path.basename(raw_name.replace("\\", "/"))

            ext = os.path.splitext(raw_name)[1]
            # If there's no .pdf extension (or no name at all), fall back to the index
            filename = raw_name if ext.lower() == '.pdf' else f"{idx}.pdf"

            out_path = os.path.join(output_dir, filename)
            # Stream into a side file first so an interrupted transfer never
            # leaves a truncated "*.pdf" behind for the loader to pick up.
            tmp_path = out_path + ".part"

            with requests.get(url, stream=True, timeout=timeout) as r:
                r.raise_for_status()
                with open(tmp_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

            os.replace(tmp_path, out_path)
            tmp_path = None

            print(f"[{idx}/{total}] Downloaded → {filename}")

        except Exception as e:
            # Best effort: report, clean up the partial file, and move on.
            if tmp_path and os.path.exists(tmp_path):
                os.remove(tmp_path)
            print(f"[{idx}/{total}] FAILED  → {url}\n    {e}")

# Entry point: download with the default CSV path and output directory
# declared in download_files()'s signature.
if __name__ == "__main__":
    download_files()




[1/40] Downloaded → 1.pdf
[2/40] Downloaded → 2311.08854.pdf
[3/40] Downloaded → 3.pdf
[4/40] Downloaded → 4.pdf
[5/40] Downloaded → 5.pdf
[6/40] Downloaded → 6.pdf
[7/40] Downloaded → 7.pdf
[8/40] Downloaded → 8.pdf
[9/40] Downloaded → 9.pdf
[10/40] Downloaded → 10.pdf
[11/40] Downloaded → 11.pdf
[12/40] Downloaded → 12.pdf
[13/40] Downloaded → 13.pdf
[14/40] Downloaded → 14.pdf
[15/40] Downloaded → 15.pdf
[16/40] Downloaded → 16.pdf
[17/40] Downloaded → 17.pdf
[18/40] Downloaded → 18.pdf
[19/40] Downloaded → 19.pdf
[20/40] Downloaded → 20.pdf
[21/40] Downloaded → 21.pdf
[22/40] Downloaded → 22.pdf
[23/40] Downloaded → 23.pdf
[24/40] Downloaded → 24.pdf
[25/40] Downloaded → 25.pdf
[26/40] Downloaded → 26.pdf
[27/40] Downloaded → 27.pdf
[28/40] Downloaded → 28.pdf
[29/40] Downloaded → 29.pdf
[30/40] Downloaded → 30.pdf
[31/40] Downloaded → 31.pdf
[32/40] Downloaded → 32.pdf
[33/40] Downloaded → 33.pdf
[34/40] Downloaded → 34.pdf
[35/40] Downloaded → 35.pdf
[36/40] Downloaded → 36.pdf
[

## Author–Critic Model

In [8]:
import os
import re
import uuid
import json
import logging
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate, FewShotPromptTemplate
from langchain.chains import LLMChain
from langchain_google_genai import ChatGoogleGenerativeAI


# Configure root logging at INFO level; each record is rendered as
# "<timestamp> - <logger name> - <level> - <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

def author_critique_loop(
    topics: list[str],
    concepts: list[str],
    guidelines: str,
    few_shot_examples: list[dict],
    max_turns: int = 4,
) -> list[dict]:
    """
    Run an iterative author-critic loop to generate and refine Mathematics questions.

    Turn 0: the author model drafts questions from the topics, concepts and
    guidelines (with few-shot examples), then the critic model reviews them.
    Turns 1 .. max_turns-1: the author rewrites the questions using the latest
    feedback; the critic reviews every rewrite except the final one.

    The whole multi-turn conversation (author + critic) is written to a single
    UTF-8 log file under ``logs/``. The file is closed even if an LLM call
    raises or the run is interrupted.

    Args:
        topics: Topic labels to build questions around.
        concepts: Key concepts to build questions around.
        guidelines: Free-text rules given to both author and critic.
        few_shot_examples: Dicts with an ``'example'`` key, rendered into the
            author prompt.
        max_turns: Total number of author turns, including the initial draft.

    Returns:
        A list of dicts, one per ``<question>...</question>`` span found in the
        final author output, with:
            unique_id: str,
            question: str,
            topics: list[str],
            concepts: list[str],
            file_conversation_log: str  # path to the full conversation log
    """
    # Create a unique run-level conversation log
    run_id = uuid.uuid4().hex
    os.makedirs('logs', exist_ok=True)
    conv_log_path = f"logs/{run_id}.log"

    # Initialize LLMs (temperature locked to 1.0)
    # author_llm = ChatOpenAI(model_name="gpt-4.1-mini", temperature=1.0)
    # critic_llm = ChatOpenAI(model_name="gpt-4.1", temperature=1.0)
    gemini_api = os.getenv("GEMINI_API_KEY")
    critic_llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash-preview-05-20", temperature=1.0, google_api_key=gemini_api)
    author_llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=1.0, google_api_key=gemini_api)

    # Build Few-Shot prompt for author.
    # jinja2 formatting is used here so that the single braces in the LaTeX
    # examples are not mistaken for template placeholders.
    example_prompt = PromptTemplate(
        input_variables=["example"],
        template="Example:\n{{example}}\n---",
        template_format="jinja2"
    )
    author_fs_prompt = FewShotPromptTemplate(
        examples=few_shot_examples,
        example_prompt=example_prompt,
        prefix=(
            "You are a Mathematics education specialist. Generate novel university-level Mathematics questions. "
            "Each question must start with <question> and end with </question>. Here are examples:"
        ),
        suffix=(
            "Now, based on the topics: {{topics}}, key concepts: {{concepts}}, and guidelines: {{guidelines}}, "
            "produce a numbered list of new, high-quality Mathematics questions, each wrapped in <question>...</question> tags."
        ),
        input_variables=["topics", "concepts", "guidelines"],
        template_format="jinja2"
    )
    author_chain = LLMChain(llm=author_llm, prompt=author_fs_prompt)

    # Build Critic prompt
    critic_prompt = PromptTemplate(
        input_variables=["questions", "guidelines"],
        template=(
            "You are a Mathematics assessment expert. Critique the following questions:\n"
            "{questions}\n"
            "Evaluate them against these guidelines:\n{guidelines}\n"
            "Provide concise, actionable feedback on how to improve."
        ),
    )
    critic_chain = LLMChain(llm=critic_llm, prompt=critic_prompt)

    # Build the refinement prompt once; it does not change between turns.
    refine_prompt = PromptTemplate(
        input_variables=["questions", "feedback"],
        template=(
            "Refine these questions based on the feedback:\n{questions}\n"
            "Feedback:\n{feedback}\n"
            "Return an improved numbered list of Mathematics questions, each wrapped in <question>...</question> tags."
        ),
    )
    refine_chain = LLMChain(llm=author_llm, prompt=refine_prompt)

    # The context manager guarantees the log is closed on every exit path.
    with open(conv_log_path, 'w', encoding='utf-8') as conv_logger:

        def log(msg: str) -> None:
            # Flush after every entry so the log is complete up to the point
            # of failure if the run is aborted mid-conversation.
            conv_logger.write(msg + "\n")
            conv_logger.flush()

        # Turn 0: Generation
        log("=== Turn 0: Author generates questions ===")
        questions_text = author_chain.run(
            topics=topics, concepts=concepts, guidelines=guidelines
        )
        log(questions_text)

        # Turn 0: Critic feedback
        log("=== Turn 0: Critic feedback ===")
        feedback = critic_chain.run(questions=questions_text, guidelines=guidelines)
        log(feedback)

        # Refinement turns
        for turn in range(1, max_turns):
            log(f"=== Turn {turn}: Author refines questions ===")
            questions_text = refine_chain.run(questions=questions_text, feedback=feedback)
            log(questions_text)
            if turn == max_turns - 1:
                # Last rewrite: no further critique is needed.
                log("=== Final questions generated ===")
                break
            log(f"=== Turn {turn}: Critic feedback ===")
            feedback = critic_chain.run(questions=questions_text, guidelines=guidelines)
            log(feedback)

    # Parse final questions into structured entries
    entries: list[dict] = []
    pattern = re.compile(r'<question>(.*?)</question>', re.DOTALL)
    for match in pattern.findall(questions_text):
        entries.append({
            "unique_id": uuid.uuid4().hex,
            "question": match.strip(),
            "topics": topics,
            "concepts": concepts,
            "file_conversation_log": conv_log_path,
        })

    return entries


if __name__ == "__main__":
    # Driver for the Mathematics run: read the sampled topic combinations,
    # run the author-critic loop on each one, then export the questions as
    # JSON, Markdown and CSV.
    import csv

    with open("output_new/topic_combinations_maths.json", "r") as f:
        topic_combinations = json.load(f)

    # Limit for testing: skip the first 20 combinations and process the rest.
    topic_combinations = topic_combinations[20:]  # Adjust as needed for testing
    print(f"Processing {len(topic_combinations)} topic combinations for Mathematics questions...")

    # Rules handed verbatim to both the author and the critic model.
    guidelines = (
        r"1. Input Format: the input should expect Fully text-based. No diagrams, visual aids, or references to external media. "
        r"2. Mathematical Structure: Start with a well-defined setup—real-world or idealized. Avoid vague, puzzle-like, or speculative scenarios. "
        r"3. Solvable Target: Ask for a clear symbolic result (e.g., function, limit, closed-form expression, numeric).Questions must not be proves or derivations."
        r"4. Single Output: each questions Expects a single, unambiguous symbolic answer in LaTeX. Equivalent forms are allowed."
        r"5. Concise and Relevant: Include only essential information. Eliminate redundant details or irrelevant symbols. "
        r"6. Originality: Ensure the problem is unique and non-trivial. Avoid rewording standard textbook problems. "
        r"7. Multi-Step Reasoning: Require a logical sequence of algebraic or analytic steps, involving definitions, identities, or theorems."
        r"8. Solution Formatting: Format the equations, numbers and variables using LaTeX syntax such that all inline math, numbers, variables are enclosed by $...$ and block math is enclosed by $$...$$. Do not use Markdown for any unformatted data. "
        r"9. Language and Notation: Be rigorous and concise. Avoid colloquial terms. Use standard, consistent mathematical notation throughout. "
        r"10. Questions must not have multiple parts to answer, there should be only one single solution to each question"
        r"11. Only questions should be generated, no answers, solutions or explanations should be provided, nor any hints."
        r"12. No numeric approximations. Do not rely on computational tools - use traditional mathematical methods."
    )

    # Worked examples shown to the author model (raw strings keep the LaTeX backslashes intact).
    few_shot_examples = [
        {"example": (
            r"""As the sun set over the quiet town, a mathematician named Alex sat by the lake, tossing pebbles into the water. He pondered a curious geometric puzzle inspired by the ripples—how distances change when a point moves along a circle. His thoughts led him to an intriguing problem about a regular octagon inscribed in a circle.
Let $A_1, A_2, A_3, \ldots, A_8$ be the vertices of a regular octagon that lie on a circle of radius $2$. Let $P$ be a point on the circle and let $PA_i$ denote the distance between the points $P$ and $A_i$ for $i = 1, 2, \ldots, 8$.
If $P$ varies over the circle, then find the maximum value of $$PA_1 \cdot PA_2 \cdots PA_8.$$

"""
        )},
        {"example": r"""Consider the following systems  
  
$$ \dot{x_{1}} - \dot{y_{1}} = (x_{1} - y_{1}) - \nu_{1} (\cos{x_{1}} - \sin{x_{1}}), ~~~~\text{with} \quad \dot{x_{1}} = 1 \quad(1) $$  and  
$$ \dot{x_{2}} - \dot{y_{2}} = (x_{2} - y_{2}) - \nu_{2} (\sin{(-x_{2})} + \cos{(-x_{2})}), \text{with} \quad \dot{x_{2}} = 1 \quad(2) $$  
  
Now consider a scenario where the $x_{1,2}$- components from both systems synchronize whenever system (1) sends signals to the system (2) through the linear coupling term in the form $f(x_{2} - x_{1})$ and coupling strength $\Psi$. If $\nu = \nu_2 - \nu_{1}$ measures the order of mismatch, then what are the equations of the invariant manifold for systems (1) and (2) in the ${R}^3$- plane?"""
        },
    ]
    all_questions = []

    for combo in topic_combinations:
        topics = combo['topics']
        concepts = combo['key_concepts']
        # Run the author–critic loop for this combination
        questions = author_critique_loop(
            topics, concepts, guidelines, few_shot_examples, max_turns=3
        )

        # One batch per combination; the export code below reads the "questions" key.
        all_questions.append({
            "questions": questions
        })

    # Save all questions to a file
    with open("final_questions_maths.json", "w", encoding="utf-8") as f:
        json.dump(all_questions, f, indent=2)

    # Model output may contain non-ASCII characters, so the encoding is explicit.
    with open("final_questions_maths.md", "w", encoding="utf-8") as md:
        for batch in all_questions:
            for q in batch['questions']:
                md.write(f"### Question ID: {q['unique_id']}\n")
                md.write(f"**Question:** {q['question']}\n")

    file_path = "questions_6june_maths.csv"
    file_exists = os.path.isfile(file_path)
    is_empty = (not file_exists) or (os.path.getsize(file_path) == 0)
    # Open with newline='' so that csv.writer can quote internal newlines correctly
    with open(file_path, "a", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)  # default delimiter=',' and quoting=QUOTE_MINIMAL

        if is_empty:
            # Write header row only if file is new/empty
            writer.writerow(["question"])

        # Now append each question, letting csv.writer quote any embedded newline or commas
        for batch in all_questions:
            # Batches are built above with the key "questions" (not "data").
            for q in batch["questions"]:
                print(f"Writing question: {q['question']}")
                writer.writerow([q["question"]])  # Each question is a single cell, so we wrap it in a list


Processing 30 topic combinations for Mathematics questions...




KeyboardInterrupt: 

In [16]:
    # Save all questions to a file
import csv

# Recovery cell: export whatever is currently held in `all_questions`
# (defined by the author-critic cell) to JSON, Markdown and CSV.
with open("final_questions_maths.json", "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=2)

# Model output may contain non-ASCII characters, so the encoding is explicit.
with open("final_questions_maths.md", "w", encoding="utf-8") as md:
    for batch in all_questions:
        for q in batch['questions']:
            md.write(f"### Question ID: {q['unique_id']}\n")
            md.write(f"**Question:** {q['question']}\n")

file_path = "questions_6june_maths.csv"
file_exists = os.path.isfile(file_path)
is_empty = (not file_exists) or (os.path.getsize(file_path) == 0)

# Open with newline='' so that csv.writer can quote internal newlines correctly
with open(file_path, "a", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)  # default delimiter=',' and quoting=QUOTE_MINIMAL
    if is_empty:
        # Write header row only if file is new/empty
        writer.writerow(["question"])
    # Now append each question, letting csv.writer quote any embedded newline or commas
    for batch in all_questions:
        for q in batch["questions"]:
            print(f"Writing question: {q['question']}")
            writer.writerow([q["question"]])  # Each question is a single cell, so we wrap it in a list


Writing question: A particle of mass $m$ moves in a two-dimensional potential given by $V(x, y) = \frac{1}{2}kx^2 + \frac{1}{2}ly^2 + \alpha xy$, where $k$, $l$, and $\alpha$ are positive constants, and $\alpha^2 < kl$.  Determine the condition on the *initial displacements* (expressed as a ratio) that will excite only the *lower* frequency normal mode of oscillation for this system.
Writing question: A uniform solid cylinder of mass $M$ and radius $R$ is placed on an inclined plane that makes an angle $\theta$ with the horizontal. The cylinder is released from rest and rolls without slipping down the plane. A light, inextensible string is wrapped around a groove of radius $r < R$ on the cylinder. The string passes over a massless, frictionless pulley at the top of the incline and is attached to a hanging mass $m$. Determine the value of the hanging mass $m$ that will hold the cylinder stationary on the incline.
Writing question: A rocket of initial mass $m_0$ (including fuel) moves ve

In [15]:
# Display the collected question batches as this cell's output.
all_questions

[{'questions': [{'unique_id': 'b0da8844e3454e8b9804e467501aba7b',
    'question': 'A particle of mass $m$ moves in a two-dimensional potential given by $V(x, y) = \\frac{1}{2}kx^2 + \\frac{1}{2}ly^2 + \\alpha xy$, where $k$, $l$, and $\\alpha$ are positive constants, and $\\alpha^2 < kl$.  Determine the condition on the *initial displacements* (expressed as a ratio) that will excite only the *lower* frequency normal mode of oscillation for this system.',
    'topics': ['Induced Representations',
     'Semidirect Products',
     'Character Theory'],
    'concepts': ['Induced Representation via Tensor Product',
     'Artin-Wedderburn Decomposition',
     'Principal Series Representations',
     'Characters of GL2(q)'],
    'file_conversation_log': 'logs/9db7b97f189a410b81a397336cb3fc54.log'},
   {'unique_id': '07d80337083b4801ba309bddda40241e',
    'question': 'A uniform solid cylinder of mass $M$ and radius $R$ is placed on an inclined plane that makes an angle $\\theta$ with the horizo