In [4]:
import os
import glob
import json
import math
import random
import re
from collections import Counter
import networkx as nx
import pdfplumber
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()

# Read the OpenAI API key from the OPENAI_API_KEY environment variable
# (os.getenv returns None when it is unset) and build the module-level client
# that the extraction and question-generation functions below call.
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

# -- Document Loading ------------------------------------------------------

def load_docs_from_dir(dir_path):
    """
    Load all .pdf files from a directory and extract their full text.

    Files are visited in sorted path order so that the document order is
    reproducible between runs (glob returns paths in arbitrary order).
    A PDF that raises while being opened or parsed is reported with a
    warning and skipped.

    Returns a tuple (docs, filenames):
      docs      -- list of strings, one per loaded PDF, pages joined by "\n"
      filenames -- list of the corresponding base file names, same order
    """
    docs = []
    filenames = []
    for filepath in sorted(glob.glob(os.path.join(dir_path, '*.pdf'))):
        try:
            with pdfplumber.open(filepath) as pdf:
                # `or ""` keeps a page without extractable text from breaking the join
                text_pages = [page.extract_text() or "" for page in pdf.pages]
        except Exception as e:
            # skip unreadable PDFs
            print(f"Warning: could not load {filepath}: {e}")
            continue
        docs.append("\n".join(text_pages))
        filenames.append(os.path.basename(filepath))
    return docs, filenames

# -- 1. Extract topics and key concepts -----------------------------------

def extract_json_from_text(text):
    """
    Extract the JSON payload from an LLM reply.

    Lookup order:
      1. The contents of the first markdown code fence (``` or ```json).
      2. The outermost {...} or [...] span in the raw text, trying the one
         that starts earliest first and returning the first span that parses
         as JSON. Arrays are handled so that a reply such as
         'Here you go: [{"q": 1}, {"q": 2}]' is not clipped to the invalid
         '{"q": 1}, {"q": 2}'.
      3. If no span parses, the {...} span when there is one (so the caller
         still sees a JSONDecodeError for it), otherwise the text unchanged.

    Returns a string; the caller is responsible for json.loads().
    """
    # Try to extract JSON from markdown code blocks
    fenced = re.search(r'```(?:json)?\s*(.*?)\s*```', text, re.DOTALL)
    if fenced:
        return fenced.group(1).strip()

    # Collect the outermost object and array spans, earliest opener first
    spans = []
    for opener, closer in (("{", "}"), ("[", "]")):
        start = text.find(opener)
        end = text.rfind(closer)
        if start != -1 and end > start:
            spans.append((start, text[start:end + 1]))
    spans.sort()

    for _, candidate in spans:
        try:
            json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        return candidate

    # Nothing parsed: fall back to the object span if there is one
    for _, candidate in spans:
        if candidate.startswith("{"):
            return candidate

    # Return original text if no JSON structure found
    return text

def extract_concepts_from_docs(doc_texts, filenames, model="gpt-4.1-nano"):
    """
    Call the OpenAI chat API to extract topics and key concepts from each document.

    doc_texts -- list of document texts
    filenames -- names aligned with doc_texts; a missing name falls back to "doc_<i>"
    model     -- chat model name passed to the API

    Returns one dict per document, in input order, with keys 'filename',
    'topics' and 'key_concepts'. When the reply is not valid JSON, is not a
    JSON object, or does not carry both keys as lists, the document still
    gets a record with empty 'topics' and 'key_concepts' and a warning is
    printed. Errors raised by the API call itself are not caught.
    """
    extractions = []
    for i, text in enumerate(doc_texts):
        filename = filenames[i] if i < len(filenames) else f"doc_{i}"

        prompt = (
            "Extract high-level topics and key concepts from the following document. "
            f"Return JSON with keys 'topics' and 'key_concepts'.\n\n{text}"
        )
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are an expert summarizer. Return your response as a JSON object with 'topics' and 'key_concepts' as arrays."},
                {"role": "user", "content": prompt}
            ],
        )

        output = response.choices[0].message.content
        print("Raw output:", output)

        # Start from the empty record; it is only filled in on a valid reply
        extraction = {"filename": filename, "topics": [], "key_concepts": []}

        try:
            # Clean and extract JSON from the output (`or ""` guards a None reply)
            data = json.loads(extract_json_from_text(output or ""))
        except json.JSONDecodeError as e:
            print(f"Failed to parse JSON: {e}")
        else:
            if not isinstance(data, dict):
                # e.g. the model returned a bare array; data.keys() would raise here
                print(f"Warning: Parsed JSON doesn't have expected keys: {type(data).__name__}")
            elif isinstance(data.get("topics"), list) and isinstance(data.get("key_concepts"), list):
                extraction["topics"] = data["topics"]
                extraction["key_concepts"] = data["key_concepts"]
                print(f"Successfully extracted {len(data['topics'])} topics and {len(data['key_concepts'])} key concepts from {filename}")
            else:
                print(f"Warning: Parsed JSON doesn't have expected keys: {data.keys()}")

        extractions.append(extraction)

    return extractions

# -- 2. Construct the concept graph ----------------------------------------

def build_concept_graph(extractions, eps=1e-6):
    """
    Build a unified graph G whose nodes are all topics and key concepts.

    Two nodes are connected when they appear in the same document; the edge
    weight is log(count + eps), where count is the number of documents in
    which the pair co-occurs (so a pair seen once has a weight close to 0).

    Every topic and key concept becomes a node, including ones that never
    co-occur with anything (e.g. the only concept of a document), so that
    later lookups such as G[node] cannot fail for a known concept. A concept
    listed more than once in the same document is counted once, which avoids
    self-loops and inflated pair counts.

    Returns: G (nx.Graph), topic_nodes (set), kc_nodes (set)
    """
    freq = Counter()
    all_topics, all_kcs = set(), set()

    for ex in extractions:
        all_topics.update(ex["topics"])
        all_kcs.update(ex["key_concepts"])

        # Order-preserving de-duplication of this document's concepts
        nodes = list(dict.fromkeys(ex["topics"] + ex["key_concepts"]))
        for i, first in enumerate(nodes):
            for second in nodes[i + 1:]:
                # sorted() gives one canonical key per undirected pair
                u, v = sorted((first, second))
                freq[(u, v)] += 1

    G = nx.Graph()
    G.add_nodes_from(all_topics | all_kcs)
    for (u, v), f in freq.items():
        G.add_edge(u, v, weight=math.log(f + eps))

    print(f"Graph built with {len(all_topics)} topics and {len(all_kcs)} key concepts")
    return G, all_topics, all_kcs

# -- Helpers for sampling --------------------------------------------------

def softmax(weights):
    """
    Return the softmax probabilities of `weights` as a list of floats.

    The largest weight is subtracted before exponentiating. This leaves the
    result mathematically unchanged but prevents OverflowError for large
    weights and an all-zero result for very negative ones. An empty input
    yields an empty list.
    """
    weights = list(weights)
    if not weights:
        return []
    peak = max(weights)
    exps = [math.exp(w - peak) for w in weights]
    total = sum(exps)  # >= 1.0, because the peak itself contributes exp(0)
    return [e / total for e in exps]


def random_walk(G, start, steps):
    """
    Random walk on graph G for at most `steps` steps from `start`.

    The next node is drawn among the current node's neighbours with
    probabilities given by a softmax over the connecting edge weights.
    The walk stops early when the current node has no neighbours, or when
    `start` is not a node of G (for instance a concept that was filtered
    out by G.subgraph); in both cases the path collected so far is returned
    instead of raising KeyError.

    Returns the list of visited nodes, beginning with `start`.
    """
    path = [start]
    if start not in G:
        return path

    current = start
    for _ in range(steps):
        nbrs = list(G[current])
        if not nbrs:
            break
        weights = [G[current][n]['weight'] for n in nbrs]
        probs = softmax(weights)
        current = random.choices(nbrs, probs)[0]
        path.append(current)
    return path

# -- 3. Concept combination sampling ---------------------------------------

def sample_concept_combinations(
    G, topic_nodes, kc_nodes,
    num_samples=100,
    topic_walk_steps=(1, 2),
    kc_walk_steps=(3, 4)
):
    """
    Generate sampled sets of topics and key concepts via multi-stage random walks.

    For each sample:
      1. pick a random start topic and walk the topic-only subgraph for a
         number of steps drawn from `topic_walk_steps`;
      2. collect the key concepts adjacent to any visited topic, pick one at
         random, and walk the key-concept-only subgraph for a number of steps
         drawn from `kc_walk_steps`.

    `topic_nodes` and `kc_nodes` may be any iterables; they are converted to
    sets. Topics that are not nodes of G cannot start a walk and are skipped.

    Returns a list of dicts {'topics': list, 'key_concepts': list}; lists are
    used (not sets) so the result is JSON serializable. Returns [] when there
    is no topic to start from.
    """
    topic_nodes = set(topic_nodes)
    kc_nodes = set(kc_nodes)

    # Safety check
    if not topic_nodes:
        print("Error: No topics found. Cannot sample combinations.")
        return []

    G_topic = G.subgraph(topic_nodes)
    G_topic_kc = G.subgraph(topic_nodes | kc_nodes)
    G_kc = G.subgraph(kc_nodes)
    samples = []

    # subgraph() silently drops names that are not in G; walking from such a
    # name would raise KeyError, so only keep topics that are real nodes.
    topics_list = [t for t in topic_nodes if t in G_topic]
    if not topics_list:
        print("Error: No topics found. Cannot sample combinations.")
        return []
    print(f"Sampling from {len(topics_list)} topics")

    for _ in range(num_samples):
        t0 = random.choice(topics_list)
        t_steps = random.choice(topic_walk_steps)
        sampled_topics = set(random_walk(G_topic, t0, t_steps))

        # Key concepts directly linked to any sampled topic (repeats are kept,
        # so concepts linked to several topics are proportionally more likely)
        kc_cands = [nbr for t in sampled_topics for nbr in G_topic_kc[t] if nbr in kc_nodes]
        if kc_cands:
            k0 = random.choice(kc_cands)
            k_steps = random.choice(kc_walk_steps)
            sampled_kcs = set(random_walk(G_kc, k0, k_steps))
        else:
            sampled_kcs = set()

        samples.append({
            "topics": list(sampled_topics),  # Convert sets to lists for JSON serialization
            "key_concepts": list(sampled_kcs)
        })

    return samples

# -- 4. Question generation ------------------------------------------------

def generate_questions_for_samples(combos, docs, extractions, model="gpt-4o", max_samples=100):
    """
    Generate questions for sampled concept combinations.

    For each combo, the (up to) two documents whose concept sets have the
    highest Jaccard similarity with the combo are used as reference texts,
    and the chat model is asked for a JSON array of questions.

    combos      -- list of dicts with 'topics' and 'key_concepts'
    docs        -- list of document texts
    extractions -- per-document dicts with 'filename', 'topics', 'key_concepts',
                   aligned by index with `docs`
    model       -- chat model name passed to the API
    max_samples -- process at most this many combos (None = all of them)

    Only documents that have both a text and an extraction are considered.
    When there are none, the prompt is sent without reference documents
    instead of failing with IndexError.

    Returns a list of dicts with keys 'id', 'topics', 'key_concepts',
    'reference_files' and 'questions'. When the reply cannot be parsed as
    JSON, 'questions' holds the raw reply as its single element.
    """
    # The system prompt does not depend on the combo, so build it once.
    system_prompt = """
Each question must follow these instructions:
Model a Physical Scenario: Start from a real-world or idealized setup. Avoid abstract Physics problems or purely conceptual statements.
Target a Solvable Quantity: Ask for a clear symbolic expression of a physical variable (e.g., tension, acceleration, energy).
Force Multi-Step Reasoning: Ensure the question involves a sequence of physics laws, transformations, and derivations to reach the answer.
Avoid Redundancy: Exclude extraneous details or variables that do not impact the final solution.
Be Unique: Do not rephrase standard textbook problems; ensure originality and complexity.
Single solution: Expect a single symbolic expression, unambiguous, presented in LaTeX. Multiple equivalent algebraic forms are allowed. No equations or floating-point approximations.
Use rigorous, concise phrasing.
Avoid colloquial or ambiguous terminology.
Units must be consistent; symbols should follow standard notation.
"""
    # Index i must be valid in both `docs` and `extractions`
    usable = min(len(docs), len(extractions))
    doc_concepts = [set(ex['topics'] + ex['key_concepts']) for ex in extractions[:usable]]

    selected = combos if max_samples is None else combos[:max(max_samples, 0)]

    results = []
    for i, combo in enumerate(selected):
        combo_id = f"combo_{i+1}"
        kg = set(combo['topics']) | set(combo['key_concepts'])

        # Jaccard similarity between the combo and every document
        sims = [
            (len(kg & dc) / (len(kg | dc) or 1), idx)
            for idx, dc in enumerate(doc_concepts)
        ]
        sims.sort(reverse=True)
        top_idxs = [idx for _, idx in sims[:2]]
        refs = [docs[idx] for idx in top_idxs]
        ref_files = [extractions[idx]["filename"] for idx in top_idxs]

        prompt = (
            f"Generate a set of difficult Physics questions based on the following:\n"
            f"Topics: {combo['topics']}\n"
            f"Key Concepts: {combo['key_concepts']}\n"
        )
        for number, ref in enumerate(refs, start=1):
            prompt += f"Reference Doc {number}:\n{ref}\n"
        prompt += "Return a JSON array of questions."

        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}
            ]
        )
        content = response.choices[0].message.content

        try:
            # Clean and extract JSON from the output
            json_str = extract_json_from_text(content)
            questions = json.loads(json_str)
            if not isinstance(questions, list):
                # If the output is an object with a questions key
                if isinstance(questions, dict) and "questions" in questions:
                    questions = questions["questions"]
                else:
                    questions = [str(questions)]
        except json.JSONDecodeError:
            questions = [content]

        results.append({
            "id": combo_id,
            "topics": combo['topics'],
            "key_concepts": combo['key_concepts'],
            "reference_files": ref_files,
            "questions": questions
        })

    return results

# -- Save outputs to files ------------------------------------------------

def save_extractions(extractions, output_file="document_extractions.json"):
    """Write each document's filename, topics and key concepts to a JSON file."""
    # list() turns set-valued fields into something json can serialize;
    # only the three known keys are written.
    payload = [
        {
            "filename": record["filename"],
            "topics": list(record["topics"]),
            "key_concepts": list(record["key_concepts"]),
        }
        for record in extractions
    ]

    with open(output_file, "w") as handle:
        json.dump(payload, handle, indent=2)

    print(f"Saved document extractions to {output_file}")

def save_questions_with_topics(questions, output_file="questions_with_topics.json"):
    """Serialize the generated questions (with their topic combinations) as indented JSON."""
    with open(output_file, "w") as handle:
        json.dump(questions, handle, indent=2)

    print(f"Saved questions with topic combinations to {output_file}")

# -- Main Execution --------------------------------------------------------

if __name__ == "__main__":
    # Pipeline driver: load PDFs -> extract topics/key concepts -> build the
    # co-occurrence graph -> sample concept combinations. Each stage aborts
    # with exit status 1 when it produces nothing to work with.
    # SystemExit is raised directly because exit() is an interactive helper
    # injected by the `site` module and is not meant for use in programs.

    # Create output directory if it doesn't exist
    output_dir = "output_new"
    os.makedirs(output_dir, exist_ok=True)

    # adjust this path to where your .pdf docs live
    docs_dir = "doc_new/"

    print("Loading documents...")
    docs, filenames = load_docs_from_dir(docs_dir)
    print(f"Loaded {len(docs)} documents")

    if not docs:
        print("No documents found. Please check the docs directory.")
        raise SystemExit(1)

    # 1) Extract topics & KCs
    print("\nExtracting topics and key concepts...")
    extractions = extract_concepts_from_docs(docs, filenames)

    # Save extractions to file
    save_extractions(extractions, os.path.join(output_dir, "document_extractions.json"))

    # Verify we have valid extractions
    if not any(ex["topics"] or ex["key_concepts"] for ex in extractions):
        print("No valid topics or key concepts extracted. Check your data and API responses.")
        raise SystemExit(1)

    # 2) Build graph
    print("\nBuilding concept graph...")
    G, topic_nodes, kc_nodes = build_concept_graph(extractions)

    if not topic_nodes:
        print("No topics found in the graph. Cannot proceed.")
        raise SystemExit(1)

    # 3) Sample combinations
    print("\nSampling concept combinations...")
    combos = sample_concept_combinations(G, topic_nodes, kc_nodes, num_samples=50)  # Reduced for testing

    if not combos:
        print("Failed to generate concept combinations.")
        raise SystemExit(1)

    # Save topic combinations
    with open(os.path.join(output_dir, "topic_combinations.json"), "w") as f:
        json.dump(combos, f, indent=2)
    
#     # 4) Generate questions
#     print("\nGenerating questions for each combination...")
#     q_outputs = generate_questions_for_samples(combos, docs, extractions)
    
#     # Save questions with topics
#     save_questions_with_topics(q_outputs, os.path.join(output_dir, "questions_with_topics.json"))
    
#     # Display results
#     print("\n===== GENERATED QUESTIONS =====")
#     for idx, out in enumerate(q_outputs, 1):
#         print(f"\nSample {idx}:")
#         print(f"Topics: {out['topics']}")
#         print(f"Key Concepts: {out['key_concepts']}")
#         print(f"Reference Files: {out['reference_files']}")
#         print("Questions:")
#         for q in out['questions']:
#             print(f" - {q}")
    
#     print(f"\nAll outputs saved to directory: {output_dir}") 

# if __name__ == "__main__":
#     # Create output directory if it doesn't exist
#     # Create output directory if it doesn't exist
#     output_dir = "output_irodov"
#     os.makedirs(output_dir, exist_ok=True)
    
#     # adjust this path to where your .pdf docs live
#     docs_dir = "irodov_docs/"
    
#     print("Loading documents...")
#     docs, filenames = load_docs_from_dir(docs_dir)
#     print(f"Loaded {len(docs)} documents")
#     document_extractions = []
#     with open("output_irodov/document_extractions.json", "r") as f:
#         document_extractions = json.load(f)
#     for doc in document_extractions:
#         print(doc)
#         break

#     topic_combinations = []
#     with open("output_irodov/topic_combinations.json", "r") as f:
#         topic_combinations = json.load(f)
#     for combo in topic_combinations:
#         print(combo)
#         break

    # # generate questions 
    # print("Generating questions...")
    # q_outputs = generate_questions_for_samples(topic_combinations, docs, document_extractions)

    # # Save questions with topics
    # save_questions_with_topics(q_outputs, os.path.join(output_dir, "questions_with_topics.json"))
    
    # # Display results
    # print("\n===== GENERATED QUESTIONS =====")
    # for idx, out in enumerate(q_outputs, 1):
    #     print(f"\nSample {idx}:")
    #     print(f"Topics: {out['topics']}")
    #     print(f"Key Concepts: {out['key_concepts']}")
    #     print(f"Reference Files: {out['reference_files']}")
    #     print("Questions:")
    #     for q in out['questions']:
    #         print(f" - {q}")
    
    # print(f"\nAll outputs saved to directory: {output_dir}")

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Loading documents...


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Loaded 46 documents

Extracting topics and key concepts...
Raw output: {
  "topics": [
    "Effective Quark Models with Nonlocal Interactions",
    "Properties and Structure of Light and Heavy Mesons",
    "Bethe-Salpeter Equation and Separable Interaction Kernels",
    "Calculation of Meson Decay Constants and Form Factors",
    "Electromagnetic Decays of Pseudoscalar and Vector Mesons",
    "Transition Form Factors of Pions and Heavy Mesons",
    "Radiative Decays of Heavy Quarkonia and Vector Mesons",
    "Theoretical Methods and Loop Integration Techniques in Meson Physics",
    "Comparison with Experimental Data and Lattice QCD Results",
    "Applications to Light- and Heavy-Flavor Meson Phenomenology"
  ],
  "key_concepts": [
    "Nonlocal Gaussian Interaction Kernel",
    "Bethe-Salpeter Equation in Ladder Approximation",
    "Meson Vertex Function and Normalization Conditions",
    "Electromagnetic Transition Form Factors (F(Q^2))",
    "Two-photon Decay Widths (π0 → γγ, η → γγ

### download data from csv

In [2]:
import os
import argparse
import pandas as pd
import requests
from urllib.parse import urlparse, unquote

def download_files(
    csv_path="Research Papers - Physics_New.csv",
    output_dir="doc_new",
    timeout=60,
) -> None:
    """
    Read a CSV with a 'Link' column and download each URL into output_dir,
    saving each file with a .pdf extension.

    csv_path   -- path of the CSV file
    output_dir -- destination directory (created when missing)
    timeout    -- seconds passed to requests.get; without it a stalled server
                  would block the whole run indefinitely

    The file name is the last component of the decoded URL path when it ends
    in .pdf, otherwise "<row index>.pdf". The path is decoded before the
    last component is taken, so an encoded separator (%2F) cannot make the
    file land outside output_dir. Data is streamed to a temporary ".part"
    file that is renamed on success, so a failed download never leaves a
    truncated .pdf behind.

    Failures are reported per URL and do not stop the remaining downloads.
    Raises ValueError when the CSV has no 'Link' column.
    """
    # Load CSV
    df = pd.read_csv(csv_path)
    if 'Link' not in df.columns:
        raise ValueError("CSV must contain a 'Link' column.")

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    links = df['Link'].dropna()
    total = len(links)
    for idx, url in enumerate(links, start=1):
        part_path = None
        try:
            # Decode first, then keep only the final path component
            raw_name = os.path.basename(unquote(urlparse(url).path))
            ext = os.path.splitext(raw_name)[1]
            # If there's no .pdf extension (or no name at all), fall back to the index
            filename = raw_name if ext.lower() == '.pdf' else f"{idx}.pdf"

            out_path = os.path.join(output_dir, filename)
            part_path = out_path + ".part"

            # Stream download
            with requests.get(url, stream=True, timeout=timeout) as r:
                r.raise_for_status()
                with open(part_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            os.replace(part_path, out_path)

            print(f"[{idx}/{total}] Downloaded → {filename}")

        except Exception as e:
            # Best effort: report, drop any partial file, continue with the next URL
            if part_path is not None and os.path.exists(part_path):
                os.remove(part_path)
            print(f"[{idx}/{total}] FAILED  → {url}\n    {e}")

# When this cell/script is executed directly, run the download with the
# default CSV path and output directory declared in download_files().
if __name__ == "__main__":
    download_files()


[1/46] Downloaded → 1.pdf
[2/46] Downloaded → 2.pdf
[3/46] Downloaded → 3.pdf
[4/46] Downloaded → 4.pdf
[5/46] Downloaded → 5.pdf
[6/46] Downloaded → 6.pdf
[7/46] Downloaded → 7.pdf
[8/46] Downloaded → 8.pdf
[9/46] Downloaded → 9.pdf
[10/46] Downloaded → 10.pdf
[11/46] Downloaded → 11.pdf
[12/46] Downloaded → 12.pdf
[13/46] Downloaded → 13.pdf
[14/46] Downloaded → 14.pdf
[15/46] Downloaded → 15.pdf
[16/46] Downloaded → 16.pdf
[17/46] Downloaded → 17.pdf
[18/46] Downloaded → 18.pdf
[19/46] Downloaded → 19.pdf
[20/46] Downloaded → 20.pdf
[21/46] Downloaded → 21.pdf
[22/46] Downloaded → 22.pdf
[23/46] Downloaded → 23.pdf
[24/46] Downloaded → 24.pdf
[25/46] Downloaded → 25.pdf
[26/46] Downloaded → 26.pdf
[27/46] Downloaded → 27.pdf
[28/46] Downloaded → 28.pdf
[29/46] Downloaded → 29.pdf
[30/46] Downloaded → 30.pdf
[31/46] Downloaded → 31.pdf
[32/46] Downloaded → 32.pdf
[33/46] Downloaded → 33.pdf
[34/46] Downloaded → 34.pdf
[35/46] Downloaded → 35.pdf
[36/46] Downloaded → 36.pdf
[37/46] Do

## Author - Critique Model

In [7]:
import os
import re
import uuid
import json
from dotenv import load_dotenv
import os
import logging
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate, FewShotPromptTemplate
from langchain.chains import LLMChain
from langchain_google_genai import ChatGoogleGenerativeAI

# Load environment variables from .env
load_dotenv()

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

def author_critique_loop(
    topics: list[str],
    concepts: list[str],
    guidelines: str,
    few_shot_examples: list[dict],
    max_turns: int = 3,
) -> list[dict]:
    """
    Runs an iterative author-critic loop to generate and refine physics questions.

    Turn 0: the author model writes questions and the critic reviews them.
    Turns 1 .. max_turns-1: the author refines the questions using the latest
    feedback; the critic reviews every refinement except the last one.

    Logs the entire multi-turn conversation (author + critic) to a single
    UTF-8 file under logs/. The file is closed even when a model call raises.

    Args:
        topics: topics the questions should cover.
        concepts: key concepts the questions should cover.
        guidelines: guideline text given to both author and critic.
        few_shot_examples: dicts with an "example" key, shown to the author.
        max_turns: total number of author turns (generation + refinements).

    Returns a list of dicts, one per <question>...</question> block found in
    the final author output, with:
        unique_id: str,
        question: str,
        topics: list[str],
        concepts: list[str],
        file_conversation_log: str  # path to the full conversation log
    """
    # Create a unique run-level conversation log
    run_id = uuid.uuid4().hex
    os.makedirs('logs', exist_ok=True)
    conv_log_path = f"logs/{run_id}.log"

    # Initialize LLMs (temperature locked to 1.0)
    # author_llm = ChatOpenAI(model_name="gpt-4.1-mini", temperature=1.0)
    # critic_llm = ChatOpenAI(model_name="gpt-4.1", temperature=1.0)
    gemini_api = os.getenv("GEMINI_API_KEY")
    critic_llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash-preview-05-20", temperature=1.0, google_api_key=gemini_api)
    author_llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=1.0, google_api_key=gemini_api)

    # Build Few-Shot prompt for author
    example_prompt = PromptTemplate(
        input_variables=["example"],
        template="Example:\n{{example}}\n---",
        template_format="jinja2"
    )
    author_fs_prompt = FewShotPromptTemplate(
        examples=few_shot_examples,
        example_prompt=example_prompt,
        prefix=(
            "You are a physics education specialist. Generate medium novel Undergraduate level physics questions. "
            "Each question must start with <question> and end with </question> "
        ),
        suffix=(
            "Now, based on the topics: {{topics}}, key concepts: {{concepts}}, and guidelines: {{guidelines}}, "
            "produce list of new, high-quality physics questions, each wrapped in <question>...</question> tags"
        ),
        input_variables=["topics", "concepts", "guidelines"],
        template_format="jinja2",
    )
    author_chain = LLMChain(llm=author_llm, prompt=author_fs_prompt)

    # Build Critic prompt
    critic_prompt = PromptTemplate(
        input_variables=["questions", "guidelines"],
        template=(
            "You are a physics assessment expert. Critique the following questions:\n"
            "{questions}\n"
            "Evaluate them against these guidelines:\n{guidelines}\n"
            # The original literals were joined without a space ("improve.the markdown")
            "Provide concise, actionable feedback on how to improve. "
            "The markdown LaTeX syntax must be correct, and the questions must be wrapped in <question>...</question> tags.\n"
        ),
    )
    critic_chain = LLMChain(llm=critic_llm, prompt=critic_prompt)

    # Build the refinement prompt once; it is the same for every turn
    refine_prompt = PromptTemplate(
        input_variables=["questions", "feedback"],
        template=(
            "Refine these questions based on the feedback:\n{questions}\n"
            "Feedback:\n{feedback}\n"
            "Return an improved numbered list of physics questions, each wrapped in <question>...</question> tags."
        ),
    )
    refine_chain = LLMChain(llm=author_llm, prompt=refine_prompt)

    # utf-8: model output routinely contains non-ASCII symbols; the context
    # manager closes the log even if one of the chain calls raises.
    with open(conv_log_path, 'w', encoding='utf-8') as conv_logger:

        def log(msg: str):
            conv_logger.write(msg + "\n")
            conv_logger.flush()

        # Turn 0: Generation
        log("=== Turn 0: Author generates questions ===")
        questions_text = author_chain.run(
            topics=topics, concepts=concepts, guidelines=guidelines
        )
        log(questions_text)

        # Turn 0: Critic feedback
        log("=== Turn 0: Critic feedback ===")
        feedback = critic_chain.run(questions=questions_text, guidelines=guidelines)
        log(feedback)

        # Refinement turns
        for turn in range(1, max_turns):
            log(f"=== Turn {turn}: Author refines questions ===")
            questions_text = refine_chain.run(questions=questions_text, feedback=feedback)
            log(questions_text)
            if turn == max_turns - 1:
                # Nothing would consume further feedback, so skip the critic
                log("=== Final questions generated ===")
                break
            log(f"=== Turn {turn}: Critic feedback ===")
            feedback = critic_chain.run(questions=questions_text, guidelines=guidelines)
            log(feedback)

    # Parse final questions into structured entries
    entries: list[dict] = []
    pattern = re.compile(r'<question>(.*?)</question>', re.DOTALL)
    for match in pattern.findall(questions_text):
        question_body = match.strip()
        uid = uuid.uuid4().hex
        entries.append({
            "unique_id": uid,
            "topics": topics,
            "concepts": concepts,
            "question": question_body,
            "file_conversation_log": conv_log_path
        })
        print(f"Generated question: {question_body}")

    return entries


    # Example usage
    # topics = [
    #     "Standard Model of Particle Physics",
    #     "Electroweak Symmetry Breaking",
    #     "Elementary Particles and Forces",
    # ]
    # concepts = [
    #     "Theories like the Seesaw Mechanism, Majorana Neutrinos, and Sterile Neutrinos address neutrino masses.",
    #     "Neutrino oscillations suggest nonzero masses, contradicting the Standard Model's massless assumption.",
    #     "Radiative corrections adjust particle masses at higher energy scales, posing a fine-tuning problem.",
    #     "Quantum Field Theory is foundational to the Standard Model, explaining fundamental forces excluding gravity.",
    # ]
    # take the combination of topics and concepts from the file output_apple/topic_combinations.json

# Load the sampled topic/key-concept combinations written by the graph pipeline.
with open("output_new/topic_combinations.json", "r") as f:
    topic_combinations = json.load(f)
# Skip the first 20 combinations (hard-coded offset).
topic_combinations = topic_combinations[20:]
# topics = topic_combinations[0]['topics']

# Guidelines given to both the author and the critic. The items are joined
# with newlines: as adjacent string literals they were concatenated with no
# separator at all, so one item ran straight into the next
# ("...physical reasoning.2. Expect answers...").
# The numbering skips 4 in the original list; the item texts are kept as-is.
guidelines = "\n".join([
    r"1. Model a Physics Scenario: Start from a real-world or idealized setup that requires conceptual understanding and physical reasoning.",
    r"2. Expect answers as either a symbolic expression (e.g., $F = ma$) or a single numerical value with appropriate SI units (e.g., $a = 9.8 \, \mathrm{m/s^2}$).",
    r"3. Force Multi-Step Reasoning: Ensure the solution involves two or more physical principles or steps ",
    r"5. Be Unique: Do not copy or rephrase standard textbook problems. Create novel, conceptually rich scenarios.",
    r"6. Single Solution: Each question must yield only one correct symbolic or numerical result. Multiple equivalent symbolic forms are acceptable; no ambiguous answers.",
    r"7. Use rigorous, concise phrasing: Formulate questions in clear, professional language appropriate for advanced learners. Make sure no Hints are included in the questions.",
    r"8. All assumptions must be clearly stated, and all variables must be defined for each question separately. ",
    r"9. Units must be consistent; all numerical answers must include correct SI units. Symbols should follow conventional physics notation (e.g., $v$ for velocity, $E$ for energy).",
    r"10. Questions must not be multipart: Each question must focus on solving for a single target quantity only. and no hints should be provided.",
    r"11. Question Formatting: Format the equations, numbers and variables using LaTeX syntax such that all inline math, numbers, variables are enclosed by $...$ and block math is enclosed by $$...$$. Avoid markdown for all unformatted data. ",
])
# Few-shot examples passed to author_critique_loop. Each dict provides the
# "example" variable of the author's example template. Raw triple-quoted
# strings keep the LaTeX backslashes and the embedded line breaks literal.
# The first example below is commented out and therefore not sent.
few_shot_examples = [
#         {"example": r"""Unique Stationary Point of a Relaxion  
# The relaxion potential is  
#  $V(φ) = g Λ^3 φ + Λ_b^4 cos(φ/f)$,  
# with the given inequality  
#  $g Λ^3 f < Λ_b^4$.  
# Show that there is exactly one solution $φ∈(0,πf)$ of  
#  $\frac{dV}{dφ} = 0$ 
# and express that $φ$ in closed form."""},
        {"example": r"""The wave function for a Gaussian wave packet is given by $$\braket{x'|\psi}=(2\pi d^2)^{-\frac{4}{37}}exp\left[ \frac{i\braket{p}x'}{\bar h}- \frac{(x'-\braket{x})^2}{53d^2}\right]$$which satisfies the minimum uncertainty relation $\Delta x \cdot \Delta p=\frac{\bar h}{2}.$ Find $\braket{x'| \Delta x | \psi}.$"""},
    {"example": r"""Using the Variational Principle, find out the energy of the first excited state of a damped oscillator with a trial wave function 
$\psi(x,y) = De^{\frac{-bt}{2m}}cos(\omega_{damp}t + \varphi)$, with $\omega_{damp} = \sqrt{\frac{k}{m} - (\frac{b}{m}})^2$, as long as $b^2 < 4mk$, where $D,\, k,\, b$ and $m$ are arbitrary constants; $t$ is the duration of the oscillation, $\varphi$ is a phase constant, and $\omega_{damp}$ is the angular frequency of a damped oscillator."""}
   , {"example": r"""Using the Variational Principle, find out the energy of the first excited state of a damped oscillator with a trial wave function $$\psi_n(x,t)=N_n(t)\cdot e^{-2\gamma t}\cdot H_n\left(m\omega_{damp}^\frac{1}{2}x^2e^{\frac{\gamma}{2}t}\right)\cdot \exp\left[-m\omega_{damp}^2\hbar x^2e^{-\gamma t}+i\theta_n(t,x)\right]$$ 
Where, $N_n(t)$ is a normalization constant, 
$\theta(t,x)$ includes time- and position-dependent phase terms, and
$ω_{damp}$ is the angular frequency of a damped oscillator."""}
    ]

import csv

# Run the author-critic loop for every combination and persist the results.
# The writes live in a `finally` block so that questions generated before an
# interruption (e.g. KeyboardInterrupt during a long run) or an API error are
# still saved; the exception itself continues to propagate afterwards.
all_questions = []
file_path = "questions_6june_physics.csv"
try:
    for combo in topic_combinations:
        topics = combo['topics']
        concepts = combo['key_concepts']
        print(f"Topics: {topics}")
        print(f"Key Concepts: {concepts}")
        final_questions = author_critique_loop(
            topics, concepts, guidelines, few_shot_examples, max_turns=3
        )
        all_questions.append({"data": final_questions})
finally:
    # Save all questions to a file
    with open("final_questions_physics.json", "w") as f:
        json.dump(all_questions, f, indent=2)

    # Append the question texts to the CSV; the header is written only when
    # the file is new or empty.
    file_exists = os.path.isfile(file_path)
    is_empty = (not file_exists) or (os.path.getsize(file_path) == 0)
    # Open with newline='' so that csv.writer can quote internal newlines correctly
    with open(file_path, "a", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)  # default delimiter=',' and quoting=QUOTE_MINIMAL
        if is_empty:
            # Write header row only if file is new/empty
            writer.writerow(["question"])
        # Now append each question, letting csv.writer quote any embedded newline or commas
        for batch in all_questions:
            for q in batch["data"]:
                print(f"Writing question: {q['question']}")
                writer.writerow([q["question"]])  # Each question is a single cell, so we wrap it in a list


Topics: ['Universal operations on time-frequency modes', 'Quantum interference with time-frequency modes', 'Theoretical modeling and validation of quantum experiments']
Key Concepts: ['Simulation and measurement of collision-free four-photon events', 'High spectral purity and indistinguishability of generated photons', 'Electro-optic phase modulation for frequency mixing']
Generated question: Consider a quantum experiment designed to investigate time-frequency entanglement. Two photons, A and B, are generated through spontaneous parametric down-conversion (SPDC) in a type-I crystal. Assume that initially, photon A is in a frequency mode $\omega_A$ and photon B is in a frequency mode $\omega_B$. Photon A then passes through an electro-optic modulator (EOM), which applies a time-dependent phase $\phi(t) = \alpha \sin(\Omega t)$, where $\alpha$ is the modulation amplitude and $\Omega$ is the modulation frequency. Assume only the carrier and first-order sidebands contribute significantly t



KeyboardInterrupt: 

In [8]:
    # Save all questions to a file
import csv

# Persist whatever is currently held in `all_questions` in three formats:
# a JSON dump, a one-column CSV, and a Markdown file.
# Every file is opened with an explicit UTF-8 encoding so that non-ASCII
# characters in the question text do not depend on the platform's default
# encoding.

# 1) JSON dump of the full structure (overwritten on every run).
with open("final_questions_physics.json", "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=2)

file_path = "questions_6june_physics.csv"
file_exists = os.path.isfile(file_path)
is_empty = (not file_exists) or (os.path.getsize(file_path) == 0)
# Open with newline='' so that csv.writer can quote internal newlines correctly.
# The file is opened in append mode, so re-running this cell adds the same
# rows again.
with open(file_path, "a", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)  # default delimiter=',' and quoting=QUOTE_MINIMAL
    if is_empty:
        # Write header row only if file is new/empty
        writer.writerow(["question"])
    # csv.writer quotes any embedded newlines or commas
    for batch in all_questions:
        for q in batch["data"]:
            print(f"Writing question: {q['question']}")
            writer.writerow([q["question"]])

# 2) write a Markdown file you can grab directly (overwritten on every run)
with open("final_questions_physics.md", "w", encoding="utf-8") as md:
    for batch in all_questions:
        for q in batch["data"]:
            # Heading carrying the question's unique ID
            md.write(f"### Question ID: {q['unique_id']}\n")
            md.write("**Question:**\n")
            # q["question"] already contains things like $\theta$, so it is
            # written verbatim
            md.write(q["question"] + "\n\n")

Writing question: Consider a quantum experiment designed to investigate time-frequency entanglement. Two photons, A and B, are generated through spontaneous parametric down-conversion (SPDC) in a type-I crystal. Assume that initially, photon A is in a frequency mode $\omega_A$ and photon B is in a frequency mode $\omega_B$. Photon A then passes through an electro-optic modulator (EOM), which applies a time-dependent phase $\phi(t) = \alpha \sin(\Omega t)$, where $\alpha$ is the modulation amplitude and $\Omega$ is the modulation frequency. Assume only the carrier and first-order sidebands contribute significantly to the detected frequencies. Both photons then pass through a symmetric beam splitter with amplitude transmissivity $T$ and amplitude reflectivity $R$, such that $|T|^2+|R|^2=1$. Assuming the frequency difference makes the photons distinguishable at the beam splitter inputs, calculate the probability, $P_{D1,D2}$, of detecting one photon at detector D1 with frequency $\omega_A