## Graph-based question generation

https://openreview.net/pdf?id=CEE9cAQJ10

In [14]:
#!/usr/bin/env python3
"""
physics_qapipeline.py

Graph-based Synthetic QA Pipeline for Physics:
1. Extract & cluster KPs
2. Build Knowledge Point Relationships Graph (KPRG)
3. Enumerate KP combinations (one-hop, two-hop, three-hop, community)
4. Generate new problems & solutions
5. (Optional) Evaluate problems & solutions
6. Save KPRG and synthetic QA pairs
"""

import argparse
from itertools import combinations
import networkx as nx
import pandas as pd
from together import Together
import json
# load env
import os
from dotenv import load_dotenv
import logging

# Load environment variables from .env file
load_dotenv()

# Configure logging once for the whole script, then create the module logger.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# Legacy alias: `dlogging` was originally bound to this same logger object.
# Kept only so existing references keep working; new code should use `logger`.
dlogging = logger

# Retrieve API key.
# SECURITY: never print or log the key itself -- notebook output and log files
# are routinely shared and committed.
api_key = os.getenv("TOGETHER_API_KEY")
if not api_key:
    logger.warning("TOGETHER_API_KEY is not set; Together API calls are expected to fail")

# Initialize Together client
client = Together(api_key=api_key)  # requires TOGETHER_API_KEY in env

# --- 1. Extract Knowledge Points ---

def extract_knowledge_points(question: str, solution: str) -> list[str]:
    """
    Extract knowledge points from a question and its solution.

    Sends the problem/solution pair to the LLM and parses the numbered list
    ("1. ...", "2. ...", up to item 10) out of the reply.

    Args:
        question: Text of the physics problem.
        solution: Text of the worked solution.

    Returns:
        The non-empty knowledge points in the order they appear in the reply.
        An empty list is returned when the LLM call or the parsing fails
        (the error is logged, not raised).
    """
    import re  # local import: the file's top-level imports do not include `re`

    logger.info(f"Extracting knowledge points from question: {question[:100]}...")
    
    prompt = f"""As a physics education specialist, analyze the given physics problem and its solution to
extract specific physics knowledge points. Provide ≤10 concise, directly relevant points.

Physics Problem: {question}
Solution: {solution}

Output format:
Knowledge Points:
1.
2.
..."""
    try:
        resp = client.chat.completions.create(
            model="Qwen/Qwen3-235B-A22B-fp8-tput",
            messages=[{"role":"user","content":prompt}]
        )
        logger.info("LLM call completed successfully")
        
        output = resp.choices[0].message.content or ""
        logger.info(f"Raw LLM output: {output}")

        # Reasoning models may emit a "<think>...</think>" preamble that can
        # itself contain numbered lists; only parse what follows the last
        # closing tag (the whole reply when there is none).
        answer = re.split(r"</think\s*>", output, flags=re.IGNORECASE)[-1]

        # A list item is "<number>." or "<number>)" followed by whitespace and
        # text. Requiring the whitespace keeps lines such as "3.5 m/s ..." or
        # "10 m/s ..." from being mistaken for items (the latter previously
        # raised IndexError and discarded every extracted point).
        item_pattern = re.compile(r"^\s*(\d{1,2})[.)]\s+(\S.*?)\s*$")
        kps = []
        for line in answer.splitlines():
            match = item_pattern.match(line)
            if match and 1 <= int(match.group(1)) <= 10:
                kps.append(match.group(2))
        
        logger.info(f"Extracted {len(kps)} knowledge points:")
        for i, kp in enumerate(kps, 1):
            logger.info(f"{i}. {kp}")
            
        return kps
    except Exception as e:
        # Best-effort by design: the caller skips rows that yield no points.
        logger.error(f"Error extracting knowledge points: {e}")
        return []

def build_kp_graph(clustered_kps_list):
    """
    Build the knowledge-point co-occurrence graph.

    Every knowledge point becomes a node. Two points are joined by an edge
    when they appear together in at least one inner list; the edge's
    ``weight`` attribute counts how many pairings of the two were seen.

    Args:
        clustered_kps_list: Sequence of knowledge-point lists (one per problem).

    Returns:
        An undirected ``networkx.Graph``.
    """
    graph = nx.Graph()

    # Pass 1: register every knowledge point, so isolated points still
    # show up as nodes.
    for group in clustered_kps_list:
        for point in group:
            graph.add_node(point)

    # Pass 2: count pairings, keyed by the sorted pair so that (a, b) and
    # (b, a) share one counter.
    pair_counts = {}
    for group in clustered_kps_list:
        for first, second in combinations(group, 2):
            key = tuple(sorted((first, second)))
            if key in pair_counts:
                pair_counts[key] += 1
            else:
                pair_counts[key] = 1

    # Pass 3: materialise the counters as weighted edges.
    for key, count in pair_counts.items():
        left, right = key
        graph.add_edge(left, right, weight=count)

    return graph

def extract_kp_combinations(G: nx.Graph) -> list[tuple[str,...]]:
    """
    Extract knowledge point combinations from the graph.

    Four families of combinations are collected, in this order:
      1. one-hop: every edge ``(u, v)``;
      2. two-hop: every node pair whose shortest-path distance is exactly 2;
      3. three-hop: pairs ``(core, node)`` where ``core`` is the
         highest-degree node and ``node`` is at distance exactly 3 from it;
      4. community: maximal cliques of exactly three nodes.

    Args:
        G: Knowledge-point relationship graph.

    Returns:
        De-duplicated list of tuples, first occurrence order preserved.
        Returns an empty list when the graph has no nodes or when any step
        raises (the error is logged, not raised).
    """
    logger.info("Extracting knowledge point combinations from graph...")
    
    # Check if graph is empty
    if not G.nodes():
        logger.error("Graph is empty - no nodes found")
        return []
        
    combos = []
    try:
        # one-hop: direct edges
        combos += [(u, v) for u, v in G.edges()]
        logger.info(f"Found {len(combos)} one-hop combinations")
        
        # two-hop: nodes at distance exactly 2.
        # One cutoff-bounded BFS per source replaces the previous two full
        # traversals (has_path + shortest_path_length) per node *pair*.
        # Only nodes that come after `u` in node order are paired with it, so
        # the emitted pairs and their order match combinations(G.nodes(), 2).
        nodes = list(G.nodes())
        two_hop = []
        for idx, u in enumerate(nodes):
            dist = nx.single_source_shortest_path_length(G, u, cutoff=2)
            two_hop.extend((u, v) for v in nodes[idx + 1:] if dist.get(v) == 2)
        combos.extend(two_hop)
        logger.info(f"Found {len(two_hop)} two-hop combinations")
        
        # three-hop: pair the core KP (highest-degree node) with every node at
        # distance exactly 3. The graph is non-empty here, so max() cannot
        # fail on an empty sequence.
        core = max(G.degree, key=lambda x: x[1])[0]
        core_dist = nx.single_source_shortest_path_length(G, core, cutoff=3)
        three_hop = [(core, node) for node in nodes if core_dist.get(node) == 3]
        combos.extend(three_hop)
        logger.info(f"Found {len(three_hop)} three-hop combinations")
        
        # community: cliques of size 3 (find_cliques yields maximal cliques,
        # so triangles contained in a larger clique are not reported)
        cliques = [tuple(clique) for clique in nx.find_cliques(G) if len(clique) == 3]
        combos.extend(cliques)
        logger.info(f"Found {len(cliques)} clique combinations")
        
        # Remove duplicates while preserving order
        unique_combos = list(dict.fromkeys(combos))
        logger.info(f"Total unique combinations found: {len(unique_combos)}")
        return unique_combos
        
    except Exception as e:
        logger.error(f"Error extracting combinations: {e}")
        return []

def cluster_knowledge_points(kps: list[str]) -> str:
    """
    Cluster a list of knowledge points and return a representative point.

    Asks the LLM to pick the point that best represents the group and parses
    the "Best Knowledge Point: ..." line from its reply.

    Args:
        kps: Knowledge points belonging to one cluster.

    Returns:
        The representative knowledge point. Falls back to ``kps[0]`` when the
        reply contains no usable choice or the LLM call fails, and returns
        ``""`` when ``kps`` is empty.
    """
    if not kps:
        logger.warning("No knowledge points to cluster")
        return ""
        
    logger.info(f"Clustering {len(kps)} knowledge points")
    bullets = "\n".join(f"- {kp}" for kp in kps)
    
    prompt = f"""Given these physics KPs, pick the one best representing the group:
{bullets}

Provide:
Best Knowledge Point: <your choice>
Reason: <brief>"""
    
    label = "best knowledge point:"
    try:
        resp = client.chat.completions.create(
            model="Qwen/Qwen3-235B-A22B-fp8-tput",
            messages=[{"role":"user","content":prompt}]
        )
        logger.info("LLM call completed successfully")
        
        output = resp.choices[0].message.content or ""
        logger.info(f"Raw LLM output: {output}")
        
        for line in output.splitlines():
            # Tolerate indentation, list/quote markers and markdown emphasis
            # around the label (e.g. "**Best Knowledge Point:** X"), and
            # match the label case-insensitively.
            candidate = line.strip().lstrip("*#-> ").strip()
            if candidate.lower().startswith(label):
                rep = candidate[len(label):].strip().strip("*").strip()
                if rep:
                    logger.info(f"Selected representative point: {rep}")
                    return rep
                
        logger.warning("No representative point found in output")
        return kps[0]  # fallback to first point
        
    except Exception as e:
        logger.error(f"Error clustering knowledge points: {e}")
        return kps[0]  # fallback to first point (kps is non-empty here)

# --- 5. Generate new problem for any KP combo ---
def generate_new_problem(kp_list: list[str]) -> str:
    """
    Generate a new physics problem that combines the given knowledge points.

    Args:
        kp_list: Knowledge points the generated problem must draw on.

    Returns:
        The problem statement, i.e. the text after the "New Problem:" label
        (or the whole visible reply when the label is missing). Returns ``""``
        when the LLM call fails or the reply holds nothing but a reasoning
        trace; callers treat the empty string as "skip this combination".
    """
    import re  # local import: the file's top-level imports do not include `re`

    body = "\n".join(f"knowledge point {i+1}: {kp}" for i, kp in enumerate(kp_list))
    prompt = f"""You are a physics teacher. Using these knowledge points, construct a new, original physics
problem requiring their combined understanding. Ensure no logic errors and sufficient difficulty.
and follow these guidelines:
        "1. Model a Physical Scenario: Start from a real-world or idealized setup."
        "2. Target a Solvable Quantity: Ask for a clear symbolic expression of a physical variable (e.g., tension, acceleration, energy)."
        "3. Force Multi-Step Reasoning: Ensure the question involves a sequence of physics laws, transformations, and derivations to reach the answer."
        "4. Avoid Redundancy: Exclude extraneous details or variables that do not impact the final solution."
        "5. Be Unique: Do not rephrase standard textbook problems; ensure originality and complexity."
        "6. Single solution: Expect a single symbolic expression, unambiguous, presented in LaTeX. Multiple equivalent algebraic forms are allowed. No equations or floating-point approximations."
        "7. Use rigorous, concise phrasing."
        "8. Avoid colloquial or ambiguous terminology."
        "9. Units must be consistent; symbols should follow standard notation."
{body}

Format:
New Problem: <your problem>"""
    try:
        resp = client.chat.completions.create(
            model="deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
            messages=[{"role":"user","content":prompt}]
        )
        content = resp.choices[0].message.content or ""
        logger.info(f"generate_new_problem raw output: {content!r}")

        # The model prefixes its answer with a "<think>...</think>" reasoning
        # trace. Keep only what follows the last closing tag so the trace
        # neither ends up in the problem text nor supplies a premature
        # "New Problem:" match.
        visible = re.split(r"</think\s*>", content, flags=re.IGNORECASE)[-1]
        # An opening tag that is still present was never closed, which means
        # the reply was cut off mid-reasoning; drop that unfinished trace.
        unclosed = re.search(r"<think\b", visible, flags=re.IGNORECASE)
        if unclosed:
            visible = visible[:unclosed.start()]

        # Safe extraction
        _, label_found, after_label = visible.partition("New Problem:")
        if label_found:
            problem = after_label.strip()
        else:
            # fallback to whole visible content if LLM forgot the label
            problem = visible.strip()

        if not problem:
            logger.warning(f"generate_new_problem: no problem text in reply for KP combo {kp_list}")
            return ""

        logger.info(f"generate_new_problem: extracted problem snippet: {problem[:80]}…")
        return problem

    except Exception as e:
        logger.error(f"generate_new_problem: error for KP combo {kp_list}: {e}")
        return ""

# --- 6. Generate solution (Prompt A.3) ---
# def generate_solution(problem: str) -> str:
#     prompt = f"""{problem}

# Please solve step by step and put the final answer within \\boxed{{}}."""
#     try:
#         resp = client.chat.completions.create(
#             model="Qwen/Qwen3-235B-A22B-fp8-tput",
#             messages=[{"role":"user","content":prompt}]
#         )
#         logger.info("generate_solution: LLM call completed.")
#         output = resp.choices[0].message.content
#         logger.info(f"generate_solution: solution length {len(output)} characters.")
#         return output.strip()
#     except Exception as e:
#         logger.error(f"generate_solution: error solving problem '{problem[:50]}...': {e}")
#         return ""



def main(output_graph="kp_graph.graphml", output_synth_csv="output_synth_csv.csv", max_combos=10):
    """
    Generate synthetic questions from a previously saved knowledge-point graph.

    Loads the graph from ``output_graph``, enumerates knowledge-point
    combinations, asks the LLM for one new problem per combination and writes
    the results to ``output_synth_csv``. The graph-building stage (reading
    ``merged_output.json``, extracting KPs, writing the GraphML file) is
    currently disabled and kept below as commented-out code.

    Args:
        output_graph: Path of the GraphML file to load.
        output_synth_csv: Path of the CSV file to write.
        max_combos: Maximum number of combinations to turn into questions;
            ``None`` processes all of them. Defaults to 10.

    Returns:
        None. Every failure is logged and ends the run early instead of raising.
    """
    logger.info("Starting main process...")
    
    try:
        # Read input data
        # logger.info("Reading input data from merged_output.json")
        # with open('merged_output.json', 'r') as f:
        #     data = json.load(f)
        
        # # Create DataFrame
        # rows = [
        #     {"question": entry.get("question", ""), "Solution": entry.get("Solution", "")}
        #     for entry in data.values()
        # ]
        # df = pd.DataFrame(rows)
        # df = df.head(2)
        # logger.info(f"Loaded {len(df)} rows from input data")
        
        # # Process each row to extract knowledge points
        # clustered_kps_list = []
        # for idx, row in df.iterrows():
        #     try:
        #         logger.info(f"Processing row {idx+1}/{len(df)}")
        #         kps = extract_knowledge_points(row.question, row.Solution)
        #         if not kps:
        #             logger.warning(f"Row {idx}: no KPs extracted, skipping.")
        #             continue
        #         clustered_kps_list.append(kps)
        #         logger.info(f"Row {idx}: extracted {len(kps)} KPs")
                    
        #         # rep = cluster_knowledge_points(kps)
        #         # if not rep:
        #         #     logger.warning(f"Row {idx}: clustering returned empty, skipping.")
        #         #     continue
                    
        #         # clustered_kps_list.append([rep])
        #         # logger.info(f"Row {idx}: successfully processed")
                
        #     except Exception as e:
        #         logger.error(f"Row {idx}: error processing question: {e}")
        #         continue
        
        # if not clustered_kps_list:
        #     logger.error("No knowledge points were extracted from any rows")
        #     return
            
        # # Build and save KPRG
        # logger.info("Building knowledge point relationship graph...")
        # G = build_kp_graph(clustered_kps_list)
        
        # The graph is loaded from disk rather than rebuilt (see the disabled
        # stage above).
        G = nx.read_graphml(output_graph)
        if not G.nodes():
            logger.error(f"Graph loaded from {output_graph} is empty - it contains no nodes")
            return
            
        # logger.info(f"Built graph with {len(G.nodes())} nodes and {len(G.edges())} edges")
        # nx.write_graphml(G, output_graph)
        # logger.info(f"KPRG saved to {output_graph}")
        
        # Enumerate combinations
        logger.info("Extracting knowledge point combinations...")
        combos = extract_kp_combinations(G)
        
        if not combos:
            logger.error("No combinations were found in the graph")
            return
            
        logger.info(f"Found {len(combos)} KP combinations")
        
        # Generate questions for combinations (slicing with None keeps all)
        selected = combos[:max_combos]
        total = len(selected)
        synth_qa = []
        for i, combo in enumerate(selected, 1):
            # Bound before the try so the except handler can always name it.
            combo_list = list(combo)
            try:
                logger.info(f"Processing combination {i}/{total}")
                new_q = generate_new_problem(combo_list)
                
                if not new_q:
                    logger.warning(f"Combo {combo_list}: no question generated, skipping.")
                    continue
                    
                synth_qa.append({
                    "combination": combo_list,
                    "question": new_q,
                    # Placeholder: solution generation is currently disabled.
                    "solution": "Stay tuned !! will appear soon"
                })
                logger.info(f"Successfully generated question for combination {i}")
                
            except Exception as e:
                logger.error(f"Error processing combination {combo_list}: {e}")
                continue
        
        if not synth_qa:
            logger.error("No questions were generated")
            return
            
        # Save results
        out_df = pd.DataFrame(synth_qa)
        out_df.to_csv(output_synth_csv, index=False)
        logger.info(f"Synthetic QA saved to {output_synth_csv}")
        
    except Exception as e:
        # Top-level boundary: log and stop rather than crash the notebook cell.
        logger.error(f"Error in main process: {e}")
        return


if __name__ == "__main__":
    main()
    

2025-05-15 17:48:27,073 - __main__ - INFO - Starting main process...
2025-05-15 17:48:27,082 - __main__ - INFO - Extracting knowledge point combinations...
2025-05-15 17:48:27,082 - __main__ - INFO - Extracting knowledge point combinations from graph...
2025-05-15 17:48:27,082 - __main__ - INFO - Found 630 one-hop combinations
2025-05-15 17:48:27,088 - __main__ - INFO - Found 0 two-hop combinations
2025-05-15 17:48:27,089 - __main__ - INFO - Found 0 three-hop combinations
2025-05-15 17:48:27,090 - __main__ - INFO - Found 0 clique combinations
2025-05-15 17:48:27,090 - __main__ - INFO - Total unique combinations found: 630
2025-05-15 17:48:27,090 - __main__ - INFO - Found 630 KP combinations
2025-05-15 17:48:27,090 - __main__ - INFO - Processing combination 1/10


[REDACTED: a Together API key was printed here by `print(api_key)`. Treat that key as compromised and rotate it.]


2025-05-15 17:48:37,365 - __main__ - INFO - generate_new_problem raw output: "<think>\nOkay, so I've got this physics problem to solve, and I need to think through it step by step. Let me try to break it down.\n\nThe problem is about the photoelectric effect. It says that a photon of energy E_photon is incident on a metal surface, and the work function of the metal is phi. The electrons that get ejected have a maximum kinetic energy K_max. Then, these electrons are moving in a uniform magnetic field B, creating a circular path with radius r. I need to find the stopping potential V_0 in terms of E_photon, phi, and r, using the given constants e, m, and B.\n\nAlright, let's start by recalling what I know about the photoelectric effect. The basic idea is that when light (or photons) hits a metal surface, it can eject electrons. This is the photoelectric effect. Einstein explained this with his theory, which relates the energy of the incoming photon to the kinetic energy of the ejected ele

## Author–critic model

  author = ChatOpenAI(model="gpt-4o", temperature=1.0)


2025-05-14 15:12:29,227 - __main__ - INFO - Building author chain
2025-05-14 15:12:29,228 - __main__ - INFO - Building critic chain
2025-05-14 15:12:44,187 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-14 15:12:44,191 - __main__ - INFO - Generated questions turn 0: ### Question 1

**Topic:** Standard Model of Particle Physics  
**Key Concepts:** Theories like the Seesaw Mechanism; Majorana Neutrinos

**Scenario:**  
Consider a type-I seesaw mechanism with a single generation of neutrinos. A sterile neutrino of mass \( M_S \) is introduced. The active neutrino's mass matrix is given by

\[
\mathcal{M} = \begin{bmatrix} 0 & M_D \\ M_D & M_S \end{bmatrix}
\]

where \( M_D \) is the Dirac mass term. 

**Task:**  
Develop a symbolic expression for the mass \( m_\nu \) of the active neutrino (Majorana mass) after diagonalizing the mass matrix. Assume \( M_S \gg M_D \).

**Guidance Points:**  
- Model the scenario using the approximat

Final Generated Questions:
 Below is a refined version of the questions, incorporating the feedback to enhance depth, specificity, and originality. The aim is to transform each question into a more challenging and educational task, inline with advanced learning goals.

---

### Question 1

**Topic:** Standard Model of Particle Physics  
**Key Concepts:** Seesaw Mechanism; Majorana Neutrinos; Perturbation Theory

**Scenario:**  
Consider a type-I seesaw mechanism with a single generation of neutrinos. The mass matrix for the active and sterile neutrinos is given by

\[ 
\mathcal{M} = \begin{bmatrix} 0 & M_D \\ M_D & M_S \end{bmatrix} 
\]

where \( M_D \) is the Dirac mass term, and \( M_S \) is the sterile neutrino mass with \( M_S \gg M_D \).

**Task:**  
1. Diagonalize \(\mathcal{M}\) to second-order in \( M_D/M_S \) and find the expressions for both the light and heavy neutrino masses.
2. The mixing angle \(\theta\) between active and sterile states can be expressed as \(\theta \sime

In [None]:
import pandas as pd
import re

# Load the generated question set from graphLLM_jee.csv.
df = pd.read_csv("graphLLM_jee.csv")

# The "solution" column is not needed downstream, so discard it.
df = df.drop("solution", axis="columns")

# Matches one complete <think ...>...</think> block: case-insensitive, may
# span several lines, non-greedy so neighbouring blocks are matched separately.
think_pattern = re.compile(r'<think\b.*?>.*?</think>', flags=re.DOTALL | re.IGNORECASE)


def remove_think_tags(text: str) -> str:
    """
    Strip every complete <think>...</think> block (tags included) from ``text``.

    After the blocks are removed, each run of whitespace is collapsed into a
    single space and the result is trimmed. A ``<think>`` tag with no closing
    tag is left in place. Non-string values (e.g. NaN or None) are returned
    unchanged.
    """
    if isinstance(text, str):
        without_blocks = think_pattern.sub('', text)
        single_spaced = re.sub(r'\s+', ' ', without_blocks)
        return single_spaced.strip()
    # Not a string (NaN, None, ...): hand it back untouched.
    return text

# Strip the reasoning traces from every generated question.
cleaned_questions = df['question'].apply(remove_think_tags)
df['question'] = cleaned_questions

# Preview the first five rows as a sanity check.
print(df.head(n=5))




                                         combination  \
0  ["Photoelectric effect: Photoelectric equation...   
1  ["Photoelectric effect: Photoelectric equation...   
2  ["Photoelectric effect: Photoelectric equation...   
3  ["Photoelectric effect: Photoelectric equation...   
4  ["Photoelectric effect: Photoelectric equation...   

                                            question  
0  To determine the stopping potential \( V_0 \) ...  
1  The ratio of the maximum kinetic energy of the...  
2  To determine the maximum kinetic energy of ele...  
3  To determine the stopping potential \( V_0 \) ...  
4  To solve for Planck's constant \( h \) in term...  


In [16]:
# Optionally persist the current `df` to jee_graphLM_no_solution.csv.
# index=False keeps the DataFrame index out of the file.
df.to_csv("jee_graphLM_no_solution.csv", index=False)

In [17]:
# Reload the cleaned question set written by the previous step.
df = pd.read_csv("jee_graphLM_no_solution.csv")

# print the first 5 rows 
print(df.head())

# Drop every row whose question still contains a <think> tag (an unclosed tag
# survives the block removal above).
#   regex=False -> match the tag literally
#   case=False  -> same case-insensitivity as the removal pattern
#   na=False    -> a missing question counts as "no tag" and the row is kept;
#                  without it the mask contains NaN and `~` raises TypeError
has_think_tag = df['question'].str.contains('<think>', case=False, regex=False, na=False)
df = df[~has_think_tag]

# print the first 5 rows 
print(df.head())

# Save the filtered rows. The previous target ended in ".csvv" (typo); with the
# extension fixed this overwrites the file that was read above.
df.to_csv("jee_graphLM_no_solution.csv", index=False)





                                         combination  \
0  ["Photoelectric effect: Photoelectric equation...   
1  ["Photoelectric effect: Photoelectric equation...   
2  ["Photoelectric effect: Photoelectric equation...   
3  ["Photoelectric effect: Photoelectric equation...   
4  ["Photoelectric effect: Photoelectric equation...   

                                            question  
0  To determine the stopping potential \( V_0 \) ...  
1  The ratio of the maximum kinetic energy of the...  
2  To determine the maximum kinetic energy of ele...  
3  To determine the stopping potential \( V_0 \) ...  
4  To solve for Planck's constant \( h \) in term...  
                                         combination  \
0  ["Photoelectric effect: Photoelectric equation...   
1  ["Photoelectric effect: Photoelectric equation...   
2  ["Photoelectric effect: Photoelectric equation...   
3  ["Photoelectric effect: Photoelectric equation...   
4  ["Photoelectric effect: Photoelectric equation... 