## load libraries

In [27]:
import os
import csv 
import sys
import json
import re
import pandas as pd
import numpy as np
import ollama


### Prompt format 

In [28]:
# Formatting rules embedded verbatim into every prompt sent to the model.
# This is a raw string, so each backslash below reaches the model exactly as
# written: one backslash per LaTeX command and `\\` for an `aligned` line break.
# (The misspelled name is kept because the rest of the file refers to it.)
guidlines = r"""
**LaTeX Guidelines:**
* **General:** Use LaTeX for ALL mathematical notation ONLY. Use Markdown for tables/lists ONLY; other text unformatted must be in plain text.
* **Delimiters:** Inline math: `\( your_latex \)`. Display math (standalone equations on own lines): `\[ your_latex \]`. AVOID `$` or `$$`.
* **Multi-Step Equations:** ALWAYS use `\begin{aligned} ... \end{aligned}`. Align with `&` (e.g., `&=`). Each step on a new line using `\\`.
* **Punctuation & Spacing:** Grammatically integrate and punctuate all equations. Use `\,` before terminal punctuation in display equations, between a numerical value and its unit (e.g., `\mathrm{5\,kg}`), and as a thousands separator.
* **Symbols & Text in Math:** Use LaTeX commands for symbols (e.g., `\Delta`). For text within math, use `\text{normal text with spaces}`. For units, descriptive labels, or upright text in math, use `\mathrm{upright text}` (note: `\mathrm{}` doesn't auto-space).
* **Typeface:** Variables are italic (default). Units and descriptive labels are upright/plaintext. Vectors are bold and italic (e.g., `\mathbf{v}` or `\textit{\textbf{v}}`).
* **Units:** Default to SI units unless specified. Format like `\mathrm{5\,kg}`.
* **Brackets:** Use `\left( \right)` and its counterparts for dynamic sizing.
* **Final Answers:** Do NOT round intermediate calculations. Round final answers (default two decimal places unless specified).provide the final answer in a boxed format like this:

\boxed{Your final answer here}

**Instructions:**
* Provide answers in a step-by-step, tutor-like manner.
* Provide answers in ONLY Latex and not markdown, wherever you need fomatting in the answer you should do it using Latex.
* Clearly state each step and all assumptions made.
* Perform all calculations precisely, without any approximation.
* Strictly adhere to all LaTeX Guidelines provided above.
"""

### process from json file

In [29]:
# import json

# def process_questions(input_json_path: str, output_json_path: str):
#     """
#     Reads questions from a JSON file of the form:
#       [
#         {
#           "data": [
#             {
#               "unique_id": "...",
#               "topics": [...],
#               "concepts": [...],
#               "question": "...",
#               "file_conversation_log": "..."
#             },
#             ...
#           ]
#         }
#       ]
#     Generates an explanation and answer for each entry (using your generate_prompt +
#     ollama_api_call), then writes out the same structure to output_json_path,
#     with each entry extended to include 'explanation' and 'answer'.
#     """
#     # 1. Load the input JSON
#     # with open(input_json_path, 'r') as f:
#     #     json_list = json.load(f)
        
#     questions = [
# r"""Thermodynamics of a Carbonate Decomposition  
# $$\text{CoCO}_3(s) \rightarrow \text{CoO}(s) + \text{CO}_2(g)$$  
# $\Delta H^\circ = +115.4\ \text{kJ mol}^{-1}$, $\Delta S^\circ = +147\ \text{J mol}^{-1} \text{K}^{-1}$ (298 K)  
# At what temperature (in °C) does $\Delta G^\circ = 0$?""",

# r"""Nernst Equation for a Mixed-Redox Cell  
# At 25 °C, the cell is:  
# Pt | V³⁺ (0.100 M), V²⁺ (0.0150 M) ‖ Ag⁺ (0.0500 M) | Ag  
# Standard potentials:  
# V²⁺ ⇌ V³⁺ + e E° = +0.255 V  
# Ag⁺ + e⁻ ⇌ Ag E° = +0.799 V  
# Calculate the cell potential $E_{\text{cell}}$ (in V).""",

# r"""Calculate the pH at the equivalence point when 50.0 mL of 0.100 M NH₃ is titrated with 0.100 M HCl, given $K_b(\text{NH}_3)=1.8×10^{-5}$ and $K_w=1.0×10^{-14}$.""",

# r"""Electronic solvation free energy  
# A DFT calculation gives electronic energies for methyl α-D-glucopyranoside of  
# $E_{\text{gas}} = -362.314$ hartree and $E_{\text{soln}} = -362.503$ hartree.  
# (1 hartree = 2625.5 kJ mol⁻¹.)  
# Calculate the electronic component of the solvation free energy  
# $\Delta G_{\text{solv}} = E_{\text{soln}} - E_{\text{gas}}$ in kJ mol⁻¹. Report to three decimal places.""",

# r"""Calculate the initial rate (mol$\cdot$L$^{-1}\cdot$s$^{-1}$) of cis-to-trans isomerization for a photoactivatable azobenzene derivative with quantum yield $\phi = 0.42$ under $650$ nm illumination with photon flux $1.5 \times 10^{16}$ photons$\cdot$s$^{-1}\cdot$cm$^{-2}$ in a $1$ mL solution contained in a $1$ cm$^{2}$ cuvette (pathlength $1$ cm) at initial concentration $5~\mu$M. Assume that 100% of the incident photons are absorbed by the solution (i.e., no transmission). Use Avogadro’s number $N_A = 6.022 \times 10^{23}$ mol$^{-1}$.""",

# r"""A dendritic polymer bearing eight peripheral photo-cleavable o-nitrobenzyl ester groups is dissolved at $0.5$ mM in $1$ mL solution (pathlength $1$ cm), which is continuously illuminated at $365$ nm with intensity $10$ mW$\cdot$cm$^{-2}$ for $5$ minutes.  
# Given molar extinction coefficient $\varepsilon = 4.5 \times 10^{3}$ L$\cdot$mol$^{-1}\cdot$cm$^{-1}$ and quantum yield of cleavage $\phi = 0.25$, calculate the concentration (mol$\cdot$L$^{-1}$) of cleaved ester groups produced, assuming all photons absorbed contribute to cleavage and uniform illumination across the cuvette cross-section. The polymer contains eight cleavable groups per molecule.""",

# r"""A supramolecular intracellular assembly shows sigmoidal kinetics characterized by two activation free energy barriers at $310$ K:  
# nucleation barrier $\Delta G^\ddagger_{\mathrm{nucleation}} = 75$ kJ$\cdot$mol$^{-1}$ and  
# elongation barrier $\Delta G^\ddagger_{\mathrm{elongation}} = 40$ kJ$\cdot$mol$^{-1}$.  
# Given the Arrhenius pre-exponential factor for nucleation $A_{\mathrm{nucleation}}$ is 100 times larger than that for elongation $A_{\mathrm{elongation}}$,  
# calculate the ratio of their rate constants $k_{\mathrm{nucleation}}/k_{\mathrm{elongation}}$ at $310$ K.  
# Use the gas constant $R = 8.314$ J$\cdot$mol$^{-1}\cdot$K$^{-1}$.""",

# r"""A synthetic intracellular assembly formed from peptide amphiphiles has a critical micelle concentration (CMC) of $15~\mu$M at $37^\circ$C.  
# After covalent attachment of a photo-cleavable group, the CMC increases to $45~\mu$M.  
# Define the change in standard Gibbs free energy of micellization as  
# $\Delta \Delta G^\circ = \Delta G^\circ_{\mathrm{modified}} - \Delta G^\circ_{\mathrm{original}}$.  
# Calculate $\Delta \Delta G^\circ$ (kJ$\cdot$mol$^{-1}$) at $310$ K, assuming ideal solution behavior and using $R = 8.314$ J$\cdot$mol$^{-1}\cdot$K$^{-1}$.""",

# r"""During photochemical control of intracellular assembly, the photoinduced rate constant $k_{\mathrm{photo}}$ follows first-order kinetics and depends on photon flux $I$ (photons$\cdot$cm$^{-2}\cdot$s$^{-1}$) as $k_{\mathrm{photo}} = \sigma I$, where $\sigma$ is the absorption cross-section.  
# Given $\sigma = 2.5 \times 10^{-17}$ cm$^{2}$ and a target $k_{\mathrm{photo}} = 0.01$ s$^{-1}$ under $700$ nm illumination, calculate the required light intensity $I_{\mathrm{W}}$ in W$\cdot$cm$^{-2}$.  
# Use Planck’s constant $h = 6.626 \times 10^{-34}$ J$\cdot$s and speed of light $c = 3.00 \times 10^{8}$ m$\cdot$s$^{-1}$.  
# Show all steps starting from determining the photon flux $I$.""",

# r"""Upon NIR irradiation at 800 nm, a photoactivatable supramolecular monomer is illuminated in a $1$ cm pathlength cuvette with monomer concentration $10~\mu$M and molar absorptivity $\varepsilon = 5{,}000$ L$\cdot$mol$^{-1}\cdot$cm$^{-1}$.  
# Incident light intensity is $2 \times 10^{15}$ photons$\cdot$s$^{-1}\cdot$cm$^{-2}$ over a $0.5$ cm$^{2}$ cross-sectional area.  
# Calculate the fraction of incident photons absorbed by the solution, assuming Beer–Lambert law applies and uniform illumination.""",

# r"""Activation free energy $\Delta G^\ddagger$  
# Data:  
# • NMR coalescence at $T_c = 268$ K; $\Delta\nu = 120$ Hz  
# • Rate at coalescence $k_c = \pi \cdot \Delta\nu/\sqrt{2}$  
# • Eyring: $k_c = (k_B T_c/h)\cdot e^{–\Delta G^\ddagger/(R T_c)}$  
# • $R = 8.314$ J$\cdot$K⁻¹$\cdot$mol⁻¹  
# You may calculate $(k_B T_c/h)$ in one line.  
# Report $\Delta G^\ddagger$ in kJ$\cdot$mol⁻¹ to three significant figures.""",

# r"""**By given Spectroscopic Data Analysis, predict the name of the structure:**

# **Molecular Formula:** $C_{20}H_{16}FN_{5}O_{2}S_{2}$

# **IR (KBr, cm$^{-1}$):**

# * 3352: N–H stretching (secondary amine)  
# * 3220: N–H stretching (secondary amide)  
# * 2891: C–H stretching ($\mathsf{-CH_3}$ group)  
# * 2982: C–H stretching (aromatic ring)  
# * 2772: C–H stretching ($\mathsf{-CH_2}$ group)  
# * 1704: C=O stretching (secondary amide)  
# * 1552: C=C stretching (aromatic ring)  
# * 1290: C–H bending  
# * 1194, 1088: C–O–C stretching  
# * 1118: C–F stretching  
# * 775: 1,2-disubstituted benzene ring  

# **$^{1}$H NMR (300 MHz, DMSO-d$_{6}$, $\delta$ ppm):**

# * 9.17 (s, 1H, Het–NH–CO–Ar)  
# * 8.03–6.58 (m, 9H, Ar–H)  
# * 4.44 (s, 2H, Het–CH$_{2}$)  
# * 4.02 (s, 1H, Ar–NH–CH$_{2}$)  
# * 2.47 (s, 3H, –N–C(CH$_{3}$)–C–)  

# **$^{13}$C NMR (100 MHz, DMSO-d$_{6}$, $\delta$ ppm):**

# $$
# 177.8,\ 166.3,\ 163.2,\ 156.2,\ 155.1,\ 154.2,\ 133.9,\ 132.5,\ 130.9\ (\times2),\ 129.2,\ 128.0,\ 125.7,\ 120.4,\ 116.3,\ 115.1,\ 104.1,\ 70.2,\ 17.4
# $$

# **LC-MS:**  
# $$
# m/z = 441.07\ (\mathrm{[M]^+})
# $$"""
# ]

#     all_questions = []
#     for question in questions:
#     #     # 1. Load the input JSON

#     # 2. Iterate through all entries and augment
#     # for container in json_list:
#     #     for entry in container.get('data', []):
#     #         q_text = entry.get('question')
#     #         if not q_text:
#     #             # skip anything without a question
#     #             continue

#     #         prompt = generate_prompt(q_text)
#     #         print(f"Processing question: {q_text}")
#     #         explanation, answer = ollama_api_call(prompt)
#     #         print(f"Generated explanation: {explanation}")
#     #         print(f"Extracted answer: {answer}")
#     #         entry['explanation'] = explanation
#     #         entry['answer'] = answer
#       prompt = generate_prompt(question)
#       print(f"Processing question: {question}")
#       explanation, answer = ollama_api_call(prompt)
#       print(f"Generated explanation: {explanation}")
#       print(f"Extracted answer: {answer}")
#       all_questions.append({
          
#       'question': question,
#           'explanation': explanation,
#           'answer': answer
#       })



#     return all_questions
#     # # 3. Dump back out
#     # with open(output_json_path, 'w') as f:
#     #     json.dump(json_list, f, indent=2, ensure_ascii=False)






# # Example usage:
# processed = process_questions('final_questions_physics.json','output_with_answers.json')


### Complete answer generation and refining

In [30]:
import pandas as pd
import csv
def generate_prompt_initial_answer(question,guidelines):
    """Build the first-draft prompt for the tutor model.

    The prompt consists of a fixed role sentence, the question, and the
    formatting guidelines, each on its own four-space-indented line.

    Args:
        question: Text of the question to be answered.
        guidelines: Formatting rules the answer must follow.

    Returns:
        The assembled prompt string.
    """
    pieces = (
        "",
        "    You are an Experienced tutor who need to provide answer to the provided question. ",
        f"    Question: {question}",
        "    Your responses MUST strictly follow these guidelines:",
        f"    {guidelines}",
        "    ",
    )
    return "\n".join(pieces)

from together import Together
# Load settings from a .env file before anything reads the environment.
from dotenv import load_dotenv
load_dotenv()
# Together client setup is currently disabled; model calls in this file go
# through `ollama.chat` instead.
# TOGETHER_API_KEY = os.getenv("TOGETHER_API_KEY")
# client = Together(api_key=TOGETHER_API_KEY)
def ollama_api_call(prompt):
    """Send *prompt* as a single user message via ``ollama.chat`` and return the reply text.

    The request always targets the ``gemma3:1b`` model. When the reply
    contains a closing ``</think>`` tag, only the text after the last such
    tag is returned, with surrounding whitespace stripped; otherwise the
    reply is returned unchanged.
    """
    reply = ollama.chat(
        model="gemma3:1b",
        messages=[{"role": "user", "content": prompt}],
    )
    text = reply["message"]["content"]

    # rpartition yields an empty separator when the tag is absent, which lets
    # us leave tag-free replies untouched (no stripping) as before.
    _, closing_tag, remainder = text.rpartition("</think>")
    return remainder.strip() if closing_tag else text
import os
import pandas as pd



In [None]:
def checker_prompt(question, answer,guidlines):
    """Build the prompt that asks a checker model to review an answer.

    The prompt contains the question, the answer under review, the
    guidelines, and a fixed 14-point rubric with ready-made feedback
    sentences.

    The rubric is full of LaTeX snippets (braces and backslashes), so it is
    kept in raw strings and joined by concatenation. Passing it through
    ``str.format`` would treat ``{aligned}`` and friends as replacement fields
    and raise, and a non-raw literal would turn sequences such as the start of
    "begin", "text" and "right" commands into control characters.

    Args:
        question: Text of the original question.
        answer: The answer produced by the first model call.
        guidlines: Formatting rules the answer was supposed to follow.

    Returns:
        The assembled checker prompt string.
    """
    preamble = r"""
    You are a meticulous checker LLM. Your task is to evaluate a "Maker LLM's" response against a set of strict instructions. Go through each instruction *one by one* and determine if the Maker LLM has followed it. Provide specific, actionable feedback for each instruction that was *not* followed. If an instruction was followed perfectly, you do not need to mention it. If all instructions were followed and the Maker LLM performed exceptionally, state "No feedback".
"""

    # Item 10 asks for a boxed final answer: the guidelines require it and the
    # downstream answer extraction looks for it.
    rubric = r"""**Evaluation:**

1.  **Role Adherence:**
    * *Check:* Did the Maker LLM act as an "Experienced tutor"?
    * *Feedback (if not followed):* "The response did not fully embody the persona of an 'Experienced tutor'. Consider using more tutorial-like language and explaining concepts as if to a student."

2.  **LaTeX Guidelines - General:**
    * *Check:* Is LaTeX used for ALL mathematical notation? Is Markdown used ONLY for tables/lists? Is other text unformatted?
    * *Feedback (if not followed):* "Ensure all mathematical notation strictly uses LaTeX. Avoid using plain text or Markdown for mathematical expressions. Markdown should be reserved only for tables and lists."

3.  **LaTeX Guidelines - Delimiters:**
    * *Check:* Are inline math delimiters `\( \)` used? Are display math delimiters `\[ \]` used? Are `$` or `$$` avoided?
    * *Feedback (if not followed):* "Please correct the LaTeX delimiters. Use `\( \)` for inline math and `\[ \]` for display math. Do not use `$` or `$$`."

4.  **LaTeX Guidelines - Multi-Step Equations:**
    * *Check:* Is `\begin{aligned} ... \end{aligned}` used for multi-step equations? Is alignment with `&` used? Is each step on a new line using `\\`?
    * *Feedback (if not followed):* "For multi-step equations, always use `\begin{aligned} ... \end{aligned}`. Ensure proper alignment with `&` and use `\\` for new lines between steps."

5.  **LaTeX Guidelines - Punctuation & Spacing:**
    * *Check:* Are equations grammatically integrated and punctuated? Is `\,` used before terminal punctuation in display equations? Is `\,` used between a numerical value and its unit? Is `\,` used as a thousands separator?
    * *Feedback (if not followed):* "Review punctuation and spacing within LaTeX. Ensure equations are grammatically integrated. Use `\,` for spacing before terminal punctuation, between numerical values and units (e.g., `\mathrm{5\,kg}`), and as a thousands separator."

6.  **LaTeX Guidelines - Symbols & Text in Math:**
    * *Check:* Are LaTeX commands used for symbols (e.g., `\Delta`)? Is `\text{normal text with spaces}` used for text within math? Is `\mathrm{upright text}` used for units, descriptive labels, or upright text in math?
    * *Feedback (if not followed):* "Make sure to use correct LaTeX commands for symbols (e.g., `\Delta`). Use `\text{}` for normal text within math and `\mathrm{}` for units or upright text."

7.  **LaTeX Guidelines - Typeface:**
    * *Check:* Are variables italic (default)? Are units and descriptive labels upright/plaintext? Are vectors bold and italic (`\mathbf{v}` or `\textit{\textbf{v}}`)?
    * *Feedback (if not followed):* "Verify typeface consistency. Variables should be italic by default. Units and descriptive labels must be upright. Vectors should be bold and italic."

8.  **LaTeX Guidelines - Units:**
    * *Check:* Are SI units used by default? Is formatting `\mathrm{5\,kg}` used?
    * *Feedback (if not followed):* "Please ensure SI units are used unless otherwise specified, and format them correctly using `\mathrm{}` with `\,` for spacing (e.g., `\mathrm{5\,kg}`)."

9.  **LaTeX Guidelines - Brackets:**
    * *Check:* Are `\left( \right)` and its counterparts used for dynamic sizing?
    * *Feedback (if not followed):* "Utilize `\left( \right)` and corresponding dynamic sizing brackets (e.g., `\left[ \right]`, `\left\{ \right\}`) for all parenthetical expressions."

10. **LaTeX Guidelines - Final Answers:**
    * *Check:* Are intermediate calculations *not* rounded? Are final answers rounded to two decimal places (or as specified)? Is the final answer presented with `\boxed{}`?
    * *Feedback (if not followed):* "Do not round any intermediate calculations. Ensure final answers are rounded to two decimal places (unless specified otherwise). Present the final answer using `\boxed{}`."

11. **Instructions - Step-by-step & Tutor-like:**
    * *Check:* Is the answer provided in a step-by-step, tutor-like manner?
    * *Feedback (if not followed):* "The response needs to be more clearly structured in a step-by-step, tutor-like manner to guide the student effectively."

12. **Instructions - Assumptions:**
    * *Check:* Are all assumptions clearly stated?
    * *Feedback (if not followed):* "Please explicitly state all assumptions made during the problem-solving process."

13. **Instructions - Precise Calculations:**
    * *Check:* Are all calculations performed precisely, without any approximation (except for final rounding)?
    * *Feedback (if not followed):* "Review your calculations to ensure precision and avoid approximation at intermediate steps. Only round the final answer."

14. **Instructions - Adherence to LaTeX Guidelines:**
    * *Check:* Does the response *strictly* adhere to all LaTeX Guidelines? (This is an overarching check, and specific feedback from points 2-10 will cover most of it, but this serves as a final reinforcement).
    * *Feedback (if not followed):* "There are still instances where the LaTeX Guidelines were not strictly adhered to. Please review all LaTeX guidelines carefully and ensure full compliance."

    """

    # The caller-supplied values are inserted by plain concatenation, so any
    # braces or backslashes they contain pass through untouched.
    return (
        preamble
        + f"    Question: {question}\n"
        + f"    Answer: {answer}\n"
        + f"    Guidelines: {guidlines}\n"
        + rubric
    )
def accept_feedback_prompt(question, answer, feedback, guidlines):
    """Build the prompt that asks the model to revise an answer using checker feedback.

    Args:
        question: Text of the original question.
        answer: The answer that is to be improved.
        feedback: Reviewer feedback on that answer.
        guidlines: Formatting rules the revised answer must follow.

    Returns:
        The assembled revision prompt string.
    """
    return (
        "\n"
        "    You are a expert tasked with improving the answer based on the feedback provided on guidelines.\n"
        "    Please provide a revised answer that addresses the feedback.\n"
        f"    Question: {question}\n"
        f"    Answer: {answer}\n"
        f"    Guidelines: {guidlines}\n"
        f"    Feedback: {feedback}\n"
        "\n"
        "    "
    )
def _extract_boxed(text):
    r"""Return the contents of the first ``\boxed{...}`` found in *text*, or ``""``.

    Braces are matched by depth so nested groups such as
    ``\boxed{\mathrm{5\,kg}}`` are returned whole and text after the closing
    brace is not swallowed. Content may span several lines.
    """
    marker = "\\boxed{"
    start = text.find(marker)
    if start == -1:
        return ""

    content_start = start + len(marker)
    depth = 1
    pos = content_start
    while pos < len(text):
        ch = text[pos]
        if ch == "\\":
            # Skip the escaped character (e.g. the brace in \{ or \}) so it
            # does not affect the nesting depth.
            pos += 2
            continue
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return text[content_start:pos].strip()
        pos += 1

    # Braces never balanced: fall back to the previous greedy single-line
    # match so malformed output is handled no worse than before.
    fallback = re.search(r'\\boxed\{(.+)\}', text)
    return fallback.group(1).strip() if fallback else ""


def accept_feedback(question, answer, guidlines):
    """Have the model critique an answer, then revise it using that critique.

    Two model calls are made through ``ollama_api_call``: the first with the
    checker prompt to obtain feedback, the second with the revision prompt
    that includes that feedback. The feedback is printed for inspection.

    Args:
        question: Text of the original question.
        answer: First-draft answer to be reviewed.
        guidlines: Formatting rules used by both prompts.

    Returns:
        A ``(complete_answer, boxed_answer)`` tuple: the full revised reply
        and the contents of its boxed final answer (``""`` when there is none).
    """
    feedback = ollama_api_call(checker_prompt(question, answer, guidlines))
    print("Suyash checker output:", feedback)

    revision_prompt = accept_feedback_prompt(
        question, answer, feedback=feedback, guidlines=guidlines
    )
    complete_answer = ollama_api_call(revision_prompt)

    return complete_answer, _extract_boxed(complete_answer)



In [32]:
def preprocess_csv(file_path, *, question_col=4, first_row=2, last_row=3):
    """Generate and refine answers for a window of questions read from a CSV file.

    For every selected row the question is sent to the model for a first
    draft, the draft goes through ``accept_feedback`` for a checker-driven
    revision, and the results are stored in two new columns of the in-memory
    DataFrame. Progress is reported with ``print``. Nothing is written to
    disk and nothing is returned (saving is currently disabled, see the end
    of the function).

    Args:
        file_path: Path of the comma-separated, UTF-8 encoded input file.
        question_col: Zero-based position of the column holding the question.
        first_row: Zero-based position of the first data row to process.
        last_row: Zero-based position of the last data row to process
            (inclusive).
    """
    df = pd.read_csv(file_path, sep=',', encoding='utf-8')

    # Result columns; rows that are not processed keep the empty string.
    df['explaination'] = ''
    df['answer'] = ''

    # read_csv assigns a default 0..n-1 index here, so the positional window
    # and the index labels used with df.at below coincide.
    for idx, row in df.iloc[first_row:last_row + 1].iterrows():
        raw_question = row.iloc[question_col]
        if pd.isna(raw_question):
            # An empty cell would otherwise be sent to the model as "nan".
            continue
        question = str(raw_question).strip()
        if not question:
            continue

        prompt = generate_prompt_initial_answer(question, guidlines)

        # First-draft answer from the model.
        explaination = ollama_api_call(prompt)

        print(f"Suyash Question: {question}")
        print(f"Suyash First Explanation: {explaination}")

        # Checker pass followed by a revision of the draft.
        complete_answer, boxed_answer = accept_feedback(question, explaination, guidlines)
        print(f"Suyash Complete Answer: {complete_answer}")
        print(f"Suyash Boxed Answer: {boxed_answer}")

        # NOTE(review): the stored explanation is the first draft while the
        # stored answer comes from the revised reply - confirm this is intended.
        df.at[idx, 'explaination'] = explaination
        df.at[idx, 'answer'] = boxed_answer

    # Saving is disabled for now. To enable, write next to the input file
    # with an "_with_answer.csv" suffix:
    # base, _ = os.path.splitext(file_path)
    # output_path = f"{base}_with_answer.csv"
    # df.to_csv(output_path, index=False)
    # print(f"✅  Output written to: {output_path}")

# Run the pipeline on the chemistry evaluation sheet (path is relative to the
# current working directory).
preprocess_csv("Synthetic Data Evaluation  - Chemistry (1).csv")  


Suyash Question: Base-Promoted Elimination (single output)  
Provide the full IUPAC name (including stereochemistry) of the single major alkene formed when (R)-3-bromo-4-methylhexane is treated with excess KOtBu in tert-butanol at 40 °C.
Suyash First Explanation: ```latex
\textbf{Reaction:}
The reaction is the addition of KOtBu to (R)-3-bromo-4-methylhexane, forming a single alkene.
\begin{enumerate}
    \item The reaction is carried out in tert-butanol at 40 °C.
    \item KOtBu is a strong base, which deprotonates the alpha carbon of the alkyl halide.
    \item The resulting enolate ion attacks the carbonyl group of the tert-butanol.
    \item The mechanism involves the formation of an intermediate that undergoes elimination to produce the alkene.
\end{enumerate}

\textbf{Step 1:} KOtBu acts as a strong base, removing the proton from the alpha carbon of (R)-3-bromo-4-methylhexane. This forms an enolate ion.  The enolate ion is stabilized by the adjacent tert-butyl group.
\begin{align*