In [1]:
from pdf2image import convert_from_path

# Rasterise the scanned answer booklet: convert_from_path returns one image
# object per PDF page, in page order.
pdf_path = "2.pdf"
images = convert_from_path(pdf_path)

# Echo the list as the cell result so the page count and pixel size are visible.
images

[<PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=5512x7778>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=5512x7778>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=5512x7778>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=5512x7778>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=5512x7778>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=5512x7778>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=5512x7778>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=5512x7778>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=5512x7778>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=5512x7778>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=5512x7778>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=5512x7778>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=5512x7778>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=5512x7778>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=5512x7778>,
 <PIL.PpmI

In [2]:
import os

output_dir = "pdf_pages"
os.makedirs(output_dir, exist_ok=True)

# Write every rendered page to disk as page_1.jpg, page_2.jpg, ...
# (numbering is 1-based so file names line up with PDF page numbers).
for page_no, img in enumerate(images, start=1):
    img_path = os.path.join(output_dir, f"page_{page_no}.jpg")
    img.save(img_path, "JPEG")
    print(f"Saved: {img_path}")

Saved: pdf_pages\page_1.jpg
Saved: pdf_pages\page_2.jpg
Saved: pdf_pages\page_3.jpg
Saved: pdf_pages\page_4.jpg
Saved: pdf_pages\page_5.jpg
Saved: pdf_pages\page_6.jpg
Saved: pdf_pages\page_7.jpg
Saved: pdf_pages\page_8.jpg
Saved: pdf_pages\page_9.jpg
Saved: pdf_pages\page_10.jpg
Saved: pdf_pages\page_11.jpg
Saved: pdf_pages\page_12.jpg
Saved: pdf_pages\page_13.jpg
Saved: pdf_pages\page_14.jpg
Saved: pdf_pages\page_15.jpg
Saved: pdf_pages\page_16.jpg


In [3]:
# Preprocessing the images

import cv2
import numpy as np

def preprocess_image(image, is_handwritten=True, block_size=35, c=15, close_kernel_size=1):
    """
    Binarise a scanned page with adaptive thresholding so that uneven
    shadows or lighting do not wash out the handwriting.

    Args:
        image: NumPy image array. Accepted layouts are 2-D grayscale,
            (H, W, 1) grayscale, (H, W, 3) BGR and (H, W, 4) BGRA.
        is_handwritten: When True, the optional morphological close is
            allowed to run (see ``close_kernel_size``).
        block_size: Side length of the neighbourhood used by the adaptive
            threshold. Must be an odd integer >= 3.
        c: Constant subtracted from the neighbourhood mean.
        close_kernel_size: Side length of the square structuring element
            used to bridge small gaps in pen strokes. The default of 1
            leaves the image untouched (closing with a 1x1 element is an
            identity operation), which matches the previous behaviour of
            this function. Use 2 or 3 to actually connect broken strokes.

    Returns:
        A single-channel uint8 image with black text (0) on a white
        background (255).

    Raises:
        ValueError: If ``block_size`` or ``close_kernel_size`` is invalid.
    """
    if block_size < 3 or block_size % 2 == 0:
        raise ValueError("block_size must be an odd integer >= 3.")
    if close_kernel_size < 1:
        raise ValueError("close_kernel_size must be >= 1.")

    # Convert to grayscale, tolerating the channel layouts a caller is
    # likely to pass (PIL 'L', 'RGB' and 'RGBA' conversions).
    if image.ndim == 3 and image.shape[2] == 1:
        gray = np.ascontiguousarray(image[:, :, 0])
    elif image.ndim == 3 and image.shape[2] == 4:
        gray = cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
    elif image.ndim == 3:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    else:
        gray = image

    # Light Gaussian blur to suppress scanner noise before thresholding.
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)

    # Threshold inverted so the ink is the white foreground; morphology
    # operates on white pixels, so this is the polarity in which a close
    # actually joins neighbouring strokes.
    text_mask = cv2.adaptiveThreshold(
        blurred, 255,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY_INV,
        block_size, c
    )

    # Optional close to connect letters. Skipped for a 1x1 element because
    # it would be a no-op.
    if is_handwritten and close_kernel_size > 1:
        kernel = np.ones((close_kernel_size, close_kernel_size), np.uint8)
        text_mask = cv2.morphologyEx(text_mask, cv2.MORPH_CLOSE, kernel)

    # Invert back to black text on white for the OCR stage.
    return cv2.bitwise_not(text_mask)


In [4]:
from PIL import Image

preprocessed_images = []

for pil_image in images:
    # PIL delivers RGB; reverse the channel axis into a fresh contiguous
    # array so the OpenCV-based preprocessing receives BGR.
    cv_img = np.ascontiguousarray(np.array(pil_image)[:, :, ::-1])

    # Binarise the page using the handwritten-text settings
    processed = preprocess_image(cv_img, is_handwritten=True)
    preprocessed_images.append(processed)

preprocessed_images


[array([[255, 255, 255, ..., 255, 255, 255],
        [255, 255, 255, ..., 255, 255, 255],
        [255, 255, 255, ..., 255, 255, 255],
        ...,
        [255, 255,   0, ..., 255, 255, 255],
        [255, 255, 255, ..., 255, 255, 255],
        [255, 255, 255, ..., 255, 255, 255]], dtype=uint8),
 array([[255, 255, 255, ..., 255, 255, 255],
        [255, 255, 255, ..., 255, 255, 255],
        [255, 255, 255, ..., 255, 255, 255],
        ...,
        [255, 255, 255, ..., 255, 255, 255],
        [255, 255, 255, ..., 255, 255, 255],
        [255, 255, 255, ..., 255, 255, 255]], dtype=uint8),
 array([[255, 255, 255, ..., 255, 255, 255],
        [255, 255, 255, ..., 255, 255, 255],
        [255, 255, 255, ..., 255, 255, 255],
        ...,
        [255, 255, 255, ..., 255, 255, 255],
        [255, 255, 255, ..., 255, 255, 255],
        [255, 255, 255, ..., 255, 255, 255]], dtype=uint8),
 array([[255, 255, 255, ..., 255, 255, 255],
        [255, 255, 255, ..., 255, 255, 255],
        [255, 25

In [5]:
# Final processed images to perform ocr

import os
from PIL import Image

# Step 1: Make sure the destination folder exists
output_dir = "preprocessed_pages"
os.makedirs(output_dir, exist_ok=True)

# Step 2: Write each binarised page as page_<n>.png (1-based numbering)
for page_no, img in enumerate(preprocessed_images, start=1):
    img_path = os.path.join(output_dir, f"page_{page_no}.png")
    pil_img = Image.fromarray(img)  # NumPy array -> PIL image
    pil_img.save(img_path)
    print(f"Saved: {img_path}")


Saved: preprocessed_pages\page_1.png
Saved: preprocessed_pages\page_2.png
Saved: preprocessed_pages\page_3.png
Saved: preprocessed_pages\page_4.png
Saved: preprocessed_pages\page_5.png
Saved: preprocessed_pages\page_6.png
Saved: preprocessed_pages\page_7.png
Saved: preprocessed_pages\page_8.png
Saved: preprocessed_pages\page_9.png
Saved: preprocessed_pages\page_10.png
Saved: preprocessed_pages\page_11.png
Saved: preprocessed_pages\page_12.png
Saved: preprocessed_pages\page_13.png
Saved: preprocessed_pages\page_14.png
Saved: preprocessed_pages\page_15.png
Saved: preprocessed_pages\page_16.png


In [8]:
import os
import re
import json
import time
from PIL import Image
from pdf2image import convert_from_path
import google.generativeai as genai

# ==============================================================================
# SECTION 1: CONFIGURATION
# ==============================================================================

# --- Folder and File Paths ---
# Folder containing the preprocessed images of the answer sheets
PREPROCESSED_ANSWERS_FOLDER = "preprocessed_pages"
# Path to the PDF file containing the questions
QUESTION_PDF_PATH = "2 - question.pdf"
# Folder for the final output
FINAL_OUTPUT_FOLDER = "Student_QNA"
# The final structured output file
FINAL_JSON_OUTPUT_FILE = os.path.join(FINAL_OUTPUT_FOLDER, "student_final_verified_qna.json")

# --- AI Configuration ---
# Configure Gemini from the GEMINI_API_KEY environment variable. The key is
# never hard-coded; a missing key is treated as a fatal configuration error.
try:
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise ValueError("GEMINI_API_KEY environment variable not set.")
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-1.5-pro-latest")
    print("[✓] Gemini AI configured successfully.")
except Exception as e:
    print(f"[FATAL ERROR] Failed to configure Gemini. Error: {e}")
    # Raise SystemExit explicitly instead of calling the interactive exit()
    # helper: exit() reports status 0 when run as a script, and inside a
    # Jupyter kernel it requests a shutdown rather than stopping this cell,
    # so the code below would keep running without a configured `model`.
    raise SystemExit(1) from e

# ==============================================================================
# SECTION 2: THE "SINGLE-PASS" MASTER PROMPT
# ==============================================================================

# Instruction text for the single-pass question/answer matching request.
# generate_structured_qna() places it first in the request, followed by the
# uploaded question paper and then the answer-page images.
# It is a plain triple-quoted string (not an f-string), so the braces in the
# example object are sent literally.
# NOTE(review): the prompt forbids any text outside the JSON array, while the
# caller first looks for a ```json fenced block and only then falls back to
# parsing the whole reply.
MASTER_QNA_PROMPT = """
You are an expert examination evaluator AI. Your task is to accurately match questions from a provided question paper with their corresponding handwritten answers from a set of images and generate a single, structured JSON output.

You will be given:
1.  A PDF file containing all the exam questions (`question_paper.pdf`).
2.  A series of images (`answer_page_X.png`) containing the student's handwritten answers.

**Your process must be:**

1.  **Iterate Through Questions:** Systematically go through the question paper, question by question, from start to finish.
2.  **Locate the Answer:** For each question (e.g., "Question 6"), locate the student's corresponding handwritten answer in the provided images. Use question numbers (e.g., "Ans-6", "Question-6") as your primary guide.
3.  **Transcribe Accurately:** Transcribe the handwritten answer text *exactly* as it appears, preserving all original wording, spelling, and grammar. Do not correct anything.
4.  **Handle Unanswered Questions:** If you cannot find a corresponding answer for a question, you MUST explicitly mark it as "Not Answered". Do not skip it.
5.  **Ignore Crossed-Out Text:** If any text is visibly struck through or crossed out, ignore it completely and do not include it in the transcription.

**MANDATORY OUTPUT FORMAT:**

-   Your final output must be a single, well-formed JSON array `[...]`.
-   Each element in the array must be a JSON object `{...}` representing one question.
-   Each object must contain these four keys:
    -   `"question_number"`: The specific number of the question (e.g., "1 (i)", "6", "12").
    -   `"question_text"`: The full text of the question.
    -   `"answer_text"`: The student's transcribed answer. If not answered, this must be the string "Not Answered".
    -   `"status"`: A status string, either "Answered" or "Not Answered".

**Example JSON Object:**
{
    "question_number": "12",
    "question_text": "What is Barter System?",
    "answer_text": "Bartering is the direct exchange of one goods with another goods without the use of money For eg for the services of Carpenter or blacksmith of he is given quintal of wheat then it is bartering.",
    "status": "Answered"
}

Do not add any text, notes, or explanations outside of the final JSON array.
"""

# ==============================================================================
# SECTION 3: CORE LOGIC
# ==============================================================================

def _parse_qna_json(raw_text):
    """Decode the JSON payload of a model reply; raises json.JSONDecodeError if none can be parsed."""
    # Accept both ```json ... ``` and a bare ``` ... ``` fence.
    fenced = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', raw_text, re.IGNORECASE)
    if fenced:
        candidate = fenced.group(1)
    else:
        print("[WARNING] Could not find a JSON block in the AI's response. Trying to parse the whole text.")
        candidate = raw_text.strip()

    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        # Last resort: the reply may wrap the array in prose or carry a
        # language tag inside the fence, so try the outermost [...] span.
        start, end = candidate.find('['), candidate.rfind(']')
        if start != -1 and end > start:
            return json.loads(candidate[start:end + 1])
        raise


def generate_structured_qna(question_pdf_path, answer_image_paths):
    """
    Performs the main Q&A matching and verification in a single AI call.

    The question paper and every answer-page image are uploaded, sent to the
    model together with MASTER_QNA_PROMPT, and the reply is decoded as JSON.

    Args:
        question_pdf_path: Path to the question paper PDF.
        answer_image_paths: Paths of the answer-page images, in page order.

    Returns:
        The decoded JSON value from the model reply (the prompt asks for a
        list of question objects), or None if uploading, generation or JSON
        parsing fails. Failures are reported on stdout, not raised.
    """
    print("\n--- STEP 2: Preparing files for AI processing ---")
    try:
        # The request starts with the master prompt, then the question paper,
        # then the answer pages in order.
        prompt_parts = [MASTER_QNA_PROMPT]

        print(f" -> Uploading question paper: {question_pdf_path}")
        prompt_parts.append(genai.upload_file(path=question_pdf_path, display_name="question_paper.pdf"))

        for i, img_path in enumerate(answer_image_paths):
            print(f" -> Uploading answer page: {os.path.basename(img_path)}")
            prompt_parts.append(genai.upload_file(path=img_path, display_name=f"answer_page_{i+1}.png"))

    except Exception as e:
        print(f"[FATAL ERROR] Failed to upload files for processing. Error: {e}")
        return None

    print("\n--- STEP 3: Executing single-pass AI evaluation ---")
    print(" -> This may take a few moments as the AI processes all documents at once...")
    raw_text = None
    try:
        response = model.generate_content(prompt_parts)
        # Read the text once: the accessor itself can raise, and the error
        # handler below must not trigger it a second time.
        raw_text = response.text
        return _parse_qna_json(raw_text)

    except json.JSONDecodeError:
        print("[ERROR] Failed to parse JSON from the AI response. The response may be malformed.")
        print("--- AI Response Text ---")
        print(raw_text)
        print("------------------------")
        return None
    except Exception as e:
        print(f"[ERROR] An unexpected error occurred during AI generation: {e}")
        return None


# ==============================================================================
# SECTION 4: MAIN EXECUTION WORKFLOW
# ==============================================================================

if __name__ == "__main__":
    print("--- E-Paper Checking Bot Workflow Initialized ---")
    os.makedirs(FINAL_OUTPUT_FOLDER, exist_ok=True)

    print("\n--- STEP 1: Locating Answer Sheet Images ---")
    if not os.path.exists(PREPROCESSED_ANSWERS_FOLDER):
        print(f"[FATAL ERROR] Answer folder not found: '{PREPROCESSED_ANSWERS_FOLDER}'")
        # SystemExit(1) rather than the interactive exit() helper, so the
        # run stops here with a failing status in scripts and notebooks alike.
        raise SystemExit(1)

    def _page_sort_key(path):
        """Order pages by the first number in the file name; unnumbered files sort last."""
        name = os.path.basename(path)
        number = re.search(r'\d+', name)
        if number:
            return (0, int(number.group()), name)
        # A stray image without a digit (e.g. "cover.png") must not crash
        # the sort; keep it, but after all numbered pages.
        return (1, 0, name)

    # Collect the answer images in numeric page order (page_2 before page_10)
    answer_files = sorted(
        (
            os.path.join(PREPROCESSED_ANSWERS_FOLDER, f)
            for f in os.listdir(PREPROCESSED_ANSWERS_FOLDER)
            if f.lower().endswith((".png", ".jpg", ".jpeg"))
        ),
        key=_page_sort_key
    )

    if not answer_files:
        print(f"[FATAL ERROR] No image files found in '{PREPROCESSED_ANSWERS_FOLDER}'.")
        raise SystemExit(1)

    print(f" -> Found {len(answer_files)} answer pages to process.")

    # Generate the final structured data
    final_qna_data = generate_structured_qna(QUESTION_PDF_PATH, answer_files)

    print("\n--- STEP 4: Finalizing and Saving Output ---")
    # Only a non-empty list counts as a usable result; anything else
    # (None, empty list, a bare object) is reported as a failure.
    if final_qna_data and isinstance(final_qna_data, list):
        with open(FINAL_JSON_OUTPUT_FILE, "w", encoding="utf-8") as f:
            # Pretty-printed, with non-ASCII text written as-is
            json.dump(final_qna_data, f, indent=4, ensure_ascii=False)
        print(f"[SUCCESS] Final file created with {len(final_qna_data)} question-answer pairs.")
        print(f" -> Output saved to: {FINAL_JSON_OUTPUT_FILE}")
    else:
        print("[FAILURE] No valid Q&A data was generated. The output file was not created.")

    print("\n✅ Workflow complete.")

[✓] Gemini AI configured successfully.
--- E-Paper Checking Bot Workflow Initialized ---

--- STEP 1: Locating Answer Sheet Images ---
 -> Found 16 answer pages to process.

--- STEP 2: Preparing files for AI processing ---
 -> Uploading question paper: 2 - question.pdf
 -> Uploading answer page: page_1.png
 -> Uploading answer page: page_2.png
 -> Uploading answer page: page_3.png
 -> Uploading answer page: page_4.png
 -> Uploading answer page: page_5.png
 -> Uploading answer page: page_6.png
 -> Uploading answer page: page_7.png
 -> Uploading answer page: page_8.png
 -> Uploading answer page: page_9.png
 -> Uploading answer page: page_10.png
 -> Uploading answer page: page_11.png
 -> Uploading answer page: page_12.png
 -> Uploading answer page: page_13.png
 -> Uploading answer page: page_14.png
 -> Uploading answer page: page_15.png
 -> Uploading answer page: page_16.png

--- STEP 3: Executing single-pass AI evaluation ---
 -> This may take a few moments as the AI processes all docum

In [1]:
import os
import re
import json
import google.generativeai as genai

# ==============================================================================
# SECTION 1: CONFIGURATION
# ==============================================================================

# --- File Paths ---
# Path to the question paper PDF
QUESTION_PDF_PATH = "2 - question.pdf"
# Path to the model answer key PDF
OFFICIAL_ANSWER_PDF_PATH = "2 - answer.pdf"
# Output folder name
FINAL_OUTPUT_FOLDER = "Original_Answer"
# Path to the final JSON output file
FINAL_JSON_OUTPUT_FILE = os.path.join(FINAL_OUTPUT_FOLDER, "original_answer.json")

# --- AI Configuration ---
# Configure Gemini from the GEMINI_API_KEY environment variable. The key is
# never hard-coded; a missing key is treated as a fatal configuration error.
try:
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise ValueError("GEMINI_API_KEY environment variable not set.")
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-1.5-pro-latest")
    print("[✓] Gemini AI configured successfully.")
except Exception as e:
    print(f"[FATAL ERROR] Failed to configure Gemini. Error: {e}")
    # Raise SystemExit explicitly instead of calling the interactive exit()
    # helper: exit() reports status 0 when run as a script, and inside a
    # Jupyter kernel it requests a shutdown rather than stopping this cell,
    # so the code below would keep running without a configured `model`.
    raise SystemExit(1) from e

# ==============================================================================
# SECTION 2: THE MASTER PROMPT (***RE-ENGINEERED FOR COMPLETENESS***)
# ==============================================================================

# Instruction text for extracting the question paper and official answer key
# into one JSON array. generate_official_qna_json() sends it first, followed
# by the two uploaded PDFs.
# NOTE(review): the question count ("ALL 23 questions", "Question 1 to
# Question 23") is written into the prompt text itself, so this prompt only
# fits a paper with exactly that many questions; update it for other papers.
# NOTE(review): this is a non-raw string, so the "\n" sequences inside the
# example objects reach the model as real line breaks, not as the two
# characters backslash + n — confirm that is the intended example format.
MASTER_PROMPT = """
You are a meticulous and highly precise data extraction AI. Your primary directive is to create a **COMPLETE** and **VERBATIM** JSON representation of the provided question paper and its official answers. You *must* process every question from start to finish without any omissions.

You will be given:
1.  `question_paper.pdf`: Contains exam questions.
2.  `official_answer_key.pdf`: Contains the official answers.

**CRITICAL RULES FOR EXECUTION:**

1.  ***ABSOLUTE COMPLETENESS & VERIFICATION***:
    -   You **MUST** process **ALL 23 questions** from the `question_paper.pdf`, from Question 1 to Question 23.
    -   Do not stop early under any circumstances.
    -   **Before finishing, you must perform a final self-check to ensure all 23 questions and their sub-parts are present in your final JSON output.**

2.  ***VERBATIM (EXACT) EXTRACTION***:
    -   All extracted text must be a *character-for-character copy*.
    -   Do not translate, summarize, rephrase, or alter any text.
    -   The original languages in the source documents (e.g., English and Hindi) must be preserved perfectly.

3.  ***QUESTION TEXT SPECIFICATIONS***:
    -   For each question number, extract the full **English version** of the question text.
    -   If a question has multiple-choice options (e.g., A, B, C, D), you **MUST** include those options as part of the `question_text`.

4.  ***"OR" (अथवा) QUESTION HANDLING***:
    -   Many questions have an 'OR' (अथवा) option. For these, your `question_text` **MUST** include the text for **BOTH** the main question and the 'OR' question.
    -   The `official_answer_text` should be the single answer provided in the answer key, which may correspond to either the main question or the 'OR' part. Extract whichever answer is present.

**MANDATORY OUTPUT FORMAT:**

-   Your final output must be a single, well-formed JSON array `[...]`.
-   Each object must contain these three keys:
    -   `"question_number"`: The specific number (e.g., "1 (i)", "6", "23").
    -   `"question_text"`: The full English text of the question (and its 'OR' part, if present), copied verbatim.
    -   `"official_answer_text"`: The full text of the corresponding answer from the answer key, copied verbatim in its original language.

**Example for a Multiple-Choice Question:**
{
    "question_number": "1 (i)",
    "question_text": "Study of a firm, an industries, price of a good is done under -\n(A) Macro economics\n(B) Micro economics\n(C) Both (A and B)\n(D) National income",
    "official_answer_text": "(ब) व्यष्टि अर्थशास्त्र"
}

**Example for a Question with an "OR" part:**
{
    "question_number": "6",
    "question_text": "What are the central problems of an economy?\nOR\nDefine Economic Problem.",
    "official_answer_text": "(i) किन वस्तुओं का कितनी मात्रा में उत्पादन किया जाये। (ii) वस्तुओं का उत्पादन किसके लिऐ किया जाये। (iii) वस्तुओं के उत्पादन का ढंग क्या हो?"
}

Do not add any text, notes, or explanations outside of the final JSON array.
"""

# ==============================================================================
# SECTION 3: CORE LOGIC
# ==============================================================================

def _parse_official_json(raw_text):
    """Decode the JSON payload of a model reply; raises json.JSONDecodeError if none can be parsed."""
    # Accept both ```json ... ``` and a bare ``` ... ``` fence.
    fenced = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', raw_text, re.IGNORECASE)
    if fenced:
        candidate = fenced.group(1)
    else:
        print("[WARNING] No JSON block found in AI response. Attempting to parse the full text.")
        candidate = raw_text.strip()

    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        # Last resort: the reply may wrap the array in prose or carry a
        # language tag inside the fence, so try the outermost [...] span.
        start, end = candidate.find('['), candidate.rfind(']')
        if start != -1 and end > start:
            return json.loads(candidate[start:end + 1])
        raise


def generate_official_qna_json(question_pdf, answer_pdf):
    """
    Uses AI to extract and combine questions and official answers.

    Both PDFs are uploaded and sent to the model together with
    MASTER_PROMPT; the reply is decoded as JSON.

    Args:
        question_pdf: Path to the question paper PDF.
        answer_pdf: Path to the official answer key PDF.

    Returns:
        The decoded JSON value from the model reply (the prompt asks for a
        list of question objects), or None if uploading, generation or JSON
        parsing fails. Failures are reported on stdout, not raised.
    """
    print("\n--- STEP 2: Preparing files for AI processing ---")
    try:
        # Prompt first, then the two documents it refers to by display name.
        prompt_parts = [
            MASTER_PROMPT,
            genai.upload_file(path=question_pdf, display_name="question_paper.pdf"),
            genai.upload_file(path=answer_pdf, display_name="official_answer_key.pdf")
        ]
    except Exception as e:
        print(f"[FATAL ERROR] Failed to upload files. Error: {e}")
        return None

    print("\n--- STEP 3: Starting AI extraction ---")
    print(" -> This may take a moment...")
    raw_text = None
    try:
        response = model.generate_content(prompt_parts)
        # Read the text once: the accessor itself can raise, and the error
        # handler below must not trigger it a second time.
        raw_text = response.text
        return _parse_official_json(raw_text)

    except json.JSONDecodeError:
        print("[ERROR] Failed to parse JSON from AI response. The response may be malformed.")
        print("--- AI Response Text ---")
        print(raw_text)
        print("------------------------")
        return None
    except Exception as e:
        print(f"[ERROR] An unexpected error occurred during AI generation: {e}")
        return None


# ==============================================================================
# SECTION 4: MAIN EXECUTION WORKFLOW
# ==============================================================================

if __name__ == "__main__":
    print("--- Official Q&A JSON Generator Workflow Started ---")
    os.makedirs(FINAL_OUTPUT_FOLDER, exist_ok=True)

    print("\n--- STEP 1: Verifying PDF file paths ---")
    if not os.path.exists(QUESTION_PDF_PATH) or not os.path.exists(OFFICIAL_ANSWER_PDF_PATH):
        print(f"[FATAL ERROR] Please ensure that '{QUESTION_PDF_PATH}' and '{OFFICIAL_ANSWER_PDF_PATH}' exist.")
        # SystemExit(1) rather than the interactive exit() helper, so the
        # run stops here with a failing status in scripts and notebooks alike.
        raise SystemExit(1)

    print(" -> All required PDF files found.")

    # Generate the final structured data
    final_qna_data = generate_official_qna_json(QUESTION_PDF_PATH, OFFICIAL_ANSWER_PDF_PATH)

    print("\n--- STEP 4: Finalizing and saving the output ---")
    # Only a non-empty list counts as a usable result; anything else
    # (None, empty list, a bare object) is reported as a failure.
    if final_qna_data and isinstance(final_qna_data, list):
        with open(FINAL_JSON_OUTPUT_FILE, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps the Hindi answer text readable in the file
            json.dump(final_qna_data, f, indent=4, ensure_ascii=False)
        print(f"[SUCCESS] Final file created with {len(final_qna_data)} question-answer pairs.")
        print(f" -> Output saved to: {FINAL_JSON_OUTPUT_FILE}")
    else:
        print("[FAILURE] No valid Q&A data was generated. The output file was not created.")

    print("\n✅ Workflow complete.")

  from .autonotebook import tqdm as notebook_tqdm


[✓] Gemini AI configured successfully.
--- Official Q&A JSON Generator Workflow Started ---

--- STEP 1: Verifying PDF file paths ---
 -> All required PDF files found.

--- STEP 2: Preparing files for AI processing ---

--- STEP 3: Starting AI extraction ---
 -> This may take a moment...

--- STEP 4: Finalizing and saving the output ---
[SUCCESS] Final file created with 50 question-answer pairs.
 -> Output saved to: Original_Answer\original_answer.json

✅ Workflow complete.


In [3]:
import os
import json
import re
import google.generativeai as genai

# ==============================================================================
# SECTION 1: CONFIGURATION
# ==============================================================================

# --- File Paths ---
# Path to the JSON file with official answers
ORIGINAL_ANSWERS_PATH = os.path.join("Original_Answer", "original_answer.json")
# Path to the JSON file with the student's answers
STUDENT_ANSWERS_PATH = os.path.join("Student_QNA", "student_final_verified_qna.json")
# Output folder for the final report
EVALUATION_OUTPUT_FOLDER = "Final_Evaluation"
# Path to the final text report file
EVALUATION_REPORT_FILE = os.path.join(EVALUATION_OUTPUT_FOLDER, "evaluation_report.txt")

# --- AI Configuration ---
# Configure Gemini from the GEMINI_API_KEY environment variable. The key is
# never hard-coded; a missing key is treated as a fatal configuration error.
try:
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise ValueError("GEMINI_API_KEY environment variable not set.")
    genai.configure(api_key=api_key)
    # Model used to score each student answer against the official one
    evaluation_model = genai.GenerativeModel("gemini-1.5-pro-latest")
    print("[✓] Gemini AI configured successfully for evaluation.")
except Exception as e:
    print(f"[FATAL ERROR] Failed to configure Gemini. Error: {e}")
    # Raise SystemExit explicitly instead of calling the interactive exit()
    # helper: exit() reports status 0 when run as a script, and inside a
    # Jupyter kernel it requests a shutdown rather than stopping this cell,
    # so the code below would keep running without `evaluation_model`.
    raise SystemExit(1) from e


# ==============================================================================
# SECTION 2: AI EVALUATION PROMPT
# ==============================================================================

# Per-question scoring prompt. evaluate_answers() fills the two placeholders,
# {official_answer} and {student_answer}, with str.format(), so any other
# literal brace added to this text would have to be doubled ("{{" / "}}").
# The model is asked to reply on one line as `score|justification`, which the
# caller separates on the "|" character.
# NOTE(review): the three examples wrap the expected reply in backticks; a
# model that imitates them literally will return backticks around its answer.
EVALUATION_PROMPT_TEMPLATE = """
You are an expert, impartial examiner. Your task is to evaluate a student's answer against the official model answer and provide a score based on semantic correctness.

**Context:**
- The student's answer and the official answer may be in different languages (e.g., English and Hindi).
- Your evaluation must be based on the *meaning and core concepts*, not just keyword matching.

**Official Answer:**
---
{official_answer}
---

**Student's Answer:**
---
{student_answer}
---

**Your Task:**
1.  Compare the student's answer to the official answer.
2.  Provide a numerical score from 0 to 100, where 100 means the student's answer perfectly conveys the same meaning as the official answer, and 0 means it is completely incorrect or irrelevant.
3.  Provide a brief, one-sentence justification for your score.

**Mandatory Output Format:**
You MUST return your response in a single line with the format: `score|justification`
**Example 1:** `90|The student correctly explained the concept but missed one minor detail mentioned in the official answer.`
**Example 2:** `0|The student's answer is factually incorrect and does not align with the official answer.`
**Example 3:** `100|The student's answer perfectly matches the core concepts of the official answer.`
"""


# ==============================================================================
# SECTION 3: CORE LOGIC
# ==============================================================================

def _question_key(question_number):
    """Normalise a question number for matching: drop all whitespace and ignore case."""
    return re.sub(r'\s+', '', str(question_number)).casefold()


def _load_answers_by_number(path, label):
    """Read a Q&A JSON array and index its objects by normalised question number; None on failure."""
    try:
        with open(path, 'r', encoding='utf-8') as f:
            records = json.load(f)
    except FileNotFoundError:
        print(f"[ERROR] {label} answer file not found at '{path}'")
        return None
    except json.JSONDecodeError as e:
        print(f"[ERROR] {label} answer file at '{path}' is not valid JSON: {e}")
        return None

    if not isinstance(records, list):
        print(f"[ERROR] {label} answer file at '{path}' must contain a JSON array.")
        return None

    indexed = {}
    for record in records:
        if not isinstance(record, dict) or 'question_number' not in record:
            print(f"[WARNING] Skipping {label.lower()} entry without a 'question_number': {record!r}")
            continue
        # A repeated question number keeps its first position but the later
        # record wins, as with the plain dict this replaces.
        indexed[_question_key(record['question_number'])] = record
    return indexed


def load_and_merge_data(original_file, student_file):
    """
    Loads data from both JSON files and merges them for comparison.

    Entries are paired on ``question_number``. The comparison ignores
    whitespace and letter case, so "1 (i)" in one file matches "1(i)" in the
    other. Official questions with no student entry are left out of the
    result and listed in a warning.

    Args:
        original_file: Path to the JSON array of official answers.
        student_file: Path to the JSON array of student answers.

    Returns:
        A list of dicts with the keys ``question_number``, ``question_text``,
        ``official_answer``, ``student_answer`` and ``status``, in the order
        of the official file; or None if either file is missing, is not
        valid JSON, or does not contain an array.
    """
    print(f"-> Loading official answers from: {original_file}")
    original_data = _load_answers_by_number(original_file, "Official")
    if original_data is None:
        return None

    print(f"-> Loading student answers from: {student_file}")
    student_data = _load_answers_by_number(student_file, "Student")
    if student_data is None:
        return None

    merged_data = []
    unmatched = []
    print("-> Merging and preparing data for evaluation...")
    for key, original_q in original_data.items():
        student_q = student_data.get(key)
        if not student_q:
            unmatched.append(str(original_q['question_number']))
            continue
        merged_data.append({
            # Report the number exactly as the official file spells it
            "question_number": original_q['question_number'],
            "question_text": original_q.get("question_text", "N/A"),
            "official_answer": original_q.get("official_answer_text", ""),
            "student_answer": student_q.get("answer_text", ""),
            "status": student_q.get("status", "Not Answered")
        })

    if unmatched:
        # Surface the gap instead of dropping these questions silently
        print(f"[WARNING] {len(unmatched)} official question(s) had no matching student entry "
              f"and were skipped: {', '.join(unmatched)}")
    return merged_data


def _parse_score_line(raw_text):
    """Turn a 'score|justification' reply into (score clamped to 0-100, justification)."""
    # The prompt's examples are wrapped in backticks, so tolerate them here.
    cleaned = raw_text.strip().strip('`').strip()
    # Split on the first "|" only: the justification itself may contain one.
    score_part, separator, justification = cleaned.partition('|')
    if not separator:
        raise ValueError("Response was not in the expected 'score|justification' format.")

    # Take the first number in the score field; concatenating every digit
    # would read "90/100" as 90100 and "90.5" as 905.
    number = re.search(r'\d+(?:\.\d+)?', score_part)
    if not number:
        raise ValueError("Response did not contain a numeric score.")
    score = int(round(float(number.group())))
    return max(0, min(100, score)), justification.strip()


def evaluate_answers(data_to_evaluate):
    """
    Iterates through data and uses AI to score each answered question.

    Each item is updated in place with two keys: ``score`` (0-100, or -1 if
    the AI call or the parsing of its reply failed) and ``justification``.
    Unanswered questions get a score of 0 without calling the model.

    Args:
        data_to_evaluate: List of dicts that provide ``question_number``,
            ``status``, ``student_answer`` and ``official_answer``.

    Returns:
        A new list holding the same (now updated) dicts, in input order.
    """
    print("\n--- Starting Automated Evaluation ---")
    evaluated_results = []
    total_questions = len(data_to_evaluate)

    for i, item in enumerate(data_to_evaluate):
        print(f"-> Evaluating question {i+1}/{total_questions}: '{item['question_number']}'...", end='')
        if item["status"] != "Answered" or not item["student_answer"] or item["student_answer"] == "Not Answered":
            item["score"] = 0
            item["justification"] = "Question was not answered by the student."
            print(" SKIPPED (Not Answered)")
        else:
            prompt = EVALUATION_PROMPT_TEMPLATE.format(
                official_answer=item["official_answer"],
                student_answer=item["student_answer"]
            )
            try:
                response = evaluation_model.generate_content(prompt)
                item["score"], item["justification"] = _parse_score_line(response.text)
                print(f" SCORE: {item['score']}%")
            except Exception as e:
                # One failed question must not abort the whole run; -1 marks
                # the item as "evaluation failed" for the report stage.
                item["score"] = -1
                item["justification"] = f"AI evaluation failed: {e}"
                print(f" FAILED ({e})")

        evaluated_results.append(item)
    return evaluated_results


def generate_evaluation_report(results):
    """
    Generates a detailed text file report from the evaluation results.

    The report is written to EVALUATION_REPORT_FILE (its folder,
    EVALUATION_OUTPUT_FOLDER, is created if needed). It has one section per
    question followed by a summary.

    Args:
        results: List of dicts as returned by evaluate_answers(); each must
            provide ``question_number``, ``status``, ``score``,
            ``justification``, ``student_answer`` and ``official_answer``.
            A score of -1 marks a question whose AI evaluation failed.

    Returns:
        None. The outcome is the file on disk plus a status line on stdout.
    """
    print("\n--- Generating Evaluation Report ---")
    os.makedirs(EVALUATION_OUTPUT_FOLDER, exist_ok=True)

    total_score = 0
    answered_count = 0   # every question the student answered
    scored_count = 0     # answered questions that received a real score
    failed_count = 0     # answered questions whose evaluation failed

    with open(EVALUATION_REPORT_FILE, 'w', encoding='utf-8') as f:
        f.write("======================================================================\n")
        f.write("                 AUTOMATED ANSWER EVALUATION REPORT\n")
        f.write("======================================================================\n\n")

        for item in results:
            evaluation_failed = item['score'] == -1
            # Do not print the internal -1 sentinel as if it were a percentage
            score_text = "N/A (evaluation failed)" if evaluation_failed else f"{item['score']}%"

            f.write(f"--- Question {item['question_number']} ---\n")
            f.write(f"Status: {item['status']}\n")
            f.write(f"Score: {score_text}\n")
            f.write(f"Justification: {item['justification']}\n\n")
            f.write("👤 Student's Answer:\n")
            f.write(f"{item['student_answer']}\n\n")
            f.write("📚 Official Answer:\n")
            f.write(f"{item['official_answer']}\n")
            f.write("----------------------------------------------------------------------\n\n")

            if item['status'] == 'Answered':
                answered_count += 1
                if evaluation_failed:
                    failed_count += 1
                else:
                    total_score += item['score']
                    scored_count += 1

        # Final Summary. The average covers only answers that were actually
        # scored, so a failed evaluation neither helps nor hurts the student.
        average_score = (total_score / scored_count) if scored_count > 0 else 0
        f.write("======================================================================\n")
        f.write("                            FINAL SUMMARY\n")
        f.write("======================================================================\n")
        f.write(f"Total Questions: {len(results)}\n")
        f.write(f"Questions Answered by Student: {answered_count}\n")
        if failed_count:
            f.write(f"Answered Questions Not Scored (evaluation failed): {failed_count}\n")
        f.write(f"Average Score on Answered Questions: {average_score:.2f}%\n")
        f.write("======================================================================\n")

    print(f"[SUCCESS] Evaluation report saved to: {EVALUATION_REPORT_FILE}")


# ==============================================================================
# SECTION 4: MAIN EXECUTION WORKFLOW
# ==============================================================================

if __name__ == "__main__":
    print("--- Paper Checker Bot Initialized ---")

    # Stage 1: pair every official answer with the student's answer
    merged_data = load_and_merge_data(ORIGINAL_ANSWERS_PATH, STUDENT_ANSWERS_PATH)

    if not merged_data:
        # None (load error) and an empty list (nothing paired) both end here
        print("\n[FAILURE] Could not proceed due to errors in loading data. Exiting.")
    else:
        # Stage 2: have the AI model score each pair
        final_results = evaluate_answers(merged_data)

        # Stage 3: write the detailed text report
        generate_evaluation_report(final_results)

        # NOTE: this script has no interactive query mode; that task is
        # handled separately.

        print("\n✅ Evaluation and Report Generation Complete.")

[✓] Gemini AI configured successfully for evaluation.
--- Paper Checker Bot Initialized ---
-> Loading official answers from: Original_Answer\original_answer.json
-> Loading student answers from: Student_QNA\student_final_verified_qna.json
-> Merging and preparing data for evaluation...

--- Starting Automated Evaluation ---
-> Evaluating question 1/50: '1 (i)'... SCORE: 0%
-> Evaluating question 2/50: '1 (ii)'... SCORE: 0%
-> Evaluating question 3/50: '1 (iii)'... SCORE: 0%
-> Evaluating question 4/50: '1 (iv)'... SCORE: 100%
-> Evaluating question 5/50: '1 (v)'... SCORE: 100%
-> Evaluating question 6/50: '1 (vi)'... SCORE: 0%
-> Evaluating question 7/50: '2 (i)'... SCORE: 100%
-> Evaluating question 8/50: '2 (ii)'... SCORE: 100%
-> Evaluating question 9/50: '2 (iii)'... SCORE: 100%
-> Evaluating question 10/50: '2 (iv)'... SCORE: 0%
-> Evaluating question 11/50: '2 (v)'... SCORE: 0%
-> Evaluating question 12/50: '2 (vi)'... SCORE: 0%
-> Evaluating question 13/50: '2 (vii)'... SCORE: 