In [8]:
import os
import json
import fitz  # PyMuPDF for PDF text extraction
import csv
import xml.etree.ElementTree as ET

# Folder paths
PDF_FOLDER = "ncert_pdfs"
TEXT_FOLDER = "ncert_texts"
STUDENT_SUBMISSIONS_FOLDER = "student_submissions"
MODEL_ANSWER_FOLDER = "ncert_model_answers"
GRADING_REPORTS_FOLDER = "grading_reports"
TRAINING_DATA_FOLDER = "training_data"  # Folder to store the structured samples

# Create the output folder if it doesn't exist
os.makedirs(TRAINING_DATA_FOLDER, exist_ok=True)
os.makedirs(TEXT_FOLDER, exist_ok=True)

# === Functions ===

def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file using PyMuPDF.
    """
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text


def clean_text(text):
    """
    Cleans and normalizes the text.
    - Lowercases the text.
    - Removes extra spaces.
    - Strips leading/trailing whitespace.
    """
    text = text.lower()
    text = " ".join(text.split())  # Remove extra spaces
    return text


def parse_json(json_path):
    """
    Parses a JSON file and extracts the relevant text fields.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    # Extract content based on your format
    return json.dumps(data, indent=4)


def parse_csv(csv_path):
    """
    Parses a CSV file and extracts content into a single string.
    """
    content = ""
    with open(csv_path, newline="", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            content += " ".join(row) + "\n"
    return content


def parse_xml(xml_path):
    """
    Parses an XML file and extracts text content.
    """
    tree = ET.parse(xml_path)
    root = tree.getroot()
    content = ""
    for elem in root.iter():
        content += elem.text.strip() if elem.text else ""
    return content


def process_pdfs_to_text():
    """
    Converts all PDFs in the PDF folder to text files.
    """
    for class_name in os.listdir(PDF_FOLDER):
        class_path = os.path.join(PDF_FOLDER, class_name)

        if os.path.isdir(class_path):
            for subject in os.listdir(class_path):
                subject_path = os.path.join(class_path, subject)

                if os.path.isdir(subject_path):
                    text_subject_path = os.path.join(TEXT_FOLDER, class_name, subject)
                    os.makedirs(text_subject_path, exist_ok=True)

                    for pdf_file in os.listdir(subject_path):
                        if pdf_file.endswith(".pdf"):
                            pdf_path = os.path.join(subject_path, pdf_file)
                            text_path = os.path.join(text_subject_path, pdf_file.replace(".pdf", ".txt"))

                            print(f"Extracting text from {pdf_file}...")
                            text = extract_text_from_pdf(pdf_path)
                            cleaned_text = clean_text(text)

                            # Save cleaned text
                            with open(text_path, "w", encoding="utf-8") as f:
                                f.write(cleaned_text)

                            print(f"✅ Saved: {text_path}")


def match_answers_with_model(student_answer, model_answer, grading_feedback):
    """
    Creates a structured sample with matched student and model answers + grading.
    """
    sample = {
        "input": {
            "student_answer": student_answer,
            "model_answer": model_answer
        },
        "output": {
            "score": grading_feedback.get("score", 0),
            "grade": grading_feedback.get("grade", "F"),
            "feedback": grading_feedback.get("feedback", "No feedback available.")
        }
    }
    return sample


def extract_grading_feedback(feedback_text):
    """
    Extracts score, grade, and feedback from the grading report.
    """
    score = 0
    grade = "F"
    feedback = "No feedback provided."

    # Extract score, grade, and feedback
    lines = feedback_text.splitlines()
    for line in lines:
        if "Score:" in line:
            score = int(line.split(":")[1].strip())
        elif "Grade:" in line:
            grade = line.split(":")[1].strip()
        else:
            feedback += f"\n{line}"

    return {
        "score": score,
        "grade": grade,
        "feedback": feedback
    }


def process_training_data():
    """
    Iterates through student submissions and corresponding model answers
    to create structured samples with grading feedback.
    """
    for class_name in os.listdir(STUDENT_SUBMISSIONS_FOLDER):
        student_class_path = os.path.join(STUDENT_SUBMISSIONS_FOLDER, class_name)

        if os.path.isdir(student_class_path):
            for subject in os.listdir(student_class_path):
                student_subject_path = os.path.join(student_class_path, subject)

                model_subject_path = os.path.join(MODEL_ANSWER_FOLDER, class_name, subject)
                grading_subject_path = os.path.join(GRADING_REPORTS_FOLDER, class_name, subject)

                if os.path.isdir(student_subject_path):
                    for submission_file in os.listdir(student_subject_path):
                        if submission_file.endswith("_submissions.txt"):
                            
                            # Load student answer
                            student_path = os.path.join(student_subject_path, submission_file)
                            with open(student_path, "r", encoding="utf-8") as f:
                                student_answer = f.read()

                            # Identify corresponding model answer
                            chapter = submission_file.replace("_submissions.txt", "")
                            model_file = f"{chapter}.txt"
                            model_path = os.path.join(model_subject_path, model_file)

                            if os.path.exists(model_path):
                                with open(model_path, "r", encoding="utf-8") as f:
                                    model_answer = f.read()
                            else:
                                model_answer = "Model answer not found."

                            # Load grading report
                            grading_file = f"{chapter}_grading.txt"
                            grading_path = os.path.join(grading_subject_path, grading_file)

                            if os.path.exists(grading_path):
                                with open(grading_path, "r", encoding="utf-8") as f:
                                    grading_feedback_text = f.read()
                                grading_feedback = extract_grading_feedback(grading_feedback_text)
                            else:
                                grading_feedback = {
                                    "score": 0,
                                    "grade": "F",
                                    "feedback": "No feedback available."
                                }

                            # Create structured training sample
                            training_sample = match_answers_with_model(student_answer, model_answer, grading_feedback)

                            # Save sample to JSON file
                            output_file = os.path.join(TRAINING_DATA_FOLDER, f"{class_name}_{subject}_{chapter}.json")
                            with open(output_file, "w", encoding="utf-8") as f:
                                json.dump(training_sample, f, indent=4)

                            print(f"✅ Saved training sample: {output_file}")


# === Execute the pipeline ===
print("🔧 Extracting PDFs to text...")
process_pdfs_to_text()

print("\n🔧 Processing training data...")
process_training_data()

print("✅ Data preprocessing and alignment completed!")


🔧 Extracting PDFs to text...
Extracting text from Chapter1.pdf.pdf...
✅ Saved: ncert_texts\class_11\biology\Chapter1.txt.txt
Extracting text from Chapter10.pdf.pdf...
✅ Saved: ncert_texts\class_11\biology\Chapter10.txt.txt
Extracting text from Chapter11.pdf.pdf...
✅ Saved: ncert_texts\class_11\biology\Chapter11.txt.txt
Extracting text from Chapter12.pdf.pdf...
✅ Saved: ncert_texts\class_11\biology\Chapter12.txt.txt
Extracting text from Chapter13.pdf.pdf...
✅ Saved: ncert_texts\class_11\biology\Chapter13.txt.txt
Extracting text from Chapter14.pdf.pdf...
✅ Saved: ncert_texts\class_11\biology\Chapter14.txt.txt
Extracting text from Chapter15.pdf.pdf...
✅ Saved: ncert_texts\class_11\biology\Chapter15.txt.txt
Extracting text from Chapter16.pdf.pdf...
✅ Saved: ncert_texts\class_11\biology\Chapter16.txt.txt
Extracting text from Chapter17.pdf.pdf...
✅ Saved: ncert_texts\class_11\biology\Chapter17.txt.txt
Extracting text from Chapter18.pdf.pdf...
✅ Saved: ncert_texts\class_11\biology\Chapter18.t

ValueError: invalid literal for int() with base 10: '92/100**'