### Install dependencies and import libraries

In [1]:
import os
from typing import List, Optional
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

In [2]:
# Load environment variables
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [3]:
pdf_path = "C:\\Users\\HP\\Documents\\repos\\essay-checker-agentic-rag\\data\\raw\\essay_rubric.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()
docs

[Document(metadata={'producer': 'Microsoft® Word 2024', 'creator': 'Microsoft® Word 2024', 'creationdate': '2025-12-16T14:21:40+08:00', 'author': 'hp404sk7@outlook.com', 'moddate': '2025-12-16T14:21:40+08:00', 'source': 'C:\\Users\\HP\\Documents\\repos\\essay-checker-agentic-rag\\data\\raw\\essay_rubric.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1'}, page_content='ESAC Integrated Writing Assessment (IWA) \nRubric: Generative AI in Higher Education \nEssay Topic \nThe integration of Generative Artificial Intelligence (GenAI) in higher education: \nchallenges to academic integrity, benefits for learning, and the extent of \ninstitutional adoption. \n \nAssessment Criteria Overview \nCriterion Weight \n1. Task Response & Argument 20% \n2. Critical Thinking & Evaluation 20% \n3. Use of Sources & Referencing 20% \n4. Academic Writing Style & Register 15% \n5. Organisation & Cohesion 15% \n6. Language Accuracy & Control 10% \nTotal 100% \n \n1. Task Response & Argument (20%) \nBand De

In [4]:
#Define the Rubric Schema ---
# This structure is critical for the Final Judge. 
# It needs to know exactly what an "A" looks like vs. a "B" for EACH criterion.

from typing import List, Optional
from pydantic import BaseModel, Field

# 1. Performance Level (The 'Cell' in the table)
class PerformanceLevel(BaseModel):
    grade_label: str = Field(..., description="The label (e.g., 'High Distinction', 'A', 'Band 5').")
    score_range: str = Field(..., description="The point range (e.g., '80-100', '16-20').")
    # ESAC rubrics often have multiple bullet points per cell. Capturing them as a list is better.
    descriptor_points: List[str] = Field(
        ..., 
        description="A list of specific qualifiers/bullets found in this cell (e.g. ['Uses complex grammar', 'Minimal errors'])."
    )

# 2. Criterion (The 'Row' in the table)
class AssessmentCriterion(BaseModel):
    category: Optional[str] = Field(
        None, 
        description="The broader category this criterion belongs to (e.g., 'Language', 'Content', 'Structure')."
    )
    name: str = Field(..., description="The specific skill being assessed (e.g., 'Referencing Conventions', 'Critical Analysis').")
    weight: str = Field(..., description="The weight of this criterion (e.g., '30%', '10 marks').")
    levels: List[PerformanceLevel] = Field(..., description="The grading scale for this specific criterion.")

# 3. The Full Rubric
class RubricExtractionResult(BaseModel):
    title: str = Field(..., description="Title of the rubric.")
    context_notes: List[str] = Field(
        default_factory=list, 
        description="Any global rules found (e.g., 'Plagiarism results in 0', 'Word count penalty')."
    )
    criteria: List[AssessmentCriterion]

In [5]:
#Setup the Model (DeepSeek via OpenAI API) ---
llm = ChatOpenAI(
    model="deepseek-ai/DeepSeek-V3",
    openai_api_key=OPENAI_API_KEY,
    openai_api_base="https://api.siliconflow.cn/v1",
    temperature=0  # Keep it 0 for consistent analysis
)

# Bind the schema to the model
structured_llm = llm.with_structured_output(RubricExtractionResult)

In [6]:
system_prompt = """
You are an AI specialized in "Rubric Digitization". 
Your task is to extract grading criteria from raw PDF text (which often has broken table formatting) and structure it into a precise JSON format.

**Extraction Rules:**
1. **Reconstruct the Grid:** Input text often comes from tables where rows and columns are jumbled. You must logically group text based on context.
2. **Identify Criteria (Rows):** Look for distinct skills being assessed (e.g., "Argumentation", "Structure").
3. **Identify Levels (Columns):** Look for grade headers (e.g., "HD", "D", "C", "P") and map the descriptions under them correctly.
4. **Capture Descriptors:** The most important output is the **description** text for each level. This is what the grading AI will use to judge the student.

If a criterion has a weight (e.g., "20%"), extract it.
"""

In [7]:
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{text}")
])

extraction_chain = prompt | structured_llm

In [8]:
# Execution Logic ---

def extract_rubric(pdf_path: str):
    print(f"Loading Rubric PDF from: {pdf_path}")
    loader = PyPDFLoader(pdf_path)
    docs = loader.load()
    
    if not docs:
        print("Error: No documents found.")
        return None

    # Rubrics often span 1-2 pages. Merging them helps the LLM see the whole table structure.
    full_text = "\n\n".join([d.page_content for d in docs])
    
    print(f"Digitizing Rubric ({len(full_text)} characters)...")
    
    try:
        result = extraction_chain.invoke({"text": full_text})
        return result.dict()

    except Exception as e:
        print(f"Error during rubric extraction: {e}")
        return None

In [9]:
rubric_json = extract_rubric(pdf_path)
print(rubric_json)

Loading Rubric PDF from: C:\Users\HP\Documents\repos\essay-checker-agentic-rag\data\raw\essay_rubric.pdf
Digitizing Rubric (5293 characters)...
{'title': 'ESAC Integrated Writing Assessment (IWA) Rubric', 'context_notes': ['Rubric for ESAC Integrated Writing Assessment (IWA) on Generative AI in Higher Education', 'Essay covers challenges to academic integrity, benefits for learning, and institutional adoption'], 'criteria': [{'category': 'Task Response & Argument', 'name': 'Task Response & Argument', 'weight': '20%', 'levels': [{'grade_label': 'Band 9 (Excellent)', 'score_range': '20%', 'descriptor_points': ['Fully addresses all parts of the task with a clear, insightful, and original argument', 'Demonstrates a nuanced understanding of GenAI’s challenges and benefits', 'Position on university adoption is sophisticated and consistently sustained']}, {'grade_label': 'Band 8 (Strong)', 'score_range': '20%', 'descriptor_points': ['Addresses all aspects of the task effectively with a clear 

C:\Users\HP\AppData\Local\Temp\ipykernel_20828\2679192163.py:19: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  return result.dict()


In [10]:
#export to JSON file
import json
from pathlib import Path

RUBRICS_JSON_PATH = "C:\\Users\\HP\\Documents\\repos\\essay-checker-agentic-rag\\data\\processed\\extracted_rubrics.json"
output_path = Path(RUBRICS_JSON_PATH)
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(RUBRICS_JSON_PATH, 'w', encoding='utf-8') as f:
    f.write(json.dumps(rubric_json, indent=2))

print(f"Saved {len(rubric_json)} records to {output_path}")

Saved 3 records to C:\Users\HP\Documents\repos\essay-checker-agentic-rag\data\processed\extracted_rubrics.json


## Final

In [None]:
import os
from typing import List, Optional
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

# Load environment variables
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# --- Step 1: Define the Rubric Schema ---
# This structure is critical for the Final Judge. 
# It needs to know exactly what an "A" looks like vs. a "B" for EACH criterion.

class PerformanceLevel(BaseModel):
    grade_label: str = Field(..., description="The label for this level (e.g., 'High Distinction', 'A', 'Level 4').")
    score_range: str = Field(..., description="The point range or percentage for this level (e.g., '80-100', '16-20 points').")
    description: str = Field(..., description="The detailed text describing the requirements to achieve this specific level.")

class AssessmentCriterion(BaseModel):
    name: str = Field(..., description="The name of the criterion (e.g., 'Critical Thinking', 'Grammar & Syntax').")
    weight: str = Field(..., description="The weight of this criterion (e.g., '30%', '20 points').")
    levels: List[PerformanceLevel] = Field(..., description="The breakdown of how this specific criterion is graded across different performance levels.")

class RubricExtractionResult(BaseModel):
    title: str = Field(..., description="The title of the assignment or rubric.")
    total_points: str = Field(..., description="The total points possible for the assignment.")
    criteria: List[AssessmentCriterion] = Field(..., description="The list of all assessment criteria found in the rubric.")

# --- Step 2: Setup the Model (DeepSeek via OpenAI API) ---
llm = ChatOpenAI(
    model="deepseek-ai/DeepSeek-V3",
    openai_api_key=OPENAI_API_KEY,
    openai_api_base="https://api.siliconflow.cn/v1",
    temperature=0
)

# Bind the schema
structured_llm = llm.with_structured_output(RubricExtractionResult)

# --- Step 3: Refined Prompt for Table Reconstruction ---
system_prompt = """
You are an AI specialized in "Rubric Digitization". 
Your task is to extract grading criteria from raw PDF text (which often has broken table formatting) and structure it into a precise JSON format.

**Extraction Rules:**
1. **Reconstruct the Grid:** Input text often comes from tables where rows and columns are jumbled. You must logically group text based on context.
2. **Identify Criteria (Rows):** Look for distinct skills being assessed (e.g., "Argumentation", "Structure").
3. **Identify Levels (Columns):** Look for grade headers (e.g., "HD", "D", "C", "P") and map the descriptions under them correctly.
4. **Capture Descriptors:** The most important output is the **description** text for each level. This is what the grading AI will use to judge the student.

If a criterion has a weight (e.g., "20%"), extract it.
"""

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{text}")
])

chain = prompt | structured_llm

# --- Step 4: Execution Logic ---

def extract_rubric(pdf_path: str):
    print(f"Loading Rubric PDF from: {pdf_path}")
    loader = PyPDFLoader(pdf_path)
    docs = loader.load()
    
    if not docs:
        print("Error: No documents found.")
        return None

    # Rubrics often span 1-2 pages. Merging them helps the LLM see the whole table structure.
    full_text = "\n\n".join([d.page_content for d in docs])
    
    print(f"Digitizing Rubric ({len(full_text)} characters)...")
    
    try:
        result = chain.invoke({"text": full_text})
        return result.dict()

    except Exception as e:
        print(f"Error during rubric extraction: {e}")
        return None

# --- Usage Example ---
# pdf_path = r"C:\Users\HP\Documents\rubrics\assignment_1_rubric.pdf"
# rubric_json = extract_rubric(pdf_path)

# if rubric_json:
#     import json
#     print(json.dumps(rubric_json, indent=2))