## Language Checker Pipeline
- checks the quality of the essay from a linguistics perspective
- including grammar vocabulary, structure etc.

In [1]:
import os
from typing import List, Optional
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

In [2]:
# Load environment variables
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [3]:
#loading the documents
pdf_path = "C:\\Users\\HP\\Documents\\repos\\essay-checker-agentic-rag\\data\\raw\\essay_content.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()
if not docs:
        print("Error: No documents found.")
docs

[Document(metadata={'producer': 'Microsoft® Word 2024', 'creator': 'Microsoft® Word 2024', 'creationdate': '2025-12-16T14:12:31+08:00', 'author': 'hp404sk7@outlook.com', 'moddate': '2025-12-16T14:12:31+08:00', 'source': 'C:\\Users\\HP\\Documents\\repos\\essay-checker-agentic-rag\\data\\raw\\essay_content.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1'}, page_content='2. Student Essay Submission \nTitle \nBeyond Prohibition: Integrating Generative AI into Higher Education Assessment \nand Learning \n \nIntroduction \nThe rapid emergence of Generative Artificial Intelligence (GenAI) tools, such as ChatGPT \nand Claude, has fundamentally disrupted the landscape of higher education. While \ndigital tools have long supported academic study, GenAI’s ability to synthesise complex \ninformation and generate human-like text presents unprecedented challenges to \nestablished educational norms. \nA key concern among educators is the threat these tools pose to academic integrity, \nparticular

In [4]:
#Define the language check schema (output format)
# We need specific linguistic data points for the final grader.

class GrammarError(BaseModel):
    original_text: str = Field(..., description="The exact text snippet containing the error.")
    correction: str = Field(..., description="The corrected version of the text.")
    error_type: str = Field(..., description="Type of error (e.g., 'Subject-Verb Agreement', 'Punctuation', 'Spelling').")
    explanation: str = Field(..., description="Brief explanation of why this is an error.")

class VocabularyAnalysis(BaseModel):
    score: int = Field(..., description="A score from 1-10 rating the vocabulary sophistication.")
    repetitive_words: List[str] = Field(..., description="List of words used excessively (excluding common stop words).")
    advanced_words_used: List[str] = Field(..., description="List of sophisticated or domain-specific words used correctly.")
    feedback: str = Field(..., description="Qualitative feedback on word choice and variety.")

class StructureAnalysis(BaseModel):
    sentence_variety_score: int = Field(..., description="A score from 1-10 on sentence length and structure variety.")
    flow_issues: List[str] = Field(..., description="List of specific issues with flow, transitions, or paragraph cohesion.")
    feedback: str = Field(..., description="Qualitative feedback on the overall structure and flow.")

class LanguageAnalysisResult(BaseModel):
    grammar_issues: List[GrammarError] = Field(default_factory=list, description="List of specific grammar and mechanics errors found.")
    vocabulary: VocabularyAnalysis = Field(..., description="Analysis of the student's vocabulary usage.")
    structure: StructureAnalysis = Field(..., description="Analysis of sentence structure and essay flow.")
    overall_tone: str = Field(..., description="Description of the essay's tone (e.g., 'Formal', 'Casual', 'Inconsistent').")
    summary_critique: str = Field(..., description="A concise summary of the linguistic quality for the final grader.")

In [5]:
#Setup the Model (DeepSeek via OpenAI API) ---
llm = ChatOpenAI(
    model="deepseek-ai/DeepSeek-V3",
    openai_api_key=OPENAI_API_KEY,
    openai_api_base="https://api.siliconflow.cn/v1",
    temperature=0  # Keep it 0 for consistent analysis
)

# Bind the robust schema
structured_llm = llm.with_structured_output(LanguageAnalysisResult)

In [6]:
# Define the Prompt ---
system_prompt = """
You are a strict Linguistic Professor and Editor. 
Your goal is to analyze the student's essay purely on **language mechanics, style, and structure**. 
Do NOT grade the content or arguments; focus only on HOW it is written.

Analyze the text for:
1. Grammar, punctuation, and spelling errors (be specific).
2. Vocabulary sophistication and redundancy.
3. Sentence structure variety (simple vs. complex) and flow.
4. Tone consistency.

Provide a detailed structured output that a final grader can use to penalize or reward the student.
"""

In [7]:
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "Here is the student's essay:\n\n{text}")
])

chain = prompt | structured_llm

In [11]:
# execute 
full_text = "\n\n".join([d.page_content for d in docs])
print(f"Analyzing {len(docs)} pages ({len(full_text)} characters)...")
try:
    # Invoke the chain with the full text
    result = chain.invoke({"text": full_text})
    result_json = result.dict()

except Exception as e:
    print(f"Error during analysis: {e}")

result_json

Analyzing 3 pages (5227 characters)...


C:\Users\HP\AppData\Local\Temp\ipykernel_4436\2905930422.py:7: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  result_json = result.dict()


{'grammar_issues': [{'original_text': 'synthesise',
   'correction': 'synthesize',
   'error_type': 'Spelling (British vs. American English)',
   'explanation': "'Synthesise' is British spelling; consistency should be maintained if American spelling is used elsewhere."},
  {'original_text': 'standard essay writing',
   'correction': 'standard essay-writing',
   'error_type': 'Hyphenation',
   'explanation': 'Compound adjective before a noun requires hyphenation.'},
  {'original_text': 'Chen et al. (2024) demonstrate',
   'correction': 'Chen et al. (2024) demonstrate',
   'error_type': 'Subject-verb agreement',
   'explanation': "'Chen et al.' is plural (referring to multiple authors), so 'demonstrate' is correct."}],
 'vocabulary': {'score': 8,
  'repetitive_words': [],
  'advanced_words_used': ['unprecedented',
   'Socratic',
   'scaffold',
   'pedagogical'],
  'feedback': 'Precise and discipline-appropriate. No unnecessary repetition.'},
 'structure': {'sentence_variety_score': 4,
  

In [12]:
# export to JSON file
LANGUAGE_ANALYSIS_OUTPUT_PATH = "C:\\Users\\HP\\Documents\\repos\\essay-checker-agentic-rag\\data\\processed\\language_analysis_output.json"

import json
from pathlib import Path

Path(LANGUAGE_ANALYSIS_OUTPUT_PATH).write_text(json.dumps(result_json, indent=2))
print(f"Output saved to {LANGUAGE_ANALYSIS_OUTPUT_PATH}")

Output saved to C:\Users\HP\Documents\repos\essay-checker-agentic-rag\data\processed\language_analysis_output.json


### Final

In [None]:
# 1. Install dependencies (if not already installed)
# !pip install langchain langchain-openai langchain-community pydantic pypdf python-dotenv

import os
from typing import List, Optional
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

# Load environment variables
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# --- Step 1: Define the "Language Check" Schema ---
# We need specific linguistic data points for the final grader.

class GrammarError(BaseModel):
    original_text: str = Field(..., description="The exact text snippet containing the error.")
    correction: str = Field(..., description="The corrected version of the text.")
    error_type: str = Field(..., description="Type of error (e.g., 'Subject-Verb Agreement', 'Punctuation', 'Spelling').")
    explanation: str = Field(..., description="Brief explanation of why this is an error.")

class VocabularyAnalysis(BaseModel):
    score: int = Field(..., description="A score from 1-10 rating the vocabulary sophistication.")
    repetitive_words: List[str] = Field(..., description="List of words used excessively (excluding common stop words).")
    advanced_words_used: List[str] = Field(..., description="List of sophisticated or domain-specific words used correctly.")
    feedback: str = Field(..., description="Qualitative feedback on word choice and variety.")

class StructureAnalysis(BaseModel):
    sentence_variety_score: int = Field(..., description="A score from 1-10 on sentence length and structure variety.")
    flow_issues: List[str] = Field(..., description="List of specific issues with flow, transitions, or paragraph cohesion.")
    feedback: str = Field(..., description="Qualitative feedback on the overall structure and flow.")

class LanguageAnalysisResult(BaseModel):
    grammar_issues: List[GrammarError] = Field(default_factory=list, description="List of specific grammar and mechanics errors found.")
    vocabulary: VocabularyAnalysis = Field(..., description="Analysis of the student's vocabulary usage.")
    structure: StructureAnalysis = Field(..., description="Analysis of sentence structure and essay flow.")
    overall_tone: str = Field(..., description="Description of the essay's tone (e.g., 'Formal', 'Casual', 'Inconsistent').")
    summary_critique: str = Field(..., description="A concise summary of the linguistic quality for the final grader.")

# --- Step 2: Setup the Model (DeepSeek via OpenAI API) ---
llm = ChatOpenAI(
    model="deepseek-ai/DeepSeek-V3",
    openai_api_key=OPENAI_API_KEY,
    openai_api_base="https://api.siliconflow.cn/v1",
    temperature=0  # Keep it 0 for consistent analysis
)

# Bind the robust schema
structured_llm = llm.with_structured_output(LanguageAnalysisResult)

# --- Step 3: Define the Prompt ---
system_prompt = """
You are a strict Linguistic Professor and Editor. 
Your goal is to analyze the student's essay purely on **language mechanics, style, and structure**. 
Do NOT grade the content or arguments; focus only on HOW it is written.

Analyze the text for:
1. Grammar, punctuation, and spelling errors (be specific).
2. Vocabulary sophistication and redundancy.
3. Sentence structure variety (simple vs. complex) and flow.
4. Tone consistency.

Provide a detailed structured output that a final grader can use to penalize or reward the student.
"""

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "Here is the student's essay:\n\n{text}")
])

chain = prompt | structured_llm

# --- Step 4: Robust Execution Pipeline ---

def run_language_check(pdf_path: str):
    print(f"Loading PDF from: {pdf_path}")
    loader = PyPDFLoader(pdf_path)
    docs = loader.load()
    
    if not docs:
        print("Error: No documents found.")
        return None

    # CRITICAL CHANGE: 
    # For language checking (flow, repetition, consistency), the model needs 
    # the FULL context. DeepSeek has a large context window, so we merge pages.
    full_text = "\n\n".join([d.page_content for d in docs])
    
    print(f"Analyzing {len(docs)} pages ({len(full_text)} characters)...")
    
    try:
        # Invoke the chain with the full text
        result = chain.invoke({"text": full_text})
        
        # Output is already a Pydantic object (LanguageAnalysisResult)
        return result.dict()

    except Exception as e:
        print(f"Error during analysis: {e}")
        return None

# --- Usage ---
pdf_path = r"C:\Users\HP\Documents\repos\essay-checker-agentic-rag\data\raw\essay_content.pdf"
result_json = run_language_check(pdf_path)

if result_json:
    import json
    # Print pretty JSON
    print(json.dumps(result_json, indent=2))
else:
    print("Analysis failed.")