In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files
# stored there. `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install openai pdfplumber pytesseract Pillow python-dotenv

In [None]:
import os
import re
from getpass import getpass

import openai
import pdfplumber
import pytesseract
from dotenv import load_dotenv
from PIL import Image

load_dotenv()

# Folder on the mounted Drive whose *.pdf files main() lists and grades.
PDF_FOLDER = "/content/drive/MyDrive/creators-copilot/tests/evaluate/answer_sheets"
# When True, a page with no extractable text is run through Tesseract OCR
# instead of being skipped (see extract_text_from_pdf).
USE_OCR_IF_TEXT_EMPTY = True

In [None]:
# Never store the API key in the notebook: the key that used to be hardcoded in
# this cell is exposed to anyone holding a copy of the file and should be
# revoked. Read it from the environment instead (load_dotenv() has already
# loaded any .env file), falling back to a hidden interactive prompt.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or getpass("OpenAI API key: ")
# Keep the module-level default in sync with the value used to build clients.
openai.api_key = OPENAI_API_KEY

In [None]:
def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path`` as one string.

    Each page contributes its extracted text followed by a newline. A page
    with no extractable text is OCR'd from a 300 dpi rendering when
    ``USE_OCR_IF_TEXT_EMPTY`` is true, and skipped otherwise. Leading and
    trailing whitespace is stripped from the combined result.
    """
    page_chunks = []
    with pdfplumber.open(file_path) as document:
        for pdf_page in document.pages:
            page_text = pdf_page.extract_text()
            if not page_text:
                if not USE_OCR_IF_TEXT_EMPTY:
                    continue
                # OCR fallback: render the page and let Tesseract read it.
                rendered = pdf_page.to_image(resolution=300)
                page_text = pytesseract.image_to_string(rendered.original)
            page_chunks.append(page_text + "\n")
    return "".join(page_chunks).strip()


def parse_questions_answers(text):
    """
    Split answer-sheet text into one record per question.

    Records are located by the markers ``**Question N:**`` and
    ``**Student Answer:**``. An answer runs until a newline followed by the
    next ``**Question`` marker, or to the end of the text.

    Returns a list of dicts, in order of appearance, with the keys
    ``question_number`` (int), ``question`` and ``answer`` (both stripped of
    surrounding whitespace). Text without these markers yields an empty list.
    """
    qa_regex = re.compile(
        r"\*\*Question\s*(\d+):\*\*\s*(.*?)\n\*\*Student Answer:\*\*\s*(.*?)(?=\n\*\*Question|\Z)",
        re.DOTALL,
    )
    return [
        {
            "question_number": int(found.group(1)),
            "question": found.group(2).strip(),
            "answer": found.group(3).strip(),
        }
        for found in qa_regex.finditer(text)
    ]


def add_max_marks(qa_list, max_marks):
    """Attach a ``max_marks`` entry to every question record, in place.

    ``max_marks`` maps question number to available marks; a question whose
    number is missing from the mapping gets 0. The same list object that was
    passed in is returned.
    """
    for record in qa_list:
        record.update(max_marks=max_marks.get(record["question_number"], 0))
    return qa_list



# Input grading prompt manually
def input_grading_prompt(answer_text=None):
    """Read a multi-line grading prompt from stdin and append the student's answers.

    Lines are read with ``input()`` until one that equals ``END`` after
    stripping whitespace; that sentinel line is not part of the prompt.

    Args:
        answer_text: Extracted answer-sheet text to append. When omitted, the
            notebook-level ``answer_text`` variable is used, which is what
            calls without an argument previously relied on.

    Returns:
        The grading prompt followed by the student's answers, as one string.

    Raises:
        NameError: If ``answer_text`` is omitted and no notebook-level
            ``answer_text`` exists.
    """
    if answer_text is None:
        # Backward compatibility for argument-less calls. Prefer passing the
        # text explicitly so the prompt always matches the file being graded.
        if "answer_text" not in globals():
            raise NameError("name 'answer_text' is not defined")
        answer_text = globals()["answer_text"]

    print("\n✏️ Paste your grading prompt (type END to finish):")
    lines = []
    while True:
        line = input()
        if line.strip() == "END":
            break
        lines.append(line)
    grading_prompt = "\n".join(lines)

    return f"{grading_prompt}\n\nHere are the student's answers:\n{answer_text}"



def call_openai(prompt, model="gpt-4", temperature=0.3):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full text of the user message.
        model: Chat model name.
        temperature: Sampling temperature passed to the API.

    Returns:
        The content of the first choice in the response.
    """
    # openai>=1.0 removed `openai.ChatCompletion`. The module-level client used
    # here reads `openai.api_key`, so no client object has to exist yet.
    response = openai.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )
    return response.choices[0].message.content


def main():
    """Grade every PDF in ``PDF_FOLDER`` interactively.

    For each file: extract its text, show the first 1000 characters, ask for a
    grading prompt on stdin, send prompt plus answers to OpenAI, print the
    reply and save it next to the PDF as ``<name>_graded.txt``.
    """
    print("Grading PDFs from:", PDF_FOLDER)
    # os.listdir order is arbitrary; sort so runs process files in a stable order.
    for filename in sorted(os.listdir(PDF_FOLDER)):
        if not filename.lower().endswith(".pdf"):
            continue

        file_path = os.path.join(PDF_FOLDER, filename)
        print(f"\n📄 Processing: {filename}")

        # Extract text
        answer_text = extract_text_from_pdf(file_path)

        print("\n✅ Extracted text:")
        print(answer_text[:1000])  # Show first 1000 characters

        # Input grading prompt manually, for THIS file's answers.
        full_prompt = input_grading_prompt(answer_text)

        print("\n🧠 Sending to OpenAI...")
        result = call_openai(full_prompt)
        print("\n🎯 Grading result:\n")
        print(result)

        # Optionally save to file. splitext handles any extension casing; a
        # plain replace(".pdf", ...) leaves "X.PDF" unchanged, which made the
        # grading text overwrite the source PDF.
        stem, _ = os.path.splitext(filename)
        result_file = os.path.join(PDF_FOLDER, f"{stem}_graded.txt")
        with open(result_file, "w", encoding="utf-8") as f:
            f.write(result)
        print(f"✅ Saved grading to: {result_file}")

## Text extraction

In [None]:
# Marks available per question: questions 1 through 10, four marks each.
max_marks = dict.fromkeys(range(1, 11), 4)
# Answer sheet used for the mark-scheme experiment below.
file_path = os.path.join(PDF_FOLDER, 'extracted_business_(marwa).pdf')

In [None]:
# Pull the text out of the selected answer sheet. This can be slow when pages
# fall back to OCR, so it lives in its own cell.
answer_text = extract_text_from_pdf(file_path)

In [None]:
# Parse the sheet into question/answer records and tag each with its max marks.
qa_list = add_max_marks(parse_questions_answers(answer_text), max_marks)

In [None]:
# Instruction text asking the model for a per-question "Scheme of Evaluation".
# It is sent verbatim as the first part of the request, so edits here change
# what the model is asked to produce.
ms_prompt = """Generate a numbered Scheme of Evaluation in this format, for each Question X:

Question X: <verbatim question>

Answer template: List the core concepts or their clear equivalents that a full answer must reference, arranged in logical order.

Marking Scheme:

(A marks) A bullet that describes the first key expectation (concept or application) in context.

(B marks) A bullet for the second expectation.

…

(–C marks) A bullet describing any automatic deduction (for example missing required context or terms).

Ensure that:

The sum of A + B + … equals the total marks for Question X.

Assessment Objectives are stated briefly at the top (for example, Conceptual Understanding, Application & Problem-Solving, Relevance & Specificity).

Please include a Notes section with each question’s mark scheme:

Open-ended questions: any one comprehensive, context-relevant answer can earn full marks.

Minor calculation errors should not cost any marks if reasoning is sound.

Coding expectations or no alternative approaches rules when applicable.

Bullets must call out the concept or context the student needs to cover, but need not use verbatim phrasing—equivalent meaning is fine. Provide marks liberally if the student shows rich understanding even if not using the right terms.
"""

In [None]:
# Request text: the mark-scheme instructions followed by the parsed records.
# The f-string renders qa_list with its default string form, so each record's
# question, answer and max_marks all appear in the prompt.
full_prompt = f"{ms_prompt}\n\nPlease create a mark scheme for the following questions: {qa_list}"

In [None]:
# Explicit client for the openai>=1.0 API, authenticated with OPENAI_API_KEY.
client = openai.OpenAI(api_key=OPENAI_API_KEY)


def call_openai(prompt, model="gpt-4", temperature=0.3):
    """Send ``prompt`` as a single user message via ``client`` and return the reply text.

    Note: this definition replaces the ``call_openai`` defined earlier in the
    notebook once this cell has run.

    Args:
        prompt: Full text of the user message.
        model: Chat model name (for example "gpt-4" or "gpt-3.5-turbo").
        temperature: Sampling temperature passed to the API.

    Returns:
        The content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )
    return response.choices[0].message.content


# Generated mark scheme text for the prompt assembled above.
response = call_openai(full_prompt)

In [None]:
from PIL import Image
import os

def compress_to_target(input_path, output_path, target_kb, tol_kb=5):
    """Save ``input_path`` as a JPEG at ``output_path`` sized close to ``target_kb``.

    Binary-searches the JPEG ``quality`` setting, writing a trial file to
    ``output_path`` on every step. The search stops early when a trial lands
    within ``tol_kb`` of the target. Otherwise the highest quality whose trial
    was smaller than the target is used. If every trial is larger than the
    target, the lowest quality in the search range is used.

    Args:
        input_path: Path of the image to read.
        output_path: Path of the JPEG to write (overwritten during the search).
        target_kb: Desired file size in kilobytes (1 KB = 1024 bytes).
        tol_kb: Accepted distance from ``target_kb``, in kilobytes.

    Returns:
        None. The chosen quality and the final size are printed.
    """
    # Modes Pillow's JPEG writer accepts; anything else (alpha, palette, ...)
    # is converted to RGB first instead of failing on save.
    jpeg_modes = ("1", "L", "RGB", "CMYK", "YCbCr")

    with Image.open(input_path) as source:
        img = source if source.mode in jpeg_modes else source.convert("RGB")

        # JPEG quality ranges 1 (worst) to 95 (best)
        low, high = 1, 95
        # Start from the smallest setting: if no trial ever fits under the
        # target, the fallback must be the lowest quality. It used to be 95,
        # which saved at the highest quality in exactly that case.
        best_q = low
        while low <= high:
            mid = (low + high) // 2
            img.save(output_path, "JPEG", quality=mid, optimize=True)
            size_kb = os.path.getsize(output_path) / 1024
            if abs(size_kb - target_kb) <= tol_kb:
                best_q = mid
                break
            if size_kb > target_kb:
                high = mid - 1
            else:
                best_q = mid
                low = mid + 1

        # Final save at best_q: the last trial written is not necessarily the
        # best one, so re-encode once more.
        img.save(output_path, "JPEG", quality=best_q, optimize=True)

    final_size = os.path.getsize(output_path) / 1024
    print(f"Saved {output_path!r} at quality={best_q}, size={final_size:.1f} KB")

# Usage
# Re-encode the signature image on the mounted Drive, aiming for 140 KB.
# tol_kb is left at its default of 5.
input_path = "/content/drive/MyDrive/signature.jpg"
output_path = "/content/drive/MyDrive/signature_compressed.jpg"
compress_to_target(input_path, output_path, target_kb=140)


Saved '/content/drive/MyDrive/signature_compressed.jpg' at quality=72, size=139.9 KB
