# Tutorial

In [None]:
import sys
from pathlib import Path
import requests
import json
from dotenv import load_dotenv
import os

# Make the parent of the current working directory importable so that the
# `app` package used below can be resolved from inside this notebook.
project_root = Path.cwd().parent
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

# Project helpers; these imports must stay after the sys.path tweak above.
from app.helper.pdf_processor import *
from app.helper.classifer import *

# Load settings from the .env file one level above the working directory,
# then read the API token and the two candidate host addresses from it.
load_dotenv("../.env")
token, ip_aws, ip_local = (
    os.getenv(name) for name in ("API_SECRET_TOKEN", "IP_AWS", "IP_LOCAL")
)

## Process original PDF files

### Handbook

In [3]:
# Run the handbook pipeline for the "pgr" level only; per the recorded output
# this extracts the PDF text, saves it locally and uploads it to S3.
process_all_handbooks(levels="pgr")


=== Processing PGR handbook ===
Extracting text from: /Users/qianhuilin/Desktop/AI_assistant/data/original_pdf/handbook-PGR.pdf
Extracted 140182 characters of text.
Saved extracted text locally to: /Users/qianhuilin/Desktop/AI_assistant/data/extracted/handbook-PGR.txt
Uploaded extracted text to s3://original-text-bucket/handbook/handbook-pgr.txt


### Academic Integrity Regulation

In [2]:
# Process the academic-integrity regulation PDF; per the recorded output this
# extracts the text, saves it locally and uploads it to S3.
process_other_document(type="academic-integrity")


=== Processing academic-integrity document ===
Extracting text from: /Users/qianhuilin/Desktop/AI_assistant/data/original_pdf/academic-integrity.pdf
Extracted 47863 characters of text.
Saved extracted text locally to: /Users/qianhuilin/Desktop/AI_assistant/data/extracted/academic-integrity.txt
Uploaded extracted text to s3://original-text-bucket/academic/academic-integrity.txt


## Ask

### Health check

In [None]:

# Choose the API host: the local address when MODE=development, otherwise AWS.
ip = ip_local if os.getenv("MODE") == "development" else ip_aws
if not ip:
    # Without this guard the URL would silently become "http://None:8080".
    raise RuntimeError(
        "No API host configured: set IP_LOCAL (MODE=development) or IP_AWS in the .env file"
    )
print("Using IP:", ip)
BASE_URL = f"http://{ip}:8080"

# The timeout keeps the cell from hanging forever when the service is
# unreachable; raise_for_status() turns HTTP error responses into exceptions
# instead of printing an error body as if it were a healthy reply.
response = requests.get(f"{BASE_URL}/health", timeout=10)
response.raise_for_status()
print("Health Check:", response.json())

Using IP: localhost
Health Check: {'status': 'ok'}


### Enter question

In [3]:
# Request body for the first demo question sent to the ask endpoint.
question_payload = dict(
    question="How long can I be registered for a PhD?",
    level="pgr",
    origin="international",
)

### Classification

In [4]:
# Classify the question text; the resulting category selects the
# /ask_<category> endpoint used by the request cell that follows.
question_text = question_payload["question"]
category = classify_category(question_text)
print("Classified Category:", category)

Classified Category: handbook


In [5]:
# Send the first question to the endpoint matching the predicted category.
# The timeout prevents an indefinite hang if the service stops responding
# (120 s is a generous guess for answer generation - tune as needed).
response = requests.post(
    f"{BASE_URL}/ask_{category}",
    json=question_payload,
    headers={"Authorization": f"Bearer {token}"},
    timeout=120,
)
# Surface HTTP error responses directly rather than as a KeyError below.
response.raise_for_status()

# Decode the JSON body once and reuse it for every printed field.
result = response.json()
print("Answer:", result["answer"])
print("\nSources:", result["context_used"])
print("\nCollection_used:", result["collection_used"])

Answer: As an international student registered for a PhD, you have a minimum full-time registration period of 48 months and a maximum of 60 months. Any extension beyond the maximum period must be approved by Student and Programme Administration, based on evidence of your progress, and can go up to an absolute maximum of 84 months. If you are a part-time student, the minimum registration period is 48 months, with a maximum of 84 months as well. Extensions for part-time studies follow similar guidelines.

Sources: ['rd appropriate for scholarly publication.\nINTEGRATED PHD PROGRAMMES\n5. A candidate shall register at the outset for a PhD with a minimum full-time registration\nperiod of forty-eight months and maximum of sixty months. Any extension of the maximum\nperiod must be approved by Student and Programme Administration after consideration of\nevidence of the student’s progress submitted by the department concerned, up to an\nabsolute maximum of eighty-four months.\n6. A candidate s

In [1]:
# Request body for the second demo question sent to the ask endpoint.
question_payload_2 = dict(
    question="How is plagiarism detected in theses?",
    level="pgr",
    origin="international",
)

In [7]:
# Classify the second question; this overwrites `category` so the request
# cell that follows targets the endpoint for this question.
question_text = question_payload_2["question"]
category = classify_category(question_text)
print("Classified Category:", category)

Classified Category: academic_integrity


In [8]:
# Send the second question to the endpoint matching the predicted category.
# The timeout prevents an indefinite hang if the service stops responding
# (120 s is a generous guess for answer generation - tune as needed).
response = requests.post(
    f"{BASE_URL}/ask_{category}",
    json=question_payload_2,
    headers={"Authorization": f"Bearer {token}"},
    timeout=120,
)
# Surface HTTP error responses directly rather than as a KeyError below.
response.raise_for_status()

# Decode the JSON body once and reuse it for every printed field.
result = response.json()
print("Answer:", result["answer"])
print("\nSources:", result["context_used"])
print("\nCollection_used:", result["collection_used"])

Answer: Plagiarism in theses is typically detected through a combination of the following methods:

1. **Plagiarism Detection Software**: Universities often use software tools such as Turnitin, iThenticate, or Grammarly to scan submissions for similarities with existing published works, online content, and other students' theses. These tools highlight matches and provide a similarity report.

2. **Manual Review by Examiners**: Examining committees or supervisors may read the theses carefully to identify any instances of plagiarism. They look for inconsistencies in writing style, sudden shifts in quality, or sections that do not align with the student's voice.

3. **Check for Citations and References**: Reviewers assess whether all sources are properly cited and whether the student is correctly attributing ideas and content that are not their own. A lack of citations or incorrect citation practices can indicate potential plagiarism.

4. **Historical Analysis of Student’s Work**: Examine

In [9]:
# Decode the most recent response and pretty-print its "history" field
# (an empty list is shown when the key is absent).
data = response.json()
history = data.get("history", [])

print("\nHistory:")
# ensure_ascii=False keeps non-ASCII characters readable in the dump.
print(json.dumps(history, indent=2, ensure_ascii=False))


History:
[
  {
    "question": "How long can I be registered for a PhD?",
    "answer": "As an international student registered for a PhD, you have a minimum full-time registration period of 48 months and a maximum of 60 months. Any extension beyond the maximum period must be approved by Student and Programme Administration, based on evidence of your progress, and can go up to an absolute maximum of 84 months. If you are a part-time student, the minimum registration period is 48 months, with a maximum of 84 months as well. Extensions for part-time studies follow similar guidelines."
  },
  {
    "question": "How is plagiarism detected in theses?",
    "answer": "Plagiarism in theses is typically detected through a combination of the following methods:\n\n1. **Plagiarism Detection Software**: Universities often use software tools such as Turnitin, iThenticate, or Grammarly to scan submissions for similarities with existing published works, online content, and other students' theses. Th