In [None]:
pip install PyPDF2

In [10]:
import os
import re
from PyPDF2 import PdfReader
import json

# ✅ Define folder paths
# Maps each category label (later used as a key of the output dictionary)
# to the folder that is scanned for that category's PDF files. The paths are
# relative, so they resolve against the current working directory.
FOLDER_PATHS = {
    'Depression': './Depression & Anxiety',
    'Personality & Behaviour': './Personality & Behaviour Analysis',
    'Stress & Coping': './Stress & Coping Mechanisms',
    'Trauma & PTSD': './Trauma & PTSD'
}

# ✅ Extract text from PDF
def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at *file_path*, concatenated.

    Pages are joined in document order with no separator. A page whose
    ``extract_text()`` result is falsy (``None`` or ``""``) contributes an
    empty string.
    """
    pdf = PdfReader(file_path)
    return "".join(page.extract_text() or "" for page in pdf.pages)

# ✅ Extract fully generated questions (more aggressive)
def extract_full_questions(text):
    """Collect sentence-like strings from *text* and return them sorted.

    Two passes feed one de-duplicating set:

    1. Every single line that ends in ``?`` or ``.`` (trailing whitespace
       ignored) is taken as-is.
    2. Non-blank lines are accumulated, joined by single spaces, until a line
       ending in ``?`` or ``.`` closes the segment, which is then recorded.
       An unfinished segment is thrown away when its latest line does not
       look like the start of a question and the segment already holds more
       than ten spaces (counting the one in front of each line).

    Text still pending when the input ends is discarded.

    Returns:
        A list of unique strings in ascending ``str`` order.
    """
    sentence_line = re.compile(r"^(.*?[?\.])\s*$", re.MULTILINE)
    starter = re.compile(r"^(?:Q\.\s*\d+|\d+\.|[A-Z][a-z]+|[A-Z]{2,})\b", re.MULTILINE)

    # Pass 1: individual lines that already end a sentence.
    found = {hit.strip() for hit in sentence_line.findall(text) if hit.strip()}

    # Pass 2: stitch together sentences that wrap across several lines.
    pending = []
    for raw_line in text.strip().split('\n'):
        fragment = raw_line.strip()
        if not fragment:
            continue

        pending.append(fragment)

        if fragment.endswith(('?', '.')):
            found.add(" ".join(pending))
            pending = []
            continue

        # One space precedes every fragment in the running segment, so the
        # leading space is part of the count used by the length heuristic.
        running = " " + " ".join(pending)
        if running.count(' ') > 10 and not starter.match(fragment):
            pending = []

    ordered = list(found)
    ordered.sort()
    return ordered

# ✅ Extract fully generated questions from all PDFs in a folder
def extract_questions_from_folder(folder_path):
    """Extract questions from every PDF file directly inside *folder_path*.

    Files are visited in sorted name order so the result is reproducible
    (``os.listdir`` itself returns entries in arbitrary order). The ``.pdf``
    extension is matched case-insensitively, so ``SCAN.PDF`` is processed too.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A flat list with the questions of each file appended in visiting
        order. Duplicates across files are kept.

    Raises:
        OSError: If *folder_path* cannot be listed. Failures on individual
            files are reported on stdout and do not abort the scan.
    """
    all_questions = []
    for file in sorted(os.listdir(folder_path)):
        if not file.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(folder_path, file)
        try:
            print(f"📄 Processing {file_path}...")
            text = extract_text_from_pdf(file_path)
            all_questions.extend(extract_full_questions(text))
        except Exception as e:
            # Deliberate best-effort boundary: one unreadable or malformed
            # PDF must not stop the remaining files from being processed.
            print(f"❌ Error reading {file_path}: {e}")
    return all_questions

# ✅ Generate the combined questions dictionary
all_category_questions = {}

for category, path in FOLDER_PATHS.items():
    # Missing folders are reported and skipped.
    if not os.path.exists(path):
        print(f"❌ Folder not found: {path}")
        continue

    print(f"\n🔎 Extracting full questions for {category}...")
    extracted_questions = extract_questions_from_folder(path)

    # Categories that yield nothing get no entry in the result at all.
    if not extracted_questions:
        print(f"⚠️ No full questions found in {category}")
        continue

    # Each question is stored with an empty "options" mapping.
    all_category_questions[category] = [
        {"question": q, "options": {}} for q in extracted_questions
    ]

# ✅ Display the generated questions
print("\n📝 Extracted Fully Generated Questions:")
for category, questions in all_category_questions.items():
    print(f"\n--- {category} ---")
    for i, item in enumerate(questions, start=1):
        print(f"{i}. {item['question']}")
    print("-" * 20)

# ✅ Save the combined questions to questions.json
with open('questions.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(all_category_questions, indent=4))

print("\n✅ Fully generated questions saved to 'questions.json'")


🔎 Extracting full questions for Depression...
📄 Processing ./Depression & Anxiety\Beck Anxiety Inventory (BAI) – Measures anxiety levels..pdf...
📄 Processing ./Depression & Anxiety\Beck-Depression-Inventory-BDI(self reported test for depression symptoms).pdf...
📄 Processing ./Depression & Anxiety\GAD-7_Anxiety-updated_0(Assesses anxiety levels.).pdf...
📄 Processing ./Depression & Anxiety\HAMILTON-DEPRESSION(Used for clinical depression assessment.).pdf...
📄 Processing ./Depression & Anxiety\PHQ9 id date 08.03(to measure severity of dpession).pdf...

🔎 Extracting full questions for Personality & Behaviour...
📄 Processing ./Personality & Behaviour Analysis\MBTI-Personality-Type-Test.pdf...
📄 Processing ./Personality & Behaviour Analysis\MMPI-2-Test-Questions.pdf...

🔎 Extracting full questions for Stress & Coping...
📄 Processing ./Stress & Coping Mechanisms\COPE Inventory – Assesses different coping styles (e.g., avoidance, problem-solving)..pdf...
📄 Processing ./Stress & Coping Mechani