In [None]:
pip install PyPDF2

In [3]:
import os
import re
import json
from PyPDF2 import PdfReader

# ✅ Define folder paths
# Keys are the category names used as top-level keys of the saved JSON;
# values are the folders (relative to the working directory) scanned for PDFs.
FOLDER_PATHS = {
    'Depression': './Depression & Anxiety',
    'Personality & Behaviour': './Personality & Behaviour Analysis',
    'Stress & Coping': './Stress & Coping Mechanisms',
    'Trauma & PTSD': './Trauma & PTSD'
}

# ✅ Extract text from PDF
def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path``.

    Pages are joined with a newline so the last line of one page is never
    fused with the first line of the next. Callers that split the result
    into lines therefore see each page boundary as a line break.

    Args:
        file_path: Path to the PDF; passed straight to ``PdfReader``.

    Returns:
        A single string containing the text of all pages, in page order.
        A page whose ``extract_text()`` result is falsy contributes an
        empty string.
    """
    reader = PdfReader(file_path)
    # `or ""` guards against a page yielding no text (None / empty string).
    return "\n".join(page.extract_text() or "" for page in reader.pages)

# ✅ Advanced line-wise question grouping
def extract_lines_as_questions(text):
    """Split ``text`` into its non-blank lines, trimmed of surrounding whitespace.

    Args:
        text: The raw text to split. Lines are separated on ``"\\n"`` only.

    Returns:
        A list of the stripped lines in their original order; lines that are
        empty after stripping are dropped.
    """
    cleaned = []
    for raw_line in text.strip().split('\n'):
        candidate = raw_line.strip()
        if candidate:
            cleaned.append(candidate)
    return cleaned

# ✅ Extract structured questions from folder
def extract_questions_from_folder(folder_path):
    """Collect the unique text lines from every PDF directly inside a folder.

    Each file whose name ends in ``.pdf`` (in any letter case) is read with
    ``extract_text_from_pdf`` and split with ``extract_lines_as_questions``.
    Lines are de-duplicated across all files in the folder.

    Args:
        folder_path: Directory to scan. Only its immediate entries are
            considered; sub-folders are not descended into.

    Returns:
        A sorted list of the distinct lines found. Empty when the folder
        holds no readable PDFs.

    Raises:
        OSError: If ``folder_path`` itself cannot be listed. Failures on
            individual files are reported on stdout and skipped.
    """
    question_set = set()
    # Sorted so files are processed (and logged) in a stable order,
    # independent of the order the OS returns directory entries in.
    for file in sorted(os.listdir(folder_path)):
        # Case-insensitive match so e.g. "scan.PDF" is not silently skipped.
        if not file.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(folder_path, file)
        try:
            print(f"📄 Processing {file_path}...")
            text = extract_text_from_pdf(file_path)
            lines = extract_lines_as_questions(text)
            question_set.update(lines)
        except Exception as e:
            # Deliberate best effort: one unreadable PDF must not abort
            # the rest of the folder, so report it and move on.
            print(f"❌ Error reading {file_path}: {e}")
    return sorted(question_set)

# ✅ Extract and structure all category questions
# Maps each category name to a one-element list holding {"questions": [...]}.
all_category_questions = {}

for category, path in FOLDER_PATHS.items():
    # isdir rather than exists: a plain file at this path would otherwise
    # pass the check and make os.listdir raise inside the extractor.
    if not os.path.isdir(path):
        print(f"❌ Folder not found: {path}")
        continue

    print(f"\n🔎 Extracting lines for {category}...")
    extracted_lines = extract_questions_from_folder(path)
    if not extracted_lines:
        # Categories with nothing extracted are left out of the result.
        print(f"⚠️ No questions found in {category}")
        continue

    all_category_questions[category] = [{
        "questions": extracted_lines
    }]

# ✅ Display example
print("\n📝 Preview of extracted content:")
for category, blocks in all_category_questions.items():
    print(f"\n--- {category} ---")
    # Only the first block of each category is previewed, capped at 10 lines.
    preview = blocks[0]["questions"][:10]
    for i, q in enumerate(preview, start=1):
        print(f"{i}. {q}")
    # Visual separator between categories.
    rule = "-" * 30
    print(f"...{rule}")

# ✅ Save to JSON file
# Write UTF-8 with ensure_ascii=False so non-ASCII characters are stored
# verbatim instead of as \uXXXX escapes.
output_path = 'questions.json'
with open(output_path, mode='w', encoding='utf-8') as f:
    json.dump(
        all_category_questions,
        f,
        ensure_ascii=False,
        indent=4,
    )

print(f"\n✅ Structured questions saved to '{output_path}'")



🔎 Extracting lines for Depression...
📄 Processing ./Depression & Anxiety\Beck Anxiety Inventory (BAI) – Measures anxiety levels..pdf...
📄 Processing ./Depression & Anxiety\Beck-Depression-Inventory-BDI(self reported test for depression symptoms).pdf...
📄 Processing ./Depression & Anxiety\GAD-7_Anxiety-updated_0(Assesses anxiety levels.).pdf...
📄 Processing ./Depression & Anxiety\HAMILTON-DEPRESSION(Used for clinical depression assessment.).pdf...
📄 Processing ./Depression & Anxiety\PHQ9 id date 08.03(to measure severity of dpession).pdf...

🔎 Extracting lines for Personality & Behaviour...
📄 Processing ./Personality & Behaviour Analysis\MBTI-Personality-Type-Test.pdf...
📄 Processing ./Personality & Behaviour Analysis\MMPI-2-Test-Questions.pdf...

🔎 Extracting lines for Stress & Coping...
📄 Processing ./Stress & Coping Mechanisms\COPE Inventory – Assesses different coping styles (e.g., avoidance, problem-solving)..pdf...
📄 Processing ./Stress & Coping Mechanisms\Perceived Stress Scale 