In [None]:
pip install PyPDF2

In [1]:
import os
import re
import json
from PyPDF2 import PdfReader

# ✅ Define folder paths (ensure these paths are correct)
FOLDER_PATHS = {
    # category label -> relative folder that is scanned for '.pdf' files
    "Depression": "./Depression & Anxiety",
    "Personality & Behaviour": "./Personality & Behaviour Analysis",
    "Stress & Coping": "./Stress & Coping Mechanisms",
    "Trauma & PTSD": "./Trauma & PTSD",
}

# ✅ Extract text from PDF
def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path``, concatenated.

    Pages are visited in the order ``PdfReader.pages`` yields them. A page
    whose ``extract_text()`` result is falsy (``None`` or ``""``) contributes
    an empty string. No separator is inserted between pages.
    """
    pdf = PdfReader(file_path)
    return "".join(page.extract_text() or "" for page in pdf.pages)

# ✅ More robust question extraction
# A line matching any of these begins a new question: "1. ", "A. ", "a. ",
# a leading interrogative word, a "-"/"–" bullet, or a number followed by ".".
_QUESTION_START_PATTERNS = (
    re.compile(r'^(?:\d+\.|[A-Z]\.|[a-z]\.)\s'),
    re.compile(r'^(What|Why|How|When|Where|Who|Is|Are|Do|Does)\s', re.IGNORECASE),
    re.compile(r'^\s*[-–]\s'),
    re.compile(r'^\d+\s*\.'),
)

# A line containing any of these markers closes the current question block.
_BLOCK_END_RE = re.compile(
    r'(?:TOTAL:|Scoring:|Interpretation:|add columns|End of test|Disclaimer:|Note:)',
    re.IGNORECASE,
)

# NOTE(review): this pattern requires a literal newline, but it is only ever
# applied to stripped, newline-split lines, so it can never match. It is kept
# unchanged to avoid altering HAMILTON output; confirm whether `$` was intended.
_HAMILTON_HEADING_RE = re.compile(r'^\d+\s[A-Z\s]+\n')

# Cleanup patterns applied to each collected question.
_LEADING_LABEL_RE = re.compile(r'^(?:\d+\.|[A-Z]\.|[a-z]\.)\s+')
_LEADING_NUMBER_RE = re.compile(r'^\d+\s*\.\s*')
_WHITESPACE_RE = re.compile(r'\s+')


def extract_lines_as_questions(text, filename):
    """Split extracted PDF text into a list of cleaned question strings.

    The text is processed line by line (blank lines are dropped, each line is
    stripped):

    * Lines before the first question start, and lines after a block has been
      closed, are ignored.
    * A line matching a question-start pattern begins a new question.
    * While inside a block, a line that is not a question start is appended
      to the current question, so a question wrapped over several lines is
      returned as one entry. Previously the continuation branch was
      unreachable and every wrapped line became a separate entry.
    * A line containing an end marker (e.g. "Scoring:", "TOTAL:", "Note:")
      closes the block; the marker line itself is discarded.

    Each collected question then has a leading "1." / "A." / "a." style label
    removed and internal whitespace collapsed to single spaces.

    Args:
        text: Raw text of one document. Empty or ``None`` yields ``[]``.
        filename: Base name of the source file; used only to enable the
            HAMILTON-DEPRESSION specific end-of-block check.

    Returns:
        A list of non-empty question strings in document order.
    """
    if not text:
        return []

    lines = [line.strip() for line in text.strip().split('\n') if line.strip()]
    is_hamilton = "HAMILTON-DEPRESSION" in filename

    questions = []
    current_question = ""
    in_question_block = False

    for line in lines:
        starts_question = any(p.match(line) for p in _QUESTION_START_PATTERNS)

        # Outside a block, only a question-start line is of interest.
        if not (starts_question or in_question_block):
            continue

        # End markers are checked before anything else so that a marker line
        # is never stored, even when it also looks like a question start.
        if _BLOCK_END_RE.search(line) or (
            is_hamilton and _HAMILTON_HEADING_RE.search(line)
        ):
            if current_question:
                questions.append(current_question.strip())
            current_question = ""
            in_question_block = False
            continue

        if starts_question:
            if current_question:
                questions.append(current_question.strip())
            current_question = line
            in_question_block = True
        else:
            # Continuation of the question opened by an earlier start line.
            current_question += " " + line

    if current_question:
        questions.append(current_question.strip())

    cleaned_questions = []
    for question in questions:
        cleaned = _LEADING_LABEL_RE.sub('', question).strip()
        cleaned = _LEADING_NUMBER_RE.sub('', cleaned).strip()
        cleaned = _WHITESPACE_RE.sub(' ', cleaned).strip()
        if cleaned:
            cleaned_questions.append(cleaned)

    return cleaned_questions


# ✅ Extract structured questions from folder
def extract_questions_from_folder(folder_path, category):
    """Collect the questions of every '.pdf' file directly inside a folder.

    Each entry of ``os.listdir(folder_path)`` whose name ends with '.pdf' is
    read and parsed. Files that yield no questions are left out. Any exception
    raised while handling a file is printed and that file is skipped, so one
    unreadable PDF does not stop the rest of the folder.

    Args:
        folder_path: Directory to scan (not recursive).
        category: Label stored under the "category" key of the result.

    Returns:
        ``{"category": category, "questions": [...]}`` where each list item is
        ``{"filename": <base name>, "questions": [<question>, ...]}``.
    """
    per_file_results = []

    for entry in os.listdir(folder_path):
        if not entry.endswith('.pdf'):
            continue

        pdf_path = os.path.join(folder_path, entry)
        try:
            print(f"📄 Processing {pdf_path}...")
            pdf_name = os.path.basename(entry)
            # The file name is passed along for filename-specific parsing rules.
            found = extract_lines_as_questions(extract_text_from_pdf(pdf_path), pdf_name)
            if found:
                per_file_results.append({"filename": pdf_name, "questions": found})
        except Exception as e:
            print(f"❌ Error reading {pdf_path}: {e}")

    return {"category": category, "questions": per_file_results}


# ✅ Extract and structure all category questions
all_category_questions = []

for category, path in FOLDER_PATHS.items():
    # A missing folder is reported and skipped; the remaining categories still run.
    if not os.path.exists(path):
        print(f"❌ Folder not found: {path}")
        continue

    print(f"\n🔎 Extracting questions for {category} from {path}...")
    extracted_data = extract_questions_from_folder(path, category)
    if not extracted_data["questions"]:
        print(f"⚠️ No questions found in {category}")
        continue
    all_category_questions.append(extracted_data)

# ✅ Preview: show at most the first five questions of each file
print("\n📝 Preview of extracted content:")
for category_data in all_category_questions:
    print(f"\n--- {category_data['category']} ---")
    for file_data in category_data["questions"]:
        print(f"\n  -- {file_data['filename']} --")
        if not file_data["questions"]:
            print("    No questions extracted.")
            continue
        for i, q in enumerate(file_data["questions"][:5], 1):
            print(f"    {i}. {q}")
    print("-" * 30)

# ✅ Write the full structure to disk as UTF-8 JSON (non-ASCII kept as-is)
with open('questions.json', 'w', encoding='utf-8') as f:
    json.dump(all_category_questions, f, indent=4, ensure_ascii=False)

print("\n✅ Structured questions saved to 'questions.json'")


🔎 Extracting questions for Depression from ./Depression & Anxiety...
📄 Processing ./Depression & Anxiety\Beck Anxiety Inventory (BAI) – Measures anxiety levels..pdf...
📄 Processing ./Depression & Anxiety\Beck-Depression-Inventory-BDI(self reported test for depression symptoms).pdf...
📄 Processing ./Depression & Anxiety\GAD-7_Anxiety-updated_0(Assesses anxiety levels.).pdf...
📄 Processing ./Depression & Anxiety\HAMILTON-DEPRESSION(Used for clinical depression assessment.).pdf...
📄 Processing ./Depression & Anxiety\PHQ9 id date 08.03(to measure severity of dpession).pdf...

🔎 Extracting questions for Personality & Behaviour from ./Personality & Behaviour Analysis...
📄 Processing ./Personality & Behaviour Analysis\MBTI-Personality-Type-Test.pdf...
📄 Processing ./Personality & Behaviour Analysis\MMPI-2-Test-Questions.pdf...

🔎 Extracting questions for Stress & Coping from ./Stress & Coping Mechanisms...
📄 Processing ./Stress & Coping Mechanisms\COPE Inventory – Assesses different coping s