In [2]:
import re
import json

def extract_question_data(raw_text, subject="Biology", topic="Biomolecules"):
    # Split the input into blocks (for multiple questions if needed)
    blocks = re.split(r'\n(?=Question\d+)', raw_text.strip())
    
    questions = []

    for block in blocks:
        # Extract exam name and year from [NEET 2024 Re]
        exam_meta = re.search(r'\[(.*?)\]', block)
        exam_name = []
        exam_year = []

        if exam_meta:
            parts = exam_meta.group(1).split()
            for part in parts:
                if part.isdigit():
                    exam_year.append(int(part))
                else:
                    exam_name.append(part.upper())

        # Extract question text
        question_match = re.search(r'Question\d+\n(.*?)\n?\[.*?\]', block, re.DOTALL)
        question = question_match.group(1).strip() if question_match else ""

        # Extract options
        options = re.findall(r'[A-D]\.\s*(.*)', block)

        # Extract answer
        answer_match = re.search(r'Answer:\s*([A-D])', block)
        answer = answer_match.group(1) if answer_match else ""

        # Extract solution
        solution_match = re.search(r'Solution:\s*(.*)', block, re.DOTALL)
        solution = solution_match.group(1).strip() if solution_match else ""

        # Append structured question
        questions.append({
            "question": question,
            "options": options,
            "subject": subject,
            "topic": topic,
            "exam name": exam_name,
            "exam year": exam_year,
            "answer": answer,
            "solution": solution
        })

    return questions

# Example usage
raw_input = """
Question1
Which of the following is a nucleotide?
[NEET 2024 Re]
Options:

A. Uridine
B. Adenylic acid
C. Guanine
D. Guanosine

Answer: B

Solution:
Uridine is a nucleoside. Thus, option (1) is incorrect.
Adenylic acid is a nucleotide. Thus, option (2) is correct.
Guanine is a nitrogenous base. Thus, option (3) is incorrect.
Guanosine is a nucleoside. Thus, option (4) is incorrect.
"""

# Run the extractor
data = extract_question_data(raw_input)

# Convert to JSON and print
print(json.dumps(data, indent=2))


[
  {
    "question": "Which of the following is a nucleotide?",
    "options": [
      "Uridine",
      "Adenylic acid",
      "Guanine",
      "Guanosine"
    ],
    "subject": "Biology",
    "topic": "Biomolecules",
    "exam name": [
      "NEET",
      "RE"
    ],
    "exam year": [
      2024
    ],
    "answer": "B",
    "solution": "Uridine is a nucleoside. Thus, option (1) is incorrect.\nAdenylic acid is a nucleotide. Thus, option (2) is correct.\nGuanine is a nitrogenous base. Thus, option (3) is incorrect.\nGuanosine is a nucleoside. Thus, option (4) is incorrect."
  }
]


In [3]:
with open("biomolecules.txt", "r", encoding="utf-8") as f:
    raw_input = f.read()


In [4]:
data_biomolecules = extract_question_data(raw_input)

In [5]:
data_biomolecules

[{'question': 'Which of the following is a nucleotide?',
  'options': ['Uridine', 'Adenylic acid', 'Guanine', 'Guanosine'],
  'subject': 'Biology',
  'topic': 'Biomolecules',
  'exam name': ['NEET', 'RE'],
  'exam year': [2024],
  'answer': 'B',
  'solution': 'Uridine is a nucleoside. Thus, option (1) is incorrect.\nAdenylic acid is a nucleotide. Thus, option (2) is correct.\nGuanine is a nitrogenous base. Thus, option (3) is incorrect.\nGuanosine is a nucleoside. Thus, option (4) is incorrect.\n\n-------------------------------------------------------------------------------------------------'},
 {'question': 'Ligases is a class of enzymes responsible for catalysing the linking\ntogether of two compounds.\nWhich of the following bonds is not catalysed by it?',
  'options': ['C−C', 'P−O', 'C−O', 'C−N'],
  'subject': 'Biology',
  'topic': 'Biomolecules',
  'exam name': ['NEET', 'RE'],
  'exam year': [2024],
  'answer': 'A',
  'solution': 'Option (1) is the correct answer because, ligase