In [None]:
mcqs = [
    {
        "question": "Match List-I with List-II:\n\nList-I\nA. Fleming\nB. Robert Brown\nC. George Palade\nD. Camillo Golgi\n\nList-II\nI. Disc shaped sacs\nII. Chromatin\nIII. Ribosomes\nIV. Nucleus\n\nChoose the correct answer:",
        "year": "NEET 2024",
        "options": [
            "A-II, B-IV, C-III, D-I",
            "A-II, B-III, C-I, D-IV",
            "A-I, B-II, C-III, D-IV",
            "A-IV, B-II, C-III, D-I"
        ],
        "answer": "A",
        "solution": "• Nucleus was first described by Robert Brown in 1831.",
        "chapter": "Cell - The Unit of Life",
        "images": ["./mcq_images/page12_img0.png"]
    }
]


In [6]:
import re
import json
import uuid

def extract_mcqs(document_text):
    mcqs = []
    current_chapter = ""
    current_question = ""
    current_options = []
    current_answer = ""
    current_year = ""
    in_question = False
    question_number = 0

    # Split document into lines
    lines = document_text.split('\n')

    for line in lines:
        line = line.strip()

        # Detect chapter headers
        chapter_match = re.match(r'^(Topic\s*:\s*|Biological classification NEET MCQs|Plant kingdom mcq for NEET|Animal kingdom NEET Questions|Morphology of flowering Plants NEET|Morphology of flowering plant MCQ for Class 11)\s*(.*)', line, re.IGNORECASE)
        if chapter_match:
            current_chapter = chapter_match.group(2) if chapter_match.group(2) else chapter_match.group(1).strip()
            continue

        # Detect year
        year_match = re.search(r'\(NEET (\d{4}(, Phase [I|II]*)?)\)', line)
        if year_match:
            current_year = year_match.group(1)
            continue

        # Detect question start
        question_start = re.match(r'^(\d+\.\s+|A\.\s+)(.*)', line)
        if question_start:
            if in_question and current_question and len(current_options) >= 4:
                # Save previous question
                mcqs.append({
                    "question": current_question.strip(),
                    "year": current_year if current_year else "Not specified",
                    "options": current_options,
                    "answer": current_answer,
                    "solution": "",  # No solutions provided in the document
                    "chapter": current_chapter,
                    "images": []
                })
                current_question = ""
                current_options = []
                current_answer = ""
            in_question = True
            question_number = question_start.group(1).strip()
            current_question = question_start.group(2).strip()
            continue

        # Detect options
        option_match = re.match(r'^\((a|b|c|d)\)\s*(.*)', line)
        if in_question and option_match:
            current_options.append(f"({option_match.group(1)}) {option_match.group(2).strip()}")
            continue

        # Detect answer
        answer_match = re.match(r'^Answer\s*[:|=]\s*([a-dA-D1-5]|\d)\s*$', line)
        if in_question and answer_match:
            current_answer = answer_match.group(1)
            continue

        # Continue building multi-line question
        if in_question and line and not option_match and not answer_match and not re.match(r'^\s*$', line):
            current_question += " " + line

    # Append the last question
    if in_question and current_question and len(current_options) >= 4:
        mcqs.append({
            "question": current_question.strip(),
            "year": current_year if current_year else "Not specified",
            "options": current_options,
            "answer": current_answer,
            "solution": "",
            "chapter": current_chapter,
            "images": []
        })

    return mcqs

# Document text (simplified for embedding; in practice, use the full document text)
document_text = """
Topic : The Living World

1. The defining characteristic of living beings is
(a) They can reproduce
(b) They can digest their food
(c) They can respond to external stimuli
(d) They can regenerate
Answer: C

2. Metabolic Processes takes place
(a) in vitro manner
(b) in Vivo manner
(c) both a and b
(d) none of the above
Answer: C

...

Biological classification NEET MCQs

1. Plant decomposers are
(a) Monera and Fungi
(b) Fungi and Plants
(c) Protista and Animalia
(d) Animalia and Monera
Answer: a

...

Plant kingdom mcq for NEET

1. Both chlorophyll a and b are present in
(a) rhodophyceae
(b) phaeophyceae
(c) chlorophyceae
(d) None of these
Answer: c

...

Animal kingdom NEET Questions:

1. The cervical vertebrae in humans is
(a) same as in whale
(b) more than that in rabbit
(c) double than that of horse
(d) less than that in giraffe
Answer: a

...

Morphology of flowering Plants NEET

1. Plants which produce characteristics Pneumatophores and show vivipary belong to
(a) Mesophytes
(b) Psammophytes
(c) Halophytes
(d) Hydrophytes
Answer: C

...

Morphology of flowering plant MCQ for Class 11 :

1. Single cotyledon of a monocot is
(a) Plumule
(b) Epicotyl
(c) Coleorrhiza
(d) Scutellum
Answer: D
"""

# Extract MCQs
mcqs = extract_mcqs(document_text)

# Output as JSON string
mcqs_json = json.dumps(mcqs, indent=4)

# Print the JSON (for demonstration; in artifact, this is the content)
print(mcqs_json)

[
    {
        "question": "The defining characteristic of living beings is",
        "year": "Not specified",
        "options": [
            "(a) They can reproduce",
            "(b) They can digest their food",
            "(c) They can respond to external stimuli",
            "(d) They can regenerate"
        ],
        "answer": "C",
        "solution": "",
        "chapter": "The Living World",
        "images": []
    },
    {
        "question": "Metabolic Processes takes place ...",
        "year": "Not specified",
        "options": [
            "(a) in vitro manner",
            "(b) in Vivo manner",
            "(c) both a and b",
            "(d) none of the above"
        ],
        "answer": "C",
        "solution": "",
        "chapter": "Biological classification NEET MCQs",
        "images": []
    },
    {
        "question": "Plant decomposers are ...",
        "year": "Not specified",
        "options": [
            "(a) Monera and Fungi",
            "(b) Fu

In [1]:
pip install pypdf2

Collecting pypdf2
  Using cached pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Using cached pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: pypdf2
Successfully installed pypdf2-3.0.1
Note: you may need to restart the kernel to use updated packages.


In [12]:
pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
Collecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Collecting cryptography>=36.0.0 (from pdfminer.six==20250327->pdfplumber)
  Downloading cryptography-45.0.3-cp311-abi3-win_amd64.whl.metadata (5.7 kB)
Downloading pdfplumber-0.11.6-py3-none-any.whl (60 kB)
Downloading pdfminer_six-20250327-py3-none-any.whl (5.6 MB)
   ---------------------------------------- 0.0/5.6 MB ? eta -:--:--
   ---------------- ----------------------- 2.4/5.6 MB 12.3 MB/s eta 0:00:01
   ------------------------------- -------- 4.5/5.6 MB 12.2 MB/s eta 0:00:01
   ---------------------------------------- 5.6/5.6 MB 11.4 MB/s eta 0:00:00
Downloading cryptography-45.0.3-cp311-abi3-win_amd64.whl (3.4 MB)
   ---------------------------------------- 0.0/3.4 MB ? eta -:--:--
   --------------------------- ------------ 2.4/3.4 MB 11.2 MB/s eta 0:00:01
   --------

In [13]:
import pdfplumber
import re
import json
import uuid

def extract_mcqs_from_pdf(pdf_path):
    mcqs = []
    current_chapter = ""
    current_question = ""
    current_options = []
    current_answer = ""
    current_year = ""
    in_question = False
    question_number = 0

    # Open the PDF file
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract text from the current page
            text = page.extract_text()
            if not text:
                continue
            lines = text.split('\n')

            for line in lines:
                line = line.strip()

                # Detect chapter headers
                chapter_match = re.match(r'^(Topic\s*:\s*|Biological classification NEET MCQs|Plant kingdom mcq for NEET|Animal kingdom NEET Questions|Morphology of flowering Plants NEET|Morphology of flowering plant MCQ for Class 11)\s*(.*)', line, re.IGNORECASE)
                if chapter_match:
                    current_chapter = chapter_match.group(2) if chapter_match.group(2) else chapter_match.group(1).strip()
                    continue

                # Detect year
                year_match = re.search(r'\(NEET (\d{4}(, Phase [I|II]*)?)\)', line)
                if year_match:
                    current_year = year_match.group(1)
                    continue

                # Detect question start
                question_start = re.match(r'^(\d+\.\s+|A\.\s+)(.*)', line)
                if question_start:
                    if in_question and current_question and len(current_options) >= 4:
                        # Save previous question
                        mcqs.append({
                            "question": current_question.strip(),
                            "year": current_year if current_year else "Not specified",
                            "options": current_options,
                            "answer": current_answer,
                            "solution": "",  # No solutions provided in the document
                            "chapter": current_chapter,
                            "images": []
                        })
                        current_question = ""
                        current_options = []
                        current_answer = ""
                    in_question = True
                    question_number = question_start.group(1).strip()
                    current_question = question_start.group(2).strip()
                    continue

                # Detect options
                option_match = re.match(r'^\((a|b|c|d)\)\s*(.*)', line)
                if in_question and option_match:
                    current_options.append(f"({option_match.group(1)}) {option_match.group(2).strip()}")
                    continue

                # Detect answer
                answer_match = re.match(r'^Answer\s*[:|=]\s*([a-dA-D1-5]|\d)\s*$', line)
                if in_question and answer_match:
                    current_answer = answer_match.group(1)
                    continue

                # Continue building multi-line question
                if in_question and line and not option_match and not answer_match and not re.match(r'^\s*$', line):
                    current_question += " " + line

            # Append the last question on the page if complete
            if in_question and current_question and len(current_options) >= 4:
                mcqs.append({
                    "question": current_question.strip(),
                    "year": current_year if current_year else "Not specified",
                    "options": current_options,
                    "answer": current_answer,
                    "solution": "",
                    "chapter": current_chapter,
                    "images": []
                })
                current_question = ""
                current_options = []
                current_answer = ""
                in_question = False

    # Append the last question if it wasn't added
    if in_question and current_question and len(current_options) >= 4:
        mcqs.append({
            "question": current_question.strip(),
            "year": current_year if current_year else "Not specified",
            "options": current_options,
            "answer": current_answer,
            "solution": "",
            "chapter": current_chapter,
            "images": []
        })

    return mcqs

# Path to the PDF file
pdf_path = "pdfs/mcqneet.pdf"

# Extract MCQs
mcqs = extract_mcqs_from_pdf(pdf_path)

# Output as JSON string
mcqs_json = json.dumps(mcqs, indent=4)

# Print the JSON (for demonstration; in artifact, this is the content)
print(mcqs_json)

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

[
    {
        "question": "The defining characteristic of living beings is_________",
        "year": "Not specified",
        "options": [
            "(a) They can reproduce",
            "(b) They can digest their food",
            "(c) They can respond to external stimuli",
            "(d) They can regenerate"
        ],
        "answer": "C",
        "solution": "",
        "chapter": "The Living World",
        "images": []
    },
    {
        "question": "Metabolic Processes takes place_______",
        "year": "Not specified",
        "options": [
            "(a) in vitro manner",
            "(b) in Vivo manner",
            "(c) both a and b",
            "(d) none of the above"
        ],
        "answer": "C",
        "solution": "",
        "chapter": "The Living World",
        "images": []
    },
    {
        "question": "What are the twin characteristics of growth?",
        "year": "Not specified",
        "options": [
            "(a) increase in mass",
       

In [5]:
import pdfplumber
import re
import json
import uuid

def extract_mcqs_from_pdf(pdf_path):
    mcqs = []
    current_chapter = ""
    current_question = ""
    current_options = []
    current_answer = ""
    current_year = ""
    in_question = False
    question_number = 0

    # Mapping for converting numbered options to a, b, c, d
    number_to_letter = {'1': 'a', '2': 'b', '3': 'c', '4': 'd'}

    # Open the PDF file
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract text from the current page
            text = page.extract_text()
            if not text:
                continue
            lines = text.split('\n')

            for line in lines:
                line = line.strip()

                # Detect chapter headers (only "Topic :" followed by any text)
                chapter_match = re.match(r'^Topic\s*:\s*(.*)', line, re.IGNORECASE)
                if chapter_match:
                    current_chapter = chapter_match.group(1).strip() if chapter_match.group(1).strip() else "Unknown"
                    continue

                # Detect year
                year_match = re.search(r'\(NEET (\d{4}(, Phase [I|II]*)?)\)', line)
                if year_match:
                    current_year = year_match.group(1)
                    continue

                # Detect question start
                question_start = re.match(r'^(\d+\.\s+|[A-Z]\.\s+)(.*)', line)
                if question_start:
                    if in_question and current_question and len(current_options) >= 4:
                        # Normalize answer to lowercase a, b, c, d
                        if current_answer in number_to_letter:
                            current_answer = number_to_letter[current_answer]
                        elif current_answer.lower() in ['a', 'b', 'c', 'd']:
                            current_answer = current_answer.lower()
                        # Save previous question
                        mcqs.append({
                            "question": current_question.strip(),
                            "year": current_year if current_year else "Not specified",
                            "options": current_options,
                            "answer": current_answer,
                            "solution": "",  # No solutions provided in the document
                            "chapter": current_chapter,
                            "images": []
                        })
                        current_question = ""
                        current_options = []
                        current_answer = ""
                    in_question = True
                    question_number = question_start.group(1).strip()
                    current_question = question_start.group(2).strip()
                    continue

                # Detect options (support (a), A., and 1. formats)
                option_match = re.match(r'^\(([a-dA-D])\)\s*(.*)|([a-dA-D])\.\s*(.*)|([1-4])\.\s*(.*)', line)
                if in_question and option_match:
                    if option_match.group(1):  # (a) format
                        option_label = f"({option_match.group(1).lower()})"
                        option_text = option_match.group(2).strip()
                    elif option_match.group(3):  # A. format
                        option_label = f"({option_match.group(3).lower()})"
                        option_text = option_match.group(4).strip()
                    else:  # 1. format
                        option_label = f"({number_to_letter[option_match.group(5)]})"
                        option_text = option_match.group(6).strip()
                    current_options.append(f"{option_label} {option_text}")
                    continue

                # Detect answer
                answer_match = re.match(r'^Answer\s*[:|=]\s*([a-dA-D1-5]|\d)\s*$', line, re.IGNORECASE)
                if in_question and answer_match:
                    current_answer = answer_match.group(1)
                    continue

                # Continue building multi-line question
                if in_question and line and not option_match and not answer_match and not re.match(r'^\s*$', line):
                    current_question += " " + line

            # Append the last question on the page if complete
            if in_question and current_question and len(current_options) >= 4:
                # Normalize answer to lowercase a, b, c, d
                if current_answer in number_to_letter:
                    current_answer = number_to_letter[current_answer]
                elif current_answer.lower() in ['a', 'b', 'c', 'd']:
                    current_answer = current_answer.lower()
                mcqs.append({
                    "question": current_question.strip(),
                    "year": current_year if current_year else "Not specified",
                    "options": current_options,
                    "answer": current_answer,
                    "solution": "",
                    "chapter": current_chapter,
                    "images": []
                })
                current_question = ""
                current_options = []
                current_answer = ""
                in_question = False

    # Append the last question if it wasn't added
    if in_question and current_question and len(current_options) >= 4:
        # Normalize answer to lowercase a, b, c, d
        if current_answer in number_to_letter:
            current_answer = number_to_letter[current_answer]
        elif current_answer.lower() in ['a', 'b', 'c', 'd']:
            current_answer = current_answer.lower()
        mcqs.append({
            "question": current_question.strip(),
            "year": current_year if current_year else "Not specified",
            "options": current_options,
            "answer": current_answer,
            "solution": "",
            "chapter": current_chapter,
            "images": []
        })

    return mcqs

# Path to the PDF file
pdf_path = "pdfs/MCQNEET.pdf"

# Extract MCQs
mcqs = extract_mcqs_from_pdf(pdf_path)

# Output as JSON string
mcqs_json = json.dumps(mcqs, indent=4)

# Print the JSON (for demonstration; in artifact, this is the content)
print(mcqs_json)

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

[
    {
        "question": "The defining characteristic of living beings is_________",
        "year": "Not specified",
        "options": [
            "(a) They can reproduce",
            "(b) They can digest their food",
            "(c) They can respond to external stimuli",
            "(d) They can regenerate"
        ],
        "answer": "c",
        "solution": "",
        "chapter": "The Living World",
        "images": []
    },
    {
        "question": "Metabolic Processes takes place_______",
        "year": "Not specified",
        "options": [
            "(a) in vitro manner",
            "(b) in Vivo manner",
            "(c) both a and b",
            "(d) none of the above"
        ],
        "answer": "c",
        "solution": "",
        "chapter": "The Living World",
        "images": []
    },
    {
        "question": "What are the twin characteristics of growth?",
        "year": "Not specified",
        "options": [
            "(a) increase in mass",
       

In [6]:
mcqs_json

'[\n    {\n        "question": "The defining characteristic of living beings is_________",\n        "year": "Not specified",\n        "options": [\n            "(a) They can reproduce",\n            "(b) They can digest their food",\n            "(c) They can respond to external stimuli",\n            "(d) They can regenerate"\n        ],\n        "answer": "c",\n        "solution": "",\n        "chapter": "The Living World",\n        "images": []\n    },\n    {\n        "question": "Metabolic Processes takes place_______",\n        "year": "Not specified",\n        "options": [\n            "(a) in vitro manner",\n            "(b) in Vivo manner",\n            "(c) both a and b",\n            "(d) none of the above"\n        ],\n        "answer": "c",\n        "solution": "",\n        "chapter": "The Living World",\n        "images": []\n    },\n    {\n        "question": "What are the twin characteristics of growth?",\n        "year": "Not specified",\n        "options": [\n         

In [30]:
mcqs

[{'question': 'The defining characteristic of living beings is_________',
  'year': 'Not specified',
  'options': ['(a) They can reproduce',
   '(b) They can digest their food',
   '(c) They can respond to external stimuli',
   '(d) They can regenerate'],
  'answer': 'c',
  'solution': '',
  'chapter': 'The Living World',
  'images': []},
 {'question': 'Metabolic Processes takes place_______',
  'year': 'Not specified',
  'options': ['(a) in vitro manner',
   '(b) in Vivo manner',
   '(c) both a and b',
   '(d) none of the above'],
  'answer': 'c',
  'solution': '',
  'chapter': 'The Living World',
  'images': []},
 {'question': 'What are the twin characteristics of growth?',
  'year': 'Not specified',
  'options': ['(a) increase in mass',
   '(b) increase in number',
   '(c) both a and b',
   '(d) none of the above'],
  'answer': 'c',
  'solution': '',
  'chapter': 'The Living World',
  'images': []},
 {'question': 'Growth in living organisms is from_______',
  'year': 'Not specified'

In [1]:
from pinecone import Pinecone, ServerlessSpec
# from sentence_transformers import SentenceTransformer
from datetime import datetime

In [2]:
import os 
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
YOUR_PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

In [4]:
pc = Pinecone(api_key=YOUR_PINECONE_API_KEY)  # Replace with your Pinecone API key
index_name = "neet-mcq-index"

# Check if index exists, create if it doesn't
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,  # Dimension of all-MiniLM-L6-v2 embeddings
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

# Connect to the index
index = pc.Index(index_name)

# Initialize sentence transformer model

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
pip install langchain-google-genai

Collecting langchain-google-genai
  Downloading langchain_google_genai-2.1.5-py3-none-any.whl.metadata (5.2 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain-google-genai)
  Using cached filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting google-ai-generativelanguage<0.7.0,>=0.6.18 (from langchain-google-genai)
  Using cached google_ai_generativelanguage-0.6.18-py3-none-any.whl.metadata (9.8 kB)
Collecting langchain-core<0.4.0,>=0.3.62 (from langchain-google-genai)
  Downloading langchain_core-0.3.63-py3-none-any.whl.metadata (5.8 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0,>=1.34.1 (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0,>=1.34.1->google-ai-generativelanguage<0.7.0,>=0.6.18->langchain-google-genai)
  Downloading google_api_core-2.25.0-py3-none-any.whl.metadata (3.0 kB)
Collecting google-auth!=2.24.0,!=2.25

In [7]:
GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

In [5]:
from langchain_google_genai import (ChatGoogleGenerativeAI,GoogleGenerativeAI,
                                    GoogleGenerativeAIEmbeddings)

In [None]:
# model = SentenceTransformer('all-MiniLM-L6-v2')

In [8]:
embedding = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [26]:
for i, mcq in enumerate(mcqs):
    print(mcq['question'])
    question_embedding = embedding.embed_query(mcq['question'])
    break

The defining characteristic of living beings is_________


In [27]:
question_embedding

[0.0038249758072197437,
 -0.04345383867621422,
 0.0053563546389341354,
 0.030108550563454628,
 0.01671411655843258,
 0.048103321343660355,
 0.04282274469733238,
 0.013452754355967045,
 -0.020856749266386032,
 0.02638455294072628,
 0.028406722471117973,
 0.032862305641174316,
 -0.0031781503930687904,
 0.01248121727257967,
 0.0014180350117385387,
 0.019771577790379524,
 0.04241036996245384,
 -0.0018285228870809078,
 0.0033135120756924152,
 -0.06168758496642113,
 -0.012956629507243633,
 -0.03615153208374977,
 0.016235440969467163,
 -0.054093968123197556,
 0.02584381215274334,
 0.0044069597497582436,
 0.013460288755595684,
 -0.07428047806024551,
 -0.028150683268904686,
 0.05892173573374748,
 -0.06670114398002625,
 0.0342709980905056,
 -0.06402906775474548,
 -0.01134917140007019,
 -0.005995880346745253,
 -0.030499164015054703,
 0.06064718961715698,
 -0.007357964292168617,
 0.017825046554207802,
 0.050923410803079605,
 -0.011602132581174374,
 -0.0022172636818140745,
 -0.06118451803922653,
 -

In [25]:
embedding.embed_query('The defining characteristic of living beings is_________')

[0.0038249758072197437,
 -0.04345383867621422,
 0.0053563546389341354,
 0.030108550563454628,
 0.01671411655843258,
 0.048103321343660355,
 0.04282274469733238,
 0.013452754355967045,
 -0.020856749266386032,
 0.02638455294072628,
 0.028406722471117973,
 0.032862305641174316,
 -0.0031781503930687904,
 0.01248121727257967,
 0.0014180350117385387,
 0.019771577790379524,
 0.04241036996245384,
 -0.0018285228870809078,
 0.0033135120756924152,
 -0.06168758496642113,
 -0.012956629507243633,
 -0.03615153208374977,
 0.016235440969467163,
 -0.054093968123197556,
 0.02584381215274334,
 0.0044069597497582436,
 0.013460288755595684,
 -0.07428047806024551,
 -0.028150683268904686,
 0.05892173573374748,
 -0.06670114398002625,
 0.0342709980905056,
 -0.06402906775474548,
 -0.01134917140007019,
 -0.005995880346745253,
 -0.030499164015054703,
 0.06064718961715698,
 -0.007357964292168617,
 0.017825046554207802,
 0.050923410803079605,
 -0.011602132581174374,
 -0.0022172636818140745,
 -0.06118451803922653,
 -

In [None]:
def store_mcqs_in_pinecone(mcqs):
    # Generate embeddings for questions
    vectors = []

    for i, mcq in enumerate(mcqs):
        # Create embedding for the question
        question_embedding = embedding.embed_query(mcq['question']) # .tolist()\
        print("embedding done for mcq", i)

        # Create a unique ID for the vector
        vector_id = str(uuid.uuid4())
        # Prepare metadata
        metadata = {
            "subject": "Biology",
            "question": mcq['question'],
            "year": mcq['year'],
            "chapter": mcq['chapter'],
            "options": mcq['options'],
            "answer": mcq['answer'],
            "solution": mcq['solution'],
            "images": mcq['images']

        }
        vectors.append((vector_id, question_embedding, metadata))
    
    # Upsert vectors to Pinecone
    index.upsert(vectors=vectors)
    print(f"Stored {len(vectors)} MCQs in Pinecone index '{index_name}'.")

In [33]:
store_mcqs_in_pinecone(mcqs)


embedding done for mcq 0
embedding done for mcq 1
embedding done for mcq 2
embedding done for mcq 3
embedding done for mcq 4
embedding done for mcq 5
embedding done for mcq 6
embedding done for mcq 7
embedding done for mcq 8
embedding done for mcq 9
embedding done for mcq 10
embedding done for mcq 11
embedding done for mcq 12
embedding done for mcq 13
embedding done for mcq 14
embedding done for mcq 15
embedding done for mcq 16
embedding done for mcq 17
embedding done for mcq 18
embedding done for mcq 19
embedding done for mcq 20
embedding done for mcq 21
embedding done for mcq 22
embedding done for mcq 23
embedding done for mcq 24
embedding done for mcq 25
embedding done for mcq 26
embedding done for mcq 27
embedding done for mcq 28
embedding done for mcq 29
embedding done for mcq 30
embedding done for mcq 31
embedding done for mcq 32
embedding done for mcq 33
embedding done for mcq 34
embedding done for mcq 35
embedding done for mcq 36
embedding done for mcq 37
embedding done for mcq

In [None]:


def retrieve_mcqs(query, year=None, chapter=None, top_k=5):
    # Generate embedding for the query
    query_embedding = embedding.embed_query(query)
    
    # Prepare filter
    filter_conditions = {}
    if year:
        filter_conditions["year"] = year
    if chapter:
        filter_conditions["chapter"] = chapter
    
    # Query Pinecone
    results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True,
        filter=filter_conditions if filter_conditions else None
    )
    
    # Format results
    retrieved_mcqs = []
    for match in results['matches']:
        metadata = match['metadata']
        retrieved_mcqs.append({
            "question": metadata['question'],
            "year": metadata['year'],
            "options": metadata['options'],
            "answer": metadata['answer'],
            "solution": metadata['solution'],
            "chapter": metadata['chapter'],
            "images": metadata['images'],
            "score": match['score']
        })
    
    return retrieved_mcqs


In [None]:
retrieve_mcqs("What is the defining characteristic of living beings?", year="2024", chapter="The Living World", top_k=3)

In [9]:
def retrieve_mcqs(chapter, page=1):
    """
    Retrieve 10 MCQs from the specified chapter, paginated by page number.
    
    Args:
        chapter (str): The chapter to filter by (e.g., "The Living World").
        page (int): The page number (1-based) to retrieve (default: 1).
    
    Returns:
        List of up to 10 MCQs with metadata.
    """
    # Validate inputs
    if not chapter:
        raise ValueError("Chapter must be specified.")
    if page < 1:
        raise ValueError("Page number must be at least 1.")
    
    # Generate a dummy query embedding (since we're filtering by chapter only)
    # Use a neutral query to fetch all questions in the chapter
    query = ""  # Neutral query to avoid bias
    query_embedding = embedding.embed_query(query)
    
    # Prepare filter
    filter_conditions = {"chapter": chapter}
    
    # Calculate top_k and offset
    questions_per_page = 10
    top_k = questions_per_page
    offset = (page - 1) * questions_per_page
    
    # Query Pinecone
    results = index.query(
        vector=query_embedding,
        top_k=top_k + offset,  # Fetch enough to skip previous pages
        include_metadata=True,
        filter=filter_conditions
    )
    
    # Extract the relevant page of results
    retrieved_mcqs = []
    matches = results['matches'][offset:offset + questions_per_page]
    for match in matches:
        metadata = match['metadata']
        retrieved_mcqs.append({
            "question": metadata['question'],
            "year": metadata['year'],
            "options": metadata['options'],
            "answer": metadata['answer'],
            "solution": metadata['solution'],
            "chapter": metadata['chapter'],
            "images": metadata['images'],
            "score": match['score']
        })
    
    return retrieved_mcqs

In [16]:
def retrieve_mcqs(query=None, year=None, chapter=None, page=1):
    """
    Retrieve 10 MCQs based on query, year, and/or chapter, paginated by page number.
    
    Args:
        query (str, optional): Question text to search for (partial or full).
        year (str, optional): Year to filter by (e.g., "NEET 2022").
        chapter (str, optional): Chapter to filter by (e.g., "The Living World").
        page (int): The page number (1-based) to retrieve (default: 1).
    
    Returns:
        List of up to 10 MCQs with metadata.
    """
    # Validate inputs
    if page < 1:
        raise ValueError("Page number must be at least 1.")
    
    # Use a neutral query if none provided
    query_text = query if query else ""
    query_embedding = embedding.embed_query(query_text)
    
    # Prepare filter
    filter_conditions = {}
    if year:
        filter_conditions["year"] = year
    if chapter:
        filter_conditions["chapter"] = chapter
    
    # Calculate top_k and offset
    questions_per_page = 10
    top_k = questions_per_page
    offset = (page - 1) * questions_per_page
    
    # Query Pinecone
    results = index.query(
        vector=query_embedding,
        top_k=top_k + offset,  # Fetch enough to skip previous pages
        include_metadata=True,
        filter=filter_conditions if filter_conditions else None
    )
    
    # Extract the relevant page of results
    retrieved_mcqs = []
    matches = results['matches'][offset:offset + questions_per_page]
    for match in matches:
        metadata = match['metadata']
        retrieved_mcqs.append({
            "question": metadata['question'],
            "year": metadata['year'],
            "options": metadata['options'],
            "answer": metadata['answer'],
            "solution": metadata['solution'],
            "chapter": metadata['chapter'],
            "images": metadata['images'],
            "score": match['score']
        })
    
    return retrieved_mcqs


In [17]:
chapter = "The Living World"
page = 1
query = None
year = None
retrieved_mcqslist3 = retrieve_mcqs(query=query,year=year,chapter=chapter, page=page)

In [18]:
retrieved_mcqslist3

[{'question': 'Identify the incorrect match.',
  'year': 'Not specified',
  'options': ['(a) Physiology – Study of functions and processes of life',
   '(b) Pedology – Soil science',
   '(c) Limnology – Study of freshwater',
   '(d) Kinesiology – Fossil study'],
  'answer': 'd',
  'solution': '',
  'chapter': 'The Living World',
  'images': [],
  'score': 0.733659},
 {'question': 'ICBN stands for___________',
  'year': 'Not specified',
  'options': ['(a) International Code of Botanical Nomenclature',
   '(b) International Congress of Biological Names',
   '(c) Indian Code of Botanical Nomenclature',
   '(d) Indian Congress of Biological Names'],
  'answer': 'a',
  'solution': '',
  'chapter': 'The Living World',
  'images': [],
  'score': 0.671307445},
 {'question': 'Which of the following is self-conscious?',
  'year': 'Not specified',
  'options': ['(a) human being',
   '(b) salamander',
   '(c) Earthworm',
   '(d) None of these'],
  'answer': 'a',
  'solution': '',
  'chapter': 'The

In [13]:
retrieved_mcqslist

[{'question': 'Identify the incorrect match.',
  'year': 'Not specified',
  'options': ['(a) Physiology – Study of functions and processes of life',
   '(b) Pedology – Soil science',
   '(c) Limnology – Study of freshwater',
   '(d) Kinesiology – Fossil study'],
  'answer': 'd',
  'solution': '',
  'chapter': 'The Living World',
  'images': [],
  'score': 0.733659},
 {'question': 'ICBN stands for___________',
  'year': 'Not specified',
  'options': ['(a) International Code of Botanical Nomenclature',
   '(b) International Congress of Biological Names',
   '(c) Indian Code of Botanical Nomenclature',
   '(d) Indian Congress of Biological Names'],
  'answer': 'a',
  'solution': '',
  'chapter': 'The Living World',
  'images': [],
  'score': 0.671307445},
 {'question': 'Which of the following is self-conscious?',
  'year': 'Not specified',
  'options': ['(a) human being',
   '(b) salamander',
   '(c) Earthworm',
   '(d) None of these'],
  'answer': 'a',
  'solution': '',
  'chapter': 'The

In [14]:
chapter = "The Living World"
page = 2
retrieved_mcqslist2 = retrieve_mcqs(chapter=chapter, page=page)

In [15]:
retrieved_mcqslist2

[{'question': 'What are the twin characteristics of growth?',
  'year': 'Not specified',
  'options': ['(a) increase in mass',
   '(b) increase in number',
   '(c) both a and b',
   '(d) none of the above'],
  'answer': 'c',
  'solution': '',
  'chapter': 'The Living World',
  'images': [],
  'score': 0.635274649},
 {'question': 'Musca domestica is common name of______________',
  'year': 'Not specified',
  'options': ['(a) Housefly', '(b) Mosquito', '(c) Snail', '(d) Ant'],
  'answer': 'a',
  'solution': '',
  'chapter': 'The Living World',
  'images': [],
  'score': 0.634492397},
 {'question': 'The total of all the chemical reactions occurring in the body is known as______________',
  'year': 'Not specified',
  'options': ['(a) metabolism',
   '(b) catabolism',
   '(c) anabolism',
   '(d) None of these'],
  'answer': 'a',
  'solution': '',
  'chapter': 'The Living World',
  'images': [],
  'score': 0.631413579},
 {'question': 'Growth in living organisms is from_______',
  'year': 'No

In [None]:
# pip install pinecone

Collecting pinecone
  Downloading pinecone-7.0.2-py3-none-any.whl.metadata (9.5 kB)
Collecting pinecone-plugin-assistant<2.0.0,>=1.6.0 (from pinecone)
  Downloading pinecone_plugin_assistant-1.6.1-py3-none-any.whl.metadata (27 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Using cached pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Collecting packaging<25.0,>=24.2 (from pinecone-plugin-assistant<2.0.0,>=1.6.0->pinecone)
  Using cached packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Downloading pinecone-7.0.2-py3-none-any.whl (516 kB)
Downloading pinecone_plugin_assistant-1.6.1-py3-none-any.whl (239 kB)
Using cached packaging-24.2-py3-none-any.whl (65 kB)
Using cached pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, packaging, pinecone-plugin-assistant, pinecone

   ---------------------------------------- 0/4 [pinecone-plugin-interface]
  Attempting uninstall: packaging
  